diff --git a/.gitattributes b/.gitattributes index 6b281f33f737db9ae661e80cfa66c5c778b11cb6..aced01d485c181e30ac1c813e54111005e2ad143 100644 --- a/.gitattributes +++ b/.gitattributes @@ -1,3 +1,10 @@ +# Checkout as native, commit as LF except in specific circumstances +* text=auto +*.bat text eol=crlf +*.rc text eol=crlf +*.sln text eol=crlf +*.natvis text eol=crlf + libcxx/src/**/*.cpp merge=libcxx-reformat libcxx/include/**/*.h merge=libcxx-reformat diff --git a/bolt/include/bolt/Core/DIEBuilder.h b/bolt/include/bolt/Core/DIEBuilder.h index e5b057ea1e42b7c3b0e26daf90cc00fb3fe439d4..d1acba0f26c78f226bf65b502d2e00afec5b110e 100644 --- a/bolt/include/bolt/Core/DIEBuilder.h +++ b/bolt/include/bolt/Core/DIEBuilder.h @@ -314,7 +314,7 @@ public: BC.errs() << "BOLT-ERROR: unable to find TypeUnit for Type Unit at offset 0x" - << DU.getOffset() << "\n"; + << Twine::utohexstr(DU.getOffset()) << "\n"; return nullptr; } diff --git a/bolt/lib/Core/BinaryContext.cpp b/bolt/lib/Core/BinaryContext.cpp index 1347047e1b70604f35d7d806745abfff87144fb3..f246750209d6c4e6867aadb302975a785a12e263 100644 --- a/bolt/lib/Core/BinaryContext.cpp +++ b/bolt/lib/Core/BinaryContext.cpp @@ -1294,8 +1294,8 @@ bool BinaryContext::handleAArch64Veneer(uint64_t Address, bool MatchOnly) { Veneer->getOrCreateLocalLabel(Address); Veneer->setMaxSize(TotalSize); Veneer->updateState(BinaryFunction::State::Disassembled); - LLVM_DEBUG(dbgs() << "BOLT-DEBUG: handling veneer function at 0x" << Address - << "\n"); + LLVM_DEBUG(dbgs() << "BOLT-DEBUG: handling veneer function at 0x" + << Twine::utohexstr(Address) << "\n"); return true; }; diff --git a/bolt/lib/Passes/LongJmp.cpp b/bolt/lib/Passes/LongJmp.cpp index c483f70a836ee12f4e80eb228e6457087c442c8d..0b2d00300f46b9962f7a892cdd18cfaf8604ec6d 100644 --- a/bolt/lib/Passes/LongJmp.cpp +++ b/bolt/lib/Passes/LongJmp.cpp @@ -324,9 +324,8 @@ uint64_t LongJmpPass::tentativeLayoutRelocColdPart( uint64_t LongJmpPass::tentativeLayoutRelocMode( const BinaryContext &BC, std::vector<BinaryFunction *> &SortedFunctions, uint64_t DotAddress) { - // Compute hot cold frontier - uint32_t LastHotIndex = -1u; + int64_t LastHotIndex = -1u; uint32_t CurrentIndex = 0; if (opts::HotFunctionsAtEnd) { for (BinaryFunction *BF : SortedFunctions) { @@ -351,19 +350,20 @@ uint64_t LongJmpPass::tentativeLayoutRelocMode( // Hot CurrentIndex = 0; bool ColdLayoutDone = false; + auto runColdLayout = [&]() { + DotAddress = tentativeLayoutRelocColdPart(BC, SortedFunctions, DotAddress); + ColdLayoutDone = true; + if (opts::HotFunctionsAtEnd) + DotAddress = alignTo(DotAddress, opts::AlignText); + }; for (BinaryFunction *Func : SortedFunctions) { if (!BC.shouldEmit(*Func)) { HotAddresses[Func] = Func->getAddress(); continue; } - if (!ColdLayoutDone && CurrentIndex >= LastHotIndex) { - DotAddress = - tentativeLayoutRelocColdPart(BC, SortedFunctions, DotAddress); - ColdLayoutDone = true; - if (opts::HotFunctionsAtEnd) - DotAddress = alignTo(DotAddress, opts::AlignText); - } + if (!ColdLayoutDone && CurrentIndex >= LastHotIndex) + runColdLayout(); DotAddress = alignTo(DotAddress, Func->getMinAlignment()); uint64_t Pad = @@ -382,6 +382,11 @@ uint64_t LongJmpPass::tentativeLayoutRelocMode( DotAddress += Func->estimateConstantIslandSize(); ++CurrentIndex; } + + // Ensure that tentative code layout always runs for cold blocks.
+ if (!ColdLayoutDone) + runColdLayout(); + // BBs for (BinaryFunction *Func : SortedFunctions) tentativeBBLayout(*Func); diff --git a/bolt/lib/Rewrite/DWARFRewriter.cpp b/bolt/lib/Rewrite/DWARFRewriter.cpp index f9cb1b3895e79bbdbaeb79716f19ed643c17c746..1b5ba8b49d36377ac1420f7fa870ffe50179850f 100644 --- a/bolt/lib/Rewrite/DWARFRewriter.cpp +++ b/bolt/lib/Rewrite/DWARFRewriter.cpp @@ -1362,7 +1362,7 @@ void DWARFRewriter::updateDWARFObjectAddressRanges( Die.getTag() == dwarf::DW_TAG_compile_unit)) { if (opts::Verbosity >= 1) errs() << "BOLT-WARNING: cannot update ranges for DIE in Unit offset 0x" - << Unit.getOffset() << '\n'; + << Twine::utohexstr(Unit.getOffset()) << '\n'; } } diff --git a/bolt/test/AArch64/split-funcs-lite.s b/bolt/test/AArch64/split-funcs-lite.s new file mode 100644 index 0000000000000000000000000000000000000000..5f95eea17ae75fdfea20f707824a22bfc819e7c0 --- /dev/null +++ b/bolt/test/AArch64/split-funcs-lite.s @@ -0,0 +1,27 @@ +# This test checks that tentative code layout for cold blocks always runs. +# This situation commonly happens when using lite mode with split functions. + +# REQUIRES: system-linux, asserts + +# RUN: %clang %cflags -o %t %s +# RUN: %clang %s %cflags -Wl,-q -o %t +# RUN: link_fdata --no-lbr %s %t %t.fdata +# RUN: llvm-bolt %t -o %t.bolt --data %t.fdata -split-functions \ +# RUN: -debug 2>&1 | FileCheck %s + + .text + .globl foo + .type foo, %function +foo: +.entry_bb: +# FDATA: 1 foo #.entry_bb# 10 + cmp x0, #0 + b.eq .Lcold_bb1 + ret +.Lcold_bb1: + ret + +## Force relocation mode. +.reloc 0, R_AARCH64_NONE + +# CHECK: foo{{.*}} cold tentative: {{.*}} diff --git a/clang-tools-extra/clang-tidy/ClangTidy.cpp b/clang-tools-extra/clang-tidy/ClangTidy.cpp index 62f9d19b2a362fbfee4f507d32c8b0d09f161a63..c4cac7d27b77c2a62146f4c19c5fd05fdc2b9062 100644 --- a/clang-tools-extra/clang-tidy/ClangTidy.cpp +++ b/clang-tools-extra/clang-tidy/ClangTidy.cpp @@ -458,7 +458,6 @@ ClangTidyASTConsumerFactory::createASTConsumer( if (!AnalyzerOptions.CheckersAndPackages.empty()) { setStaticAnalyzerCheckerOpts(Context.getOptions(), AnalyzerOptions); AnalyzerOptions.AnalysisDiagOpt = PD_NONE; - AnalyzerOptions.eagerlyAssumeBinOpBifurcation = true; std::unique_ptr<ento::AnalysisASTConsumer> AnalysisConsumer = ento::CreateAnalysisConsumer(Compiler); AnalysisConsumer->AddDiagnosticConsumer( diff --git a/clang-tools-extra/clang-tidy/misc/UseInternalLinkageCheck.cpp b/clang-tools-extra/clang-tidy/misc/UseInternalLinkageCheck.cpp index c086e7721e02bda51f680a306be2270066a27bf1..d900978f65a944f65915317f30173ade258f613f 100644 --- a/clang-tools-extra/clang-tidy/misc/UseInternalLinkageCheck.cpp +++ b/clang-tools-extra/clang-tidy/misc/UseInternalLinkageCheck.cpp @@ -8,12 +8,15 @@ #include "UseInternalLinkageCheck.h" #include "../utils/FileExtensionsUtils.h" +#include "../utils/LexerUtils.h" #include "clang/AST/Decl.h" #include "clang/ASTMatchers/ASTMatchFinder.h" #include "clang/ASTMatchers/ASTMatchers.h" #include "clang/ASTMatchers/ASTMatchersMacros.h" #include "clang/Basic/SourceLocation.h" #include "clang/Basic/Specifiers.h" +#include "clang/Basic/TokenKinds.h" +#include "clang/Lex/Token.h" #include "llvm/ADT/STLExtras.h" using namespace clang::ast_matchers; @@ -113,7 +116,7 @@ static constexpr StringRef Message = void UseInternalLinkageCheck::check(const MatchFinder::MatchResult &Result) { if (const auto *FD = Result.Nodes.getNodeAs<FunctionDecl>("fn")) { DiagnosticBuilder DB = diag(FD->getLocation(), Message) << "function" << FD; - SourceLocation FixLoc = FD->getTypeSpecStartLoc(); + const SourceLocation FixLoc =
FD->getInnerLocStart(); if (FixLoc.isInvalid() || FixLoc.isMacroID()) return; if (FixMode == FixModeKind::UseStatic) @@ -128,7 +131,7 @@ void UseInternalLinkageCheck::check(const MatchFinder::MatchResult &Result) { return; DiagnosticBuilder DB = diag(VD->getLocation(), Message) << "variable" << VD; - SourceLocation FixLoc = VD->getTypeSpecStartLoc(); + const SourceLocation FixLoc = VD->getInnerLocStart(); if (FixLoc.isInvalid() || FixLoc.isMacroID()) return; if (FixMode == FixModeKind::UseStatic) diff --git a/clang-tools-extra/clang-tidy/utils/LexerUtils.cpp b/clang-tools-extra/clang-tidy/utils/LexerUtils.cpp index df2b0bef576ca37a4842fbbf30e37480b69f5422..92c3e0ed7894e1eaef71911cb0a37dc00923ff66 100644 --- a/clang-tools-extra/clang-tidy/utils/LexerUtils.cpp +++ b/clang-tools-extra/clang-tidy/utils/LexerUtils.cpp @@ -24,13 +24,15 @@ getPreviousTokenAndStart(SourceLocation Location, const SourceManager &SM, if (Location.isInvalid()) return {Token, Location}; - auto StartOfFile = SM.getLocForStartOfFile(SM.getFileID(Location)); + const auto StartOfFile = SM.getLocForStartOfFile(SM.getFileID(Location)); while (Location != StartOfFile) { Location = Lexer::GetBeginningOfToken(Location, SM, LangOpts); if (!Lexer::getRawToken(Location, Token, SM, LangOpts) && (!SkipComments || !Token.is(tok::comment))) { break; } + if (Location == StartOfFile) + return {Token, Location}; Location = Location.getLocWithOffset(-1); } return {Token, Location}; diff --git a/clang-tools-extra/clangd/test/.gitattributes b/clang-tools-extra/clangd/test/.gitattributes new file mode 100644 index 0000000000000000000000000000000000000000..20971adc2b5d0339d95fdc505e6e42b34b8419a3 --- /dev/null +++ b/clang-tools-extra/clangd/test/.gitattributes @@ -0,0 +1,3 @@ +input-mirror.test text eol=crlf +too_large.test text eol=crlf +protocol.test text eol=crlf diff --git a/clang-tools-extra/clangd/test/input-mirror.test b/clang-tools-extra/clangd/test/input-mirror.test index a34a4a08cf60cfaedadf9c8cc78fd7aee05b0726..bce3f9923a3b9011f7febc5d308f98686bc29e39 100644 --- a/clang-tools-extra/clangd/test/input-mirror.test +++ b/clang-tools-extra/clangd/test/input-mirror.test @@ -1,17 +1,17 @@ -# RUN: clangd -pretty -sync -input-mirror-file %t < %s -# Note that we have to use '-b' as -input-mirror-file does not have a newline at the end of file. -# RUN: diff -b %t %s -# It is absolutely vital that this file has CRLF line endings. -# -Content-Length: 125 - -{"jsonrpc":"2.0","id":0,"method":"initialize","params":{"processId":123,"rootPath":"clangd","capabilities":{},"trace":"off"}} -Content-Length: 172 - -{"jsonrpc":"2.0","method":"textDocument/didOpen","params":{"textDocument":{"uri":"test:///main.cpp","languageId":"cpp","version":1,"text":"int main() {\nint a;\na;\n}\n"}}} -Content-Length: 44 - -{"jsonrpc":"2.0","id":3,"method":"shutdown"} -Content-Length: 33 - -{"jsonrpc":"2.0","method":"exit"} +# RUN: clangd -pretty -sync -input-mirror-file %t < %s +# Note that we have to use '-b' as -input-mirror-file does not have a newline at the end of file. +# RUN: diff -b %t %s +# It is absolutely vital that this file has CRLF line endings. 
+# +Content-Length: 125 + +{"jsonrpc":"2.0","id":0,"method":"initialize","params":{"processId":123,"rootPath":"clangd","capabilities":{},"trace":"off"}} +Content-Length: 172 + +{"jsonrpc":"2.0","method":"textDocument/didOpen","params":{"textDocument":{"uri":"test:///main.cpp","languageId":"cpp","version":1,"text":"int main() {\nint a;\na;\n}\n"}}} +Content-Length: 44 + +{"jsonrpc":"2.0","id":3,"method":"shutdown"} +Content-Length: 33 + +{"jsonrpc":"2.0","method":"exit"} diff --git a/clang-tools-extra/clangd/test/protocol.test b/clang-tools-extra/clangd/test/protocol.test index 5e852d1d9deebca1791566d85d96bfcc8594b10a..64ccfaef18911157123ca4f6b6710464b084bf8f 100644 --- a/clang-tools-extra/clangd/test/protocol.test +++ b/clang-tools-extra/clangd/test/protocol.test @@ -1,113 +1,113 @@ -# RUN: not clangd -pretty -sync -enable-test-uri-scheme < %s | FileCheck -strict-whitespace %s -# RUN: not clangd -pretty -sync -enable-test-uri-scheme < %s 2>&1 | FileCheck -check-prefix=STDERR %s -# vim: fileformat=dos -# It is absolutely vital that this file has CRLF line endings. -# -# Note that we invert the test because we intent to let clangd exit prematurely. -# -# Test protocol parsing -Content-Length: 125 -Content-Type: application/vscode-jsonrpc; charset-utf-8 - -{"jsonrpc":"2.0","id":0,"method":"initialize","params":{"processId":123,"rootPath":"clangd","capabilities":{},"trace":"off"}} -# Test message with Content-Type after Content-Length -# -# CHECK: "jsonrpc": "2.0", -# CHECK-NEXT: "result": { -# CHECK: } -Content-Length: 246 - -{"jsonrpc":"2.0","method":"textDocument/didOpen","params":{"textDocument":{"uri":"test:///main.cpp","languageId":"cpp","version":1,"text":"struct fake { int a, bb, ccc; int f(int i, const float f) const; };\nint main() {\n fake f;\n f.\n}\n"}}} - -Content-Length: 104 - -{"jsonrpc":"2.0","method":"textDocument/didChange","params":{"textDocument":{"uri":"test:///main.cpp"}}} - -Content-Type: application/vscode-jsonrpc; charset-utf-8 -Content-Length: 146 - -{"jsonrpc":"2.0","id":1,"method":"textDocument/completion","params":{"textDocument":{"uri":"test:/main.cpp"},"position":{"line":3,"character":5}}} -# Test message with Content-Type before Content-Length -# -# CHECK: "id": 1, -# CHECK-NEXT: "jsonrpc": "2.0", -# CHECK-NEXT: "result": { -# CHECK-NEXT: "isIncomplete": false, -# CHECK-NEXT: "items": [ -# CHECK: "filterText": "a", -# CHECK-NEXT: "insertText": "a", -# CHECK-NEXT: "insertTextFormat": 1, -# CHECK-NEXT: "kind": 5, -# CHECK-NEXT: "label": " a", -# CHECK-NEXT: "score": {{[0-9]+.[0-9]+}}, -# CHECK-NEXT: "sortText": "{{.*}}" -# CHECK: ] -# CHECK-NEXT: } - -X-Test: Testing -Content-Type: application/vscode-jsonrpc; charset-utf-8 -Content-Length: 146 -Content-Type: application/vscode-jsonrpc; charset-utf-8 -X-Testing: Test - -{"jsonrpc":"2.0","id":2,"method":"textDocument/completion","params":{"textDocument":{"uri":"test:/main.cpp"},"position":{"line":3,"character":5}}} - -Content-Type: application/vscode-jsonrpc; charset-utf-8 -Content-Length: 10 -Content-Length: 146 - -{"jsonrpc":"2.0","id":3,"method":"textDocument/completion","params":{"textDocument":{"uri":"test:/main.cpp"},"position":{"line":3,"character":5}}} -# Test message with duplicate Content-Length headers -# -# CHECK: "id": 3, -# CHECK-NEXT: "jsonrpc": "2.0", -# CHECK-NEXT: "result": { -# CHECK-NEXT: "isIncomplete": false, -# CHECK-NEXT: "items": [ -# CHECK: "filterText": "a", -# CHECK-NEXT: "insertText": "a", -# CHECK-NEXT: "insertTextFormat": 1, -# CHECK-NEXT: "kind": 5, -# CHECK-NEXT: "label": " a", -# 
CHECK-NEXT: "score": {{[0-9]+.[0-9]+}}, -# CHECK-NEXT: "sortText": "{{.*}}" -# CHECK: ] -# CHECK-NEXT: } -# STDERR: Warning: Duplicate Content-Length header received. The previous value for this message (10) was ignored. - -Content-Type: application/vscode-jsonrpc; charset-utf-8 -Content-Length: 10 - -{"jsonrpc":"2.0","id":4,"method":"textDocument/completion","params":{"textDocument":{"uri":"test:/main.cpp"},"position":{"line":3,"character":5}}} -# Test message with malformed Content-Length -# -# STDERR: JSON parse error -# Ensure we recover by sending another (valid) message - -Content-Length: 146 - -{"jsonrpc":"2.0","id":5,"method":"textDocument/completion","params":{"textDocument":{"uri":"test:/main.cpp"},"position":{"line":3,"character":5}}} -# Test message with Content-Type before Content-Length -# -# CHECK: "id": 5, -# CHECK-NEXT: "jsonrpc": "2.0", -# CHECK-NEXT: "result": { -# CHECK-NEXT: "isIncomplete": false, -# CHECK-NEXT: "items": [ -# CHECK: "filterText": "a", -# CHECK-NEXT: "insertText": "a", -# CHECK-NEXT: "insertTextFormat": 1, -# CHECK-NEXT: "kind": 5, -# CHECK-NEXT: "label": " a", -# CHECK-NEXT: "score": {{[0-9]+.[0-9]+}}, -# CHECK-NEXT: "sortText": "{{.*}}" -# CHECK: ] -# CHECK-NEXT: } -Content-Length: 1024 - -{"jsonrpc":"2.0","id":5,"method":"textDocument/completion","params":{"textDocument":{"uri":"test:/main.cpp"},"position":{"line":3,"character":5}}} -# Test message which reads beyond the end of the stream. -# -# Ensure this is the last test in the file! -# STDERR: Input was aborted. Read only {{[0-9]+}} bytes of expected {{[0-9]+}}. - +# RUN: not clangd -pretty -sync -enable-test-uri-scheme < %s | FileCheck -strict-whitespace %s +# RUN: not clangd -pretty -sync -enable-test-uri-scheme < %s 2>&1 | FileCheck -check-prefix=STDERR %s +# vim: fileformat=dos +# It is absolutely vital that this file has CRLF line endings. +# +# Note that we invert the test because we intent to let clangd exit prematurely. 
+# +# Test protocol parsing +Content-Length: 125 +Content-Type: application/vscode-jsonrpc; charset-utf-8 + +{"jsonrpc":"2.0","id":0,"method":"initialize","params":{"processId":123,"rootPath":"clangd","capabilities":{},"trace":"off"}} +# Test message with Content-Type after Content-Length +# +# CHECK: "jsonrpc": "2.0", +# CHECK-NEXT: "result": { +# CHECK: } +Content-Length: 246 + +{"jsonrpc":"2.0","method":"textDocument/didOpen","params":{"textDocument":{"uri":"test:///main.cpp","languageId":"cpp","version":1,"text":"struct fake { int a, bb, ccc; int f(int i, const float f) const; };\nint main() {\n fake f;\n f.\n}\n"}}} + +Content-Length: 104 + +{"jsonrpc":"2.0","method":"textDocument/didChange","params":{"textDocument":{"uri":"test:///main.cpp"}}} + +Content-Type: application/vscode-jsonrpc; charset-utf-8 +Content-Length: 146 + +{"jsonrpc":"2.0","id":1,"method":"textDocument/completion","params":{"textDocument":{"uri":"test:/main.cpp"},"position":{"line":3,"character":5}}} +# Test message with Content-Type before Content-Length +# +# CHECK: "id": 1, +# CHECK-NEXT: "jsonrpc": "2.0", +# CHECK-NEXT: "result": { +# CHECK-NEXT: "isIncomplete": false, +# CHECK-NEXT: "items": [ +# CHECK: "filterText": "a", +# CHECK-NEXT: "insertText": "a", +# CHECK-NEXT: "insertTextFormat": 1, +# CHECK-NEXT: "kind": 5, +# CHECK-NEXT: "label": " a", +# CHECK-NEXT: "score": {{[0-9]+.[0-9]+}}, +# CHECK-NEXT: "sortText": "{{.*}}" +# CHECK: ] +# CHECK-NEXT: } + +X-Test: Testing +Content-Type: application/vscode-jsonrpc; charset-utf-8 +Content-Length: 146 +Content-Type: application/vscode-jsonrpc; charset-utf-8 +X-Testing: Test + +{"jsonrpc":"2.0","id":2,"method":"textDocument/completion","params":{"textDocument":{"uri":"test:/main.cpp"},"position":{"line":3,"character":5}}} + +Content-Type: application/vscode-jsonrpc; charset-utf-8 +Content-Length: 10 +Content-Length: 146 + +{"jsonrpc":"2.0","id":3,"method":"textDocument/completion","params":{"textDocument":{"uri":"test:/main.cpp"},"position":{"line":3,"character":5}}} +# Test message with duplicate Content-Length headers +# +# CHECK: "id": 3, +# CHECK-NEXT: "jsonrpc": "2.0", +# CHECK-NEXT: "result": { +# CHECK-NEXT: "isIncomplete": false, +# CHECK-NEXT: "items": [ +# CHECK: "filterText": "a", +# CHECK-NEXT: "insertText": "a", +# CHECK-NEXT: "insertTextFormat": 1, +# CHECK-NEXT: "kind": 5, +# CHECK-NEXT: "label": " a", +# CHECK-NEXT: "score": {{[0-9]+.[0-9]+}}, +# CHECK-NEXT: "sortText": "{{.*}}" +# CHECK: ] +# CHECK-NEXT: } +# STDERR: Warning: Duplicate Content-Length header received. The previous value for this message (10) was ignored. 
+ +Content-Type: application/vscode-jsonrpc; charset-utf-8 +Content-Length: 10 + +{"jsonrpc":"2.0","id":4,"method":"textDocument/completion","params":{"textDocument":{"uri":"test:/main.cpp"},"position":{"line":3,"character":5}}} +# Test message with malformed Content-Length +# +# STDERR: JSON parse error +# Ensure we recover by sending another (valid) message + +Content-Length: 146 + +{"jsonrpc":"2.0","id":5,"method":"textDocument/completion","params":{"textDocument":{"uri":"test:/main.cpp"},"position":{"line":3,"character":5}}} +# Test message with Content-Type before Content-Length +# +# CHECK: "id": 5, +# CHECK-NEXT: "jsonrpc": "2.0", +# CHECK-NEXT: "result": { +# CHECK-NEXT: "isIncomplete": false, +# CHECK-NEXT: "items": [ +# CHECK: "filterText": "a", +# CHECK-NEXT: "insertText": "a", +# CHECK-NEXT: "insertTextFormat": 1, +# CHECK-NEXT: "kind": 5, +# CHECK-NEXT: "label": " a", +# CHECK-NEXT: "score": {{[0-9]+.[0-9]+}}, +# CHECK-NEXT: "sortText": "{{.*}}" +# CHECK: ] +# CHECK-NEXT: } +Content-Length: 1024 + +{"jsonrpc":"2.0","id":5,"method":"textDocument/completion","params":{"textDocument":{"uri":"test:/main.cpp"},"position":{"line":3,"character":5}}} +# Test message which reads beyond the end of the stream. +# +# Ensure this is the last test in the file! +# STDERR: Input was aborted. Read only {{[0-9]+}} bytes of expected {{[0-9]+}}. + diff --git a/clang-tools-extra/clangd/test/too_large.test b/clang-tools-extra/clangd/test/too_large.test index 7df981e7942073a8ec616c7e01ba21bf4cb15461..6986bd5e258e87a89444d9e53f39ac9fc317ee7c 100644 --- a/clang-tools-extra/clangd/test/too_large.test +++ b/clang-tools-extra/clangd/test/too_large.test @@ -1,7 +1,7 @@ -# RUN: not clangd -sync < %s 2>&1 | FileCheck -check-prefix=STDERR %s -# vim: fileformat=dos -# It is absolutely vital that this file has CRLF line endings. -# -Content-Length: 2147483648 - -# STDERR: Refusing to read message +# RUN: not clangd -sync < %s 2>&1 | FileCheck -check-prefix=STDERR %s +# vim: fileformat=dos +# It is absolutely vital that this file has CRLF line endings. +# +Content-Length: 2147483648 + +# STDERR: Refusing to read message diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst index 95be0a89cd6c93a5ba88d76e3e9131f378b45638..e8148e06b6af2872b557edd4a288026cd73fdc33 100644 --- a/clang-tools-extra/docs/ReleaseNotes.rst +++ b/clang-tools-extra/docs/ReleaseNotes.rst @@ -195,6 +195,10 @@ Changes in existing checks ` check to fix false positive when using loop variable in initializer of lambda capture. +- Improved :doc:`misc-use-internal-linkage + ` check to insert ``static`` keyword + before type qualifiers such as ``const`` and ``volatile``. + - Improved :doc:`modernize-min-max-use-initializer-list ` check by fixing a false positive when only an implicit conversion happened inside an diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/reserved-identifier.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/reserved-identifier.rst index a498ff8409af3ae0d00e09da274db71c1c44e19f..3f6cee9b3bb5aa97f442999eca91e644b230d0d0 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/bugprone/reserved-identifier.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/reserved-identifier.rst @@ -28,7 +28,7 @@ Violating the naming rules above results in undefined behavior. int _g(); // disallowed in global namespace only The check can also be inverted, i.e. it can be configured to flag any -identifier that is _not_ a reserved identifier. This mode is for use by e.g. 
+identifier that is *not* a reserved identifier. This mode is for use by e.g. standard library implementors, to ensure they don't infringe on the user namespace. diff --git a/clang-tools-extra/include-cleaner/test/tool.cpp b/clang-tools-extra/include-cleaner/test/tool.cpp index 2155eec189d186dffd86923d2945ce3252da1ffe..d72d2317ce2b1da26bc67fd9c8e91a76b6eeae1b 100644 --- a/clang-tools-extra/include-cleaner/test/tool.cpp +++ b/clang-tools-extra/include-cleaner/test/tool.cpp @@ -48,3 +48,13 @@ int x = foo(); // RUN: clang-include-cleaner -edit --ignore-headers="foobar\.h,foo\.h" %t.cpp -- -I%S/Inputs/ // RUN: FileCheck --match-full-lines --check-prefix=EDIT2 %s < %t.cpp // EDIT2-NOT: {{^}}#include "foo.h"{{$}} + +// RUN: rm -rf %t.dir && mkdir -p %t.dir +// RUN: cp %s %t.cpp +// RUN: echo "[{\"directory\":\"%t.dir\",\"file\":\"../%{t:stem}.tmp.cpp\",\"command\":\":clang++ -I%S/Inputs/ ../%{t:stem}.tmp.cpp\"}]" | sed -e 's/\\/\\\\/g' > %t.dir/compile_commands.json +// RUN: pushd %t.dir +// RUN: clang-include-cleaner -p %{t:stem}.tmp.dir -edit ../%{t:stem}.tmp.cpp +// RUN: popd +// RUN: FileCheck --match-full-lines --check-prefix=EDIT3 %s < %t.cpp +// EDIT3: #include "foo.h" +// EDIT3-NOT: {{^}}#include "foobar.h"{{$}} diff --git a/clang-tools-extra/include-cleaner/tool/IncludeCleaner.cpp b/clang-tools-extra/include-cleaner/tool/IncludeCleaner.cpp index 080099adc9a07aa04c2b8b21726c9b4603048fc0..f85dbc0e0c31f23d007b58678e5332cb047d35c9 100644 --- a/clang-tools-extra/include-cleaner/tool/IncludeCleaner.cpp +++ b/clang-tools-extra/include-cleaner/tool/IncludeCleaner.cpp @@ -173,9 +173,11 @@ private: if (!HTMLReportPath.empty()) writeHTML(); - llvm::StringRef Path = - SM.getFileEntryRefForID(SM.getMainFileID())->getName(); - assert(!Path.empty() && "Main file path not known?"); + // The main file's path from the compiler invocation, converted to an absolute path. + llvm::SmallString<256> AbsPath( + SM.getFileEntryRefForID(SM.getMainFileID())->getName()); + assert(!AbsPath.empty() && "Main file path not known?"); + SM.getFileManager().makeAbsolutePath(AbsPath); llvm::StringRef Code = SM.getBufferData(SM.getMainFileID()); auto Results = @@ -185,7 +187,7 @@ private: Results.Missing.clear(); if (!Remove) Results.Unused.clear(); - std::string Final = fixIncludes(Results, Path, Code, getStyle(Path)); + std::string Final = fixIncludes(Results, AbsPath, Code, getStyle(AbsPath)); if (Print.getNumOccurrences()) { switch (Print) { @@ -202,7 +204,7 @@ private: } if (!Results.Missing.empty() || !Results.Unused.empty()) - EditedFiles.try_emplace(Path, Final); + EditedFiles.try_emplace(AbsPath, Final); } void writeHTML() { @@ -280,6 +282,48 @@ std::function<bool(llvm::StringRef)> headerFilter() { }; } +// Maps the absolute path of each file of each compilation command to the +// absolute path of the input file. +llvm::Expected<std::map<std::string, std::string>> +mapInputsToAbsPaths(clang::tooling::CompilationDatabase &CDB, + llvm::IntrusiveRefCntPtr<llvm::vfs::FileSystem> VFS, + const std::vector<std::string> &Inputs) { + std::map<std::string, std::string> CDBToAbsPaths; + // `Factory.editedFiles()` will contain the final code, along with the + // path given in the compilation database. That path can be + // absolute or relative, and if it is relative, it is relative to the + // "Directory" field in the compilation database. We need to make it + // absolute to write the final code to the correct path.
+ for (auto &Source : Inputs) { + llvm::SmallString<256> AbsPath(Source); + if (auto Err = VFS->makeAbsolute(AbsPath)) { + llvm::errs() << "Failed to get absolute path for " << Source << " : " + << Err.message() << '\n'; + return llvm::errorCodeToError(Err); + } + std::vector<clang::tooling::CompileCommand> Cmds = + CDB.getCompileCommands(AbsPath); + if (Cmds.empty()) { + // It should be found in the compilation database: even if the user didn't + // specify one, the `FixedCompilationDatabase` will + // create an entry from the arguments. So it is an error if we can't + // find the compile commands. + std::string ErrorMsg = + llvm::formatv("No compile commands found for {0}", AbsPath).str(); + llvm::errs() << ErrorMsg << '\n'; + return llvm::make_error<llvm::StringError>( + ErrorMsg, llvm::inconvertibleErrorCode()); + } + for (const auto &Cmd : Cmds) { + llvm::SmallString<256> CDBPath(Cmd.Filename); + std::string Directory(Cmd.Directory); + llvm::sys::fs::make_absolute(Cmd.Directory, CDBPath); + CDBToAbsPaths[std::string(CDBPath)] = std::string(AbsPath); + } + } + return CDBToAbsPaths; +} + } // namespace } // namespace include_cleaner } // namespace clang @@ -305,8 +349,16 @@ int main(int argc, const char **argv) { } } - clang::tooling::ClangTool Tool(OptionsParser->getCompilations(), - OptionsParser->getSourcePathList()); + auto VFS = llvm::vfs::getRealFileSystem(); + auto &CDB = OptionsParser->getCompilations(); + // CDBToAbsPaths is a map from the path in the compilation database to the + // writable absolute path of the file. + auto CDBToAbsPaths = + mapInputsToAbsPaths(CDB, VFS, OptionsParser->getSourcePathList()); + if (!CDBToAbsPaths) + return 1; + + clang::tooling::ClangTool Tool(CDB, OptionsParser->getSourcePathList()); auto HeaderFilter = headerFilter(); if (!HeaderFilter) @@ -316,6 +368,10 @@ int main(int argc, const char **argv) { if (Edit) { for (const auto &NameAndContent : Factory.editedFiles()) { llvm::StringRef FileName = NameAndContent.first(); + if (auto It = CDBToAbsPaths->find(FileName.str()); + It != CDBToAbsPaths->end()) + FileName = It->second; + const std::string &FinalCode = NameAndContent.second; if (auto Err = llvm::writeToOutput( FileName, [&](llvm::raw_ostream &OS) -> llvm::Error { diff --git a/clang-tools-extra/test/clang-tidy/checkers/misc/use-internal-linkage-func.cpp b/clang-tools-extra/test/clang-tidy/checkers/misc/use-internal-linkage-func.cpp index 9c91389542b03d8432a8c0e4987063c2a41d67c4..8dc739da3a2734b299433d338a9ec26f85b802db 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/misc/use-internal-linkage-func.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/misc/use-internal-linkage-func.cpp @@ -17,6 +17,41 @@ void func_cpp_inc(); // CHECK-MESSAGES: :[[@LINE-1]]:6: warning: function 'func_cpp_inc' // CHECK-FIXES: static void func_cpp_inc(); +int* func_cpp_inc_return_ptr(); +// CHECK-MESSAGES: :[[@LINE-1]]:6: warning: function 'func_cpp_inc_return_ptr' +// CHECK-FIXES: static int* func_cpp_inc_return_ptr(); + +const int* func_cpp_inc_return_const_ptr(); +// CHECK-MESSAGES: :[[@LINE-1]]:12: warning: function 'func_cpp_inc_return_const_ptr' +// CHECK-FIXES: static const int* func_cpp_inc_return_const_ptr(); + +int const* func_cpp_inc_return_ptr_const(); +// CHECK-MESSAGES: :[[@LINE-1]]:12: warning: function 'func_cpp_inc_return_ptr_const' +// CHECK-FIXES: static int const* func_cpp_inc_return_ptr_const(); + +int * const func_cpp_inc_return_const(); +// CHECK-MESSAGES: :[[@LINE-1]]:13: warning: function 'func_cpp_inc_return_const' +// CHECK-FIXES: static int * const
func_cpp_inc_return_const(); + +volatile const int* func_cpp_inc_return_volatile_const_ptr(); +// CHECK-MESSAGES: :[[@LINE-1]]:21: warning: function 'func_cpp_inc_return_volatile_const_ptr' +// CHECK-FIXES: static volatile const int* func_cpp_inc_return_volatile_const_ptr(); + +[[nodiscard]] void func_nodiscard(); +// CHECK-MESSAGES: :[[@LINE-1]]:20: warning: function 'func_nodiscard' +// CHECK-FIXES: {{\[\[nodiscard\]\]}} static void func_nodiscard(); + +#define NDS [[nodiscard]] +#define NNDS + +NDS void func_nds(); +// CHECK-MESSAGES: :[[@LINE-1]]:10: warning: function 'func_nds' +// CHECK-FIXES: NDS static void func_nds(); + +NNDS void func_nnds(); +// CHECK-MESSAGES: :[[@LINE-1]]:11: warning: function 'func_nnds' +// CHECK-FIXES: NNDS static void func_nnds(); + #include "func_cpp.inc" void func_h_inc(); diff --git a/clang-tools-extra/test/clang-tidy/checkers/misc/use-internal-linkage-var.cpp b/clang-tools-extra/test/clang-tidy/checkers/misc/use-internal-linkage-var.cpp index 6777ce4bb0265e9b117f640b8e5e47dd72dd5f5b..901272e40b8f24dadaf74c01ae440946de8a04aa 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/misc/use-internal-linkage-var.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/misc/use-internal-linkage-var.cpp @@ -13,6 +13,18 @@ T global_template; // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: variable 'global_template' // CHECK-FIXES: static T global_template; +int const* ptr_const_star; +// CHECK-MESSAGES: :[[@LINE-1]]:12: warning: variable 'ptr_const_star' +// CHECK-FIXES: static int const* ptr_const_star; + +const int* const_ptr_star; +// CHECK-MESSAGES: :[[@LINE-1]]:12: warning: variable 'const_ptr_star' +// CHECK-FIXES: static const int* const_ptr_star; + +const volatile int* const_volatile_ptr_star; +// CHECK-MESSAGES: :[[@LINE-1]]:21: warning: variable 'const_volatile_ptr_star' +// CHECK-FIXES: static const volatile int* const_volatile_ptr_star; + int gloabl_header; extern int global_extern; diff --git a/clang/docs/ClangFormatStyleOptions.rst b/clang/docs/ClangFormatStyleOptions.rst index 8add0a53e5be136c83b14af36ffc0033135e942d..f36a5472b7e17d64b011b96a95ef9d054e560d01 100644 --- a/clang/docs/ClangFormatStyleOptions.rst +++ b/clang/docs/ClangFormatStyleOptions.rst @@ -5505,6 +5505,31 @@ the configuration (without a prefix: ``Auto``). } } +.. _RemoveEmptyLinesInUnwrappedLines: + +**RemoveEmptyLinesInUnwrappedLines** (``Boolean``) :versionbadge:`clang-format 20` :ref:`¶ ` + Remove empty lines within unwrapped lines. + + .. code-block:: c++ + + false: true: + + int c vs. int c = a + b; + + = a + b; + + enum : unsigned vs. enum : unsigned { + AA = 0, + { BB + AA = 0, } myEnum; + BB + } myEnum; + + while ( vs. while (true) { + } + true) { + } + .. _RemoveParentheses: **RemoveParentheses** (``RemoveParenthesesStyle``) :versionbadge:`clang-format 17` :ref:`¶ ` diff --git a/clang/docs/LibASTMatchersReference.html b/clang/docs/LibASTMatchersReference.html index a16b9c44ef0eab493d07b6a148ee20519430da03..c6307954d7f1bb88dbae8cc62833271861fdada9 100644 --- a/clang/docs/LibASTMatchersReference.html +++ b/clang/docs/LibASTMatchersReference.html @@ -7239,9 +7239,9 @@ Given void z(Y y, X x) { y.m(); x.m(); x.g(); (g()).m(); } cxxMemberCallExpr(onImplicitObjectArgument(hasType( cxxRecordDecl(hasName("Y"))))) - matches `y.m()`, `x.m()` and (g()).m(), but not `x.g()`. + matches `y.m()`, `x.m()` and `(g()).m()`, but not `x.g()`. cxxMemberCallExpr(on(callExpr())) - does not match `(g()).m()`, because the parens are not ignored. + only matches `(g()).m()` (the parens are ignored).
FIXME: Overload to allow directly matching types? diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 817e3abef8d5669cd29601cef58c31c0febaf73f..a65bd6f382901bf41ea7705836c44482aa6f020d 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -99,6 +99,26 @@ C++ Specific Potentially Breaking Changes // Was error, now evaluates to false. constexpr bool b = f() == g(); +- Clang will now correctly not consider pointers to non-class types for covariance + and will disallow changing the return type to a type that doesn't have the same or less cv-qualification. + + .. code-block:: c++ + + struct A { + virtual const int *f() const; + virtual const std::string *g() const; + }; + struct B : A { + // Return type has less cv-qualification but doesn't point to a class. + // Error will be generated. + int *f() const override; + + // Return type's cv-qualification is neither the same as nor less than + // the overridden return type's. + // Error will be generated. + volatile std::string *g() const override; + }; + - The warning ``-Wdeprecated-literal-operator`` is now on by default, as this is something that WG21 has shown interest in removing from the language. The result is that anyone who is compiling with ``-Werror`` should see this @@ -599,12 +619,19 @@ X86 Support Arm and AArch64 Support ^^^^^^^^^^^^^^^^^^^^^^^ +- In the ARM target, the frame pointer (FP) of a leaf function can be retained + by using the ``-fno-omit-frame-pointer`` option. If you want to eliminate the FP + in leaf functions after enabling ``-fno-omit-frame-pointer``, you can do so by adding + the ``-momit-leaf-frame-pointer`` option. + Android Support ^^^^^^^^^^^^^^^ Windows Support ^^^^^^^^^^^^^^^ +- clang-cl now supports ``/std:c++23preview``, which enables C++23 features. + - Clang no longer allows references inside a union when emulating MSVC 1900+ even if `fms-extensions` is enabled. Starting with VS2015, MSVC 1900, this Microsoft extension is no longer allowed and always results in an error. Clang now follows the MSVC behavior in this scenario. @@ -672,8 +699,10 @@ clang-format - Adds ``BreakBinaryOperations`` option. - Adds ``TemplateNames`` option. - Adds ``AlignFunctionDeclarations`` option to ``AlignConsecutiveDeclarations``. -- Adds ``IndentOnly`` suboption to ``ReflowComments`` to fix the indentation of multi-line comments - without touching their contents, renames ``false`` to ``Never``, and ``true`` to ``Always``. +- Adds ``IndentOnly`` suboption to ``ReflowComments`` to fix the indentation of + multi-line comments without touching their contents, renames ``false`` to + ``Never``, and ``true`` to ``Always``. +- Adds ``RemoveEmptyLinesInUnwrappedLines`` option. libclang -------- diff --git a/clang/docs/SafeBuffers.rst b/clang/docs/SafeBuffers.rst new file mode 100644 index 0000000000000000000000000000000000000000..144c3a76a5832f1f169f9e4fd57128d3717bce51 --- /dev/null +++ b/clang/docs/SafeBuffers.rst @@ -0,0 +1,585 @@ +================ +C++ Safe Buffers +================ + +.. contents:: + :local: + + +Introduction +============ + +Clang can be used to harden your C++ code against buffer overflows, an otherwise +common security issue with C-based languages.
+ +The solution described in this document is an integrated programming model as +it combines: + +- a family of opt-in Clang warnings (``-Wunsafe-buffer-usage``) emitted + during compilation to help you update your code to encapsulate and propagate + the bounds information associated with pointers; +- runtime assertions implemented as part of + (`libc++ hardening modes <https://libcxx.llvm.org/Hardening.html>`_) + that eliminate undefined behavior as long as the coding convention + is followed and the bounds information is therefore available and correct. + +The goal of this work is to enable development of bounds-safe C++ code. It is +not a "push-button" solution; depending on your codebase's existing +coding style, significant (even if largely mechanical) changes to your code +may be necessary. However, it allows you to achieve valuable safety guarantees +on security-critical parts of your codebase. + +This solution is under active development. It is already useful for its purpose +but more work is being done to improve ergonomics and safety guarantees +and reduce adoption costs. + +The solution aligns in spirit with the "Ranges" safety profile +that was `proposed `_ +by Bjarne Stroustrup for standardization alongside other C++ safety features. + + +Pre-Requisites +============== + +In order to achieve bounds safety, your codebase needs to have access to +well-encapsulated bounds-safe container, view, and iterator types. +If your project uses libc++, standard container and view types such as +``std::vector`` and ``std::span`` can be made bounds-safe by enabling +the "fast" `hardening mode <https://libcxx.llvm.org/Hardening.html>`_ +(passing ``-D_LIBCPP_HARDENING_MODE=_LIBCPP_HARDENING_MODE_FAST`` to your +compiler) or any of the stricter hardening modes. + +In order to harden iterators, you'll need to also obtain a libc++ binary +built with ``_LIBCPP_ABI_BOUNDED_ITERATORS`` -- which is a libc++ ABI setting +that needs to be set for your entire target platform if you need to maintain +binary compatibility with the rest of the platform. + +A relatively fresh version of C++ is recommended. In particular, the very useful +standard view class ``std::span`` requires C++20. + +Other implementations of the C++ standard library may provide different +flags to enable such hardening. + +If you're using custom containers and views, they will need to be hardened +this way as well, but you don't necessarily need to do this ahead of time. + +This approach can theoretically be applied to plain C codebases, +assuming that safe primitives are developed to encapsulate all buffer accesses, +acting as "hardened custom containers" to replace raw pointers. +However, such an approach would be very unergonomic in C, and safety guarantees +will be lower due to lack of good encapsulation technology. A better approach +to bounds safety for non-C++ programs, +`-fbounds-safety <https://clang.llvm.org/docs/BoundsSafety.html>`_, +is currently in development. + +Technically, safety guarantees cannot be provided without hardening +the entire technology stack, including all of your dependencies. +However, applying such hardening technology to even a small portion +of your code may be significantly better than nothing. + + +The Programming Model for C++ +============================= + +Assuming that hardened container, view, and iterator classes are available, +what remains is to make sure they are used consistently in your code. +Below we define the specific coding convention that needs to be followed +in order to guarantee safety and how the compiler technology +around ``-Wunsafe-buffer-usage`` assists with that.
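+ +For quick reference, this is one way the pieces described above might be +combined on a compiler command line (an illustrative sketch only; the file +name is hypothetical, and the flags are exactly the ones introduced in this +document):: + + clang++ -std=c++20 -Wunsafe-buffer-usage \ + -D_LIBCPP_HARDENING_MODE=_LIBCPP_HARDENING_MODE_FAST main.cpp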
+ + +Buffer operations should never be performed over raw pointers +------------------------------------------------------------- + +Every time a memory access is made, a bounds-safe program must guarantee +that the range of accessed memory addresses falls into the boundaries +of the memory allocated for the object that's being accessed. +In order to establish such a guarantee, the information about such valid range +of addresses -- the **bounds information** associated with the accessed address +-- must be formally available every time a memory access is performed. + +A raw pointer does not naturally carry any bounds information. +The bounds information for the pointer may be available *somewhere*, but +it is not associated with the pointer in a formal manner, so a memory access +performed through a raw pointer cannot be automatically verified to be +bounds-safe by the compiler. + +That said, the Safe Buffers programming model does **not** try to eliminate +**all** pointer usage. Instead it assumes that most pointers point to +individual objects, not buffers, and therefore they typically aren't +associated with buffer overflow risks. For that reason, in order to identify +the code that requires manual intervention, it is desirable to initially shift +the focus away from the pointers themselves, and instead focus on their +**usage patterns**. + +The compiler warning ``-Wunsafe-buffer-usage`` is built to assist you +with this step of the process. A ``-Wunsafe-buffer-usage`` warning is +emitted whenever one of the following **buffer operations** is performed +on a raw pointer: + +- array indexing with ``[]``, +- pointer arithmetic, +- bounds-unsafe standard C functions such as ``std::memcpy()``, +- C++ smart pointer operations such as ``std::unique_ptr::operator[]()``, + which unfortunately cannot be made fully safe within the rules of + the C++ standard (as of C++23). + +This is sufficient for identifying each raw buffer pointer in the program at +**at least one point** during its lifetime across your software stack. + +For example, both of the following functions are flagged by +``-Wunsafe-buffer-usage`` because ``pointer`` gets identified as an unsafe +buffer pointer. Even though the second function does not directly access +the buffer, the pointer arithmetic operation inside it may easily be +the only formal "hint" in the program that the pointer does indeed point +to a buffer of multiple objects:: + + int get_last_element(int *pointer, size_t size) { + return pointer[size - 1]; // warning: unsafe buffer access + } + + int *get_last_element_ptr(int *pointer, size_t size) { + return pointer + (size - 1); // warning: unsafe pointer arithmetic + } + + +All buffers need to be encapsulated into safe container and view types +---------------------------------------------------------------------- + +It immediately follows from the previous requirement that once an unsafe pointer +is identified at any point during its lifetime, it should be immediately wrapped +into a safe container type (if the allocation site is "nearby") or a safe +view type (if the allocation site is "far away"). Not only memory accesses, +but also non-access operations such as pointer arithmetic need to be covered +this way in order to benefit from the respective runtime bounds checks.
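+ +As a minimal sketch of the "allocation site is nearby" case elaborated below +(the function and variable names here are illustrative), a local fixed-size +buffer can simply become a container, so the bounds information never leaves +the object that owns it:: + + int sum_of_buffer() { + std::array<int, 3> buf = {1, 2, 3}; // was: int buf[3] = {1, 2, 3}; + int sum = 0; + for (size_t i = 0; i < buf.size(); ++i) + sum += buf[i]; // hardened operator[] verifies the index at runtime + return sum; + }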
+ +If a **container** type (``std::array``, ``std::vector``, ``std::string``) +is used for allocating the buffer, this is the best-case scenario because +the container naturally has access to the correct bounds information for the +buffer, and the runtime bounds checks immediately kick in. Additionally, +the container type may provide automatic lifetime management for the buffer +(which may or may not be desirable). + +If a **view** type is used (``std::span``, ``std::string_view``), this typically +means that the bounds information for the "adopted" pointer needs to be passed +to the view's constructor manually. This makes runtime checks immediately +kick in with respect to the provided bounds information, which is an immediate +improvement over the raw pointer. However, this situation is still fundamentally +insufficient for security purposes, because **bounds information provided +this way cannot be guaranteed to be correct**. + +For example, the function ``get_last_element()`` we've seen in the previous +section can be made **slightly** safer this way:: + + int get_last_element(int *pointer, size_t size) { + std::span<int> sp(pointer, size); + return sp[size - 1]; // warning addressed + } + +Here ``std::span`` eliminates the potential concern that the operation +``size - 1`` may overflow when ``size`` is equal to ``0``, leading to a buffer +"underrun". However, such a program does not provide a guarantee that +the variable ``size`` correctly represents the **actual** size of the buffer +pointed to by ``pointer``. The ``std::span`` constructed this way may be ill-formed. +It may fail to protect you from overrunning the original buffer. + +The following example demonstrates one of the most dangerous anti-patterns +of this nature:: + + void convert_data(int *source_buf, size_t source_size, + int *target_buf, size_t target_size) { + // Terrible: mismatched pointer / size. + std::span<int> target_span(target_buf, source_size); + // ... + } + +The second parameter of ``std::span`` should never be the **desired** size +of the buffer. It should always be the **actual** size of the buffer. +Such code often indicates that the original code has already contained +a vulnerability -- and the use of a safe view class failed to prevent it. + +If ``target_span`` actually needs to be of size ``source_size``, a significantly +safer way to produce such a span would be to build it with the correct size +first, and then trim it to the desired size by calling ``.first()``:: + + void convert_data(int *source_buf, size_t source_size, + int *target_buf, size_t target_size) { + // Safer. + std::span<int> target_span = std::span(target_buf, target_size).first(source_size); + // ... + } + +However, these are still half-measures. This code still accepts the +bounds information from the caller in an **informal** manner, and such bounds +information cannot be guaranteed to be correct. + +In order to mitigate problems of this nature in their entirety, +the third guideline is imposed. + + +Encapsulation of bounds information must be respected continuously +------------------------------------------------------------------ + +The allocation site of the object is the only reliable source of bounds +information for that object. For objects with long lifespans across +multiple functions or even libraries in the software stack, it is essential +to formally preserve the original bounds information as it's being passed +from one piece of code to another.
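+ +A minimal sketch of such continuity, with illustrative names: the bounds +information originates at the allocation site and travels inside the +``std::span`` object rather than as a separate, informal size value:: + + int read_first(std::span<int> sp) { + return sp[0]; // bounds information arrived here formally + } + + void allocation_site() { + std::vector<int> vec(8); + read_first(vec); // the span is formed directly from the container + }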
+ +Standard container and view classes are designed to preserve bounds information +correctly **by construction**. However, they offer a number of ways to "break" +encapsulation, which may cause you to temporarily lose track of the correct +bounds information: + +- The two-parameter constructor ``std::span(ptr, size)`` allows you to + assemble an ill-formed ``std::span``; +- Conversely, you can unwrap a container or a view object into a raw pointer + and a raw size by calling its ``.data()`` and ``.size()`` methods. +- The overloaded ``operator&()`` found on container and iterator classes + acts similarly to ``.data()`` in this regard; operations such as + ``&span[0]`` and ``&*span.begin()`` are effectively unsafe. + +Additional ``-Wunsafe-buffer-usage`` warnings are emitted when encapsulation +of **standard** containers is broken in this manner. If you're using +non-standard containers, you can achieve a similar effect with facilities +described in the next section: :ref:`customization`. + +For example, our previous attempt to address the warning in +``get_last_element()`` has actually introduced a new warning along the way, +that notifies you about the potentially incorrect bounds information +passed into the two-parameter constructor of ``std::span``:: + + int get_last_element(int *pointer, size_t size) { + std::span<int> sp(pointer, size); // warning: unsafe constructor + return sp[size - 1]; + } + +In order to address this warning, you need to make the function receive +the bounds information from the allocation site in a formal manner. +The function doesn't necessarily need to know where the allocation site is; +it simply needs to be able to accept bounds information **when** it's available. +You can achieve this by refactoring the function to accept a ``std::span`` +as a parameter:: + + int get_last_element(std::span<int> sp) { + return sp[sp.size() - 1]; + } + +This solution puts the responsibility for making sure the span is well-formed +on the **caller**. They should do the same, so that eventually the +responsibility is placed on the allocation site! + +Such a definition is also very ergonomic as it naturally accepts arbitrary +standard containers without any additional code at the call site:: + + void use_last_element() { + std::vector<int> vec { 1, 2, 3 }; + int x = get_last_element(vec); // x = 3 + } + +Such code is naturally bounds-safe because bounds information is passed down +from the allocation site to the buffer access site. Only safe operations +are performed on container types. The containers are never "unforged" into +raw pointer-size pairs and never "reforged" again. This is what ideal +bounds-safe C++ code looks like. + + +.. _customization: + +Backwards Compatibility, Interoperation with Unsafe Code, Customization +======================================================================= + +Some of the code changes described above can be somewhat intrusive. +For example, changing a function that previously accepted a pointer and a size +separately, to accept a ``std::span`` instead, may require you to update +every call site of the function. This is often undesirable and sometimes +completely unacceptable when backwards compatibility is required.
+In order to facilitate **incremental adoption** of the coding convention +described above, as well as to handle various unusual situations, the compiler +provides two additional facilities to give the user more control over +``-Wunsafe-buffer-usage`` diagnostics: + +- ``#pragma clang unsafe_buffer_usage`` to mark code as unsafe and **suppress** + ``-Wunsafe-buffer-usage`` warnings in that code. +- ``[[clang::unsafe_buffer_usage]]`` to annotate potential sources of + discontinuity of bounds information -- thus introducing + **additional** ``-Wunsafe-buffer-usage`` warnings. + +In this section we describe these facilities in detail and show how they can +help you with various unusual situations. + +Suppress unwanted warnings with ``#pragma clang unsafe_buffer_usage`` +--------------------------------------------------------------------- + +If you really need to write unsafe code, you can always suppress all +``-Wunsafe-buffer-usage`` warnings in a section of code by surrounding +that code with the ``unsafe_buffer_usage`` pragma. For example, if you don't +want to address the warning in our example function ``get_last_element()``, +here is how you can suppress it:: + + int get_last_element(int *pointer, size_t size) { + #pragma clang unsafe_buffer_usage begin + return pointer[size - 1]; // warning suppressed + #pragma clang unsafe_buffer_usage end + } + +This behavior is analogous to ``#pragma clang diagnostic`` (`documentation +<https://clang.llvm.org/docs/UsersManual.html#controlling-diagnostics-via-pragmas>`_). +However, ``#pragma clang unsafe_buffer_usage`` is specialized and recommended +over ``#pragma clang diagnostic`` for a number of technical and non-technical +reasons. Most importantly, ``#pragma clang unsafe_buffer_usage`` is more +suitable for security audits because it is significantly simpler and +describes unsafe code in a more formal manner. On the contrary, +``#pragma clang diagnostic`` comes with a push/pop syntax (as opposed to +the begin/end syntax) and it offers ways to suppress warnings without +mentioning them by name (such as ``-Weverything``), which can make it +difficult to determine at a glance whether the warning is suppressed +on any given line of code. + +There are a few natural reasons to use this pragma: + +- In implementations of safe custom containers. You need this because ultimately + ``-Wunsafe-buffer-usage`` cannot help you verify that your custom container + is safe. It will naturally remind you to audit your container's implementation + to make sure it has all the necessary runtime checks, but ultimately you'll + need to suppress it once the audit is complete. +- In performance-critical code where bounds-safety-related runtime checks + cause an unacceptable performance regression. The compiler can theoretically + optimize them away (e.g. replace a repeated bounds check in a loop with + a single check before the loop) but it is not guaranteed to do that. +- For incremental adoption purposes. If you want to adopt the coding convention + gradually, you can always surround an entire file with the + ``unsafe_buffer_usage`` pragma and then "make holes" in it whenever + you address warnings on specific portions of the code. +- In the code that interoperates with unsafe code. This may be code that + will never follow the programming model (such as plain C code that will + never be converted to C++) or code that simply hasn't been converted + yet. + +Interoperation with unsafe code may require a lot of suppressions.
+You are encouraged to introduce "unsafe wrapper functions" for various unsafe +operations that you need to perform regularly. + +For example, if you regularly receive pointer/size pairs from unsafe code, +you may want to introduce a wrapper function for the unsafe span constructor:: + + #pragma clang unsafe_buffer_usage begin + + template <typename T> + std::span<T> unsafe_forge_span(T *pointer, size_t size) { + return std::span<T>(pointer, size); + } + + #pragma clang unsafe_buffer_usage end + +Such a wrapper function can be used to suppress warnings about unsafe span +constructor usage in a more ergonomic manner:: + + void use_unsafe_c_struct(unsafe_c_struct *s) { + // No warning here. + std::span<int> sp = unsafe_forge_span(s->pointer, s->size); + // ... + } + +The code remains unsafe but it also continues to be nicely readable, and it +proves that ``-Wunsafe-buffer-usage`` has done its best to notify you about +the potential unsafety. A security auditor will need to keep an eye on such +unsafe wrappers. **It is still up to you to confirm that the bounds information +passed into the wrapper is correct.** + + +Flag bounds information discontinuities with ``[[clang::unsafe_buffer_usage]]`` +------------------------------------------------------------------------------- + +The clang attribute ``[[clang::unsafe_buffer_usage]]`` +(`attribute documentation +<https://clang.llvm.org/docs/AttributeReference.html#unsafe-buffer-usage>`_) +allows the user to annotate various objects, such as functions or member +variables, as incompatible with the Safe Buffers programming model. +You are encouraged to do that for arbitrary reasons, but typically the main +reason to do that is when an unsafe function needs to be provided for +backwards compatibility. + +For example, in the previous section we've seen how the example function +``get_last_element()`` needed to have its parameter types changed in order +to preserve the continuity of bounds information when receiving a buffer pointer +from the caller. However, such a change breaks both API and ABI compatibility. +The code that previously used this function will no longer compile, nor link, +until every call site of that function is updated. You can reclaim the +backwards compatibility -- in terms of both API and ABI -- by adding +a "compatibility overload":: + + int get_last_element(std::span<int> sp) { + return sp[sp.size() - 1]; + } + + [[clang::unsafe_buffer_usage]] // Please use the new function. + int get_last_element(int *pointer, size_t size) { + // Avoid code duplication - simply invoke the safe function! + // The pragma suppresses the unsafe constructor warning. + #pragma clang unsafe_buffer_usage begin + return get_last_element(std::span(pointer, size)); + #pragma clang unsafe_buffer_usage end + } + + +Such an overload allows the surrounding code to continue to work. +It is both source-compatible and binary-compatible. It is also strictly safer +than the original function because the unsafe buffer access through a raw pointer +is replaced with a safe ``std::span`` access no matter how it's called. However, +because it requires the caller to pass the pointer and the size separately, +it violates our "bounds information continuity" principle. This means that +the callers who care about bounds safety need to be encouraged to use the +``std::span``-based overload instead.
Luckily, the attribute +``[[clang::unsafe_buffer_usage]]`` causes a ``-Wunsafe-buffer-usage`` warning +to be displayed at every call site of the compatibility overload in order to +remind the callers to update their code:: + + void use_last_element() { + std::vector<int> vec { 1, 2, 3 }; + + // no warning + int x = get_last_element(vec); + + // warning: this overload introduces unsafe buffer manipulation + int y = get_last_element(vec.data(), vec.size()); + } + +The compatibility overload can be further simplified with the help of the +``unsafe_forge_span()`` wrapper as described in the previous section -- +and it even makes the pragmas unnecessary:: + + [[clang::unsafe_buffer_usage]] // Please use the new function. + int get_last_element(int *pointer, size_t size) { + // Avoid code duplication - simply invoke the safe function! + return get_last_element(unsafe_forge_span(pointer, size)); + } + +Notice how the attribute ``[[clang::unsafe_buffer_usage]]`` does **not** +suppress the warnings within the function on its own. Similarly, functions whose +entire definitions are covered by ``#pragma clang unsafe_buffer_usage`` do +**not** become automatically annotated with the attribute +``[[clang::unsafe_buffer_usage]]``. They serve two different purposes: + +- The pragma says that the function isn't safely **written**; +- The attribute says that the function isn't safe to **use**. + +Also notice how we've made an **unsafe** wrapper for a **safe** function. +This is significantly better than making a **safe** wrapper for an **unsafe** +function. In other words, the following solution is significantly more unsafe +and undesirable than the previous solution:: + + int get_last_element(std::span<int> sp) { + // You've just added that attribute, and now you need to + // immediately suppress the warning that comes with it? + #pragma clang unsafe_buffer_usage begin + return get_last_element(sp.data(), sp.size()); + #pragma clang unsafe_buffer_usage end + } + + + [[clang::unsafe_buffer_usage]] + int get_last_element(int *pointer, size_t size) { + // This access is still completely unchecked. What's the point of having + // perfect bounds information if you aren't performing runtime checks? + #pragma clang unsafe_buffer_usage begin + return pointer[size - 1]; + #pragma clang unsafe_buffer_usage end + } + +**Structs and classes**, unlike functions, cannot be overloaded. If a struct +contains an unsafe buffer (in the form of a nested array or a pointer/size pair) +then it is typically impossible to replace it with a safe container (such as +``std::array`` or ``std::span`` respectively) without breaking the layout +of the struct and introducing both source and binary incompatibilities with +the surrounding client code. + +Additionally, member variables of a class cannot be naturally "hidden" from +client code. If a class needs to be used by clients who haven't updated to +C++20 yet, you cannot use the C++20-specific ``std::span`` as a member variable +type. If the definition of a struct is shared with plain C code that manipulates +member variables directly, you cannot use any C++-specific types for these +member variables. + +In such cases there's usually no backwards-compatible way to use safe types +directly. The best option is usually to discourage the clients from using +member variables directly by annotating the member variables with the attribute +``[[clang::unsafe_buffer_usage]]``, and then to change the interface +of the class to provide safe "accessors" to the unsafe data.
+
+For example, let's assume the worst-case scenario: ``struct foo`` is an unsafe
+struct type fully defined in a header shared between plain C code and C++
+code::
+
+    struct foo {
+      int *pointer;
+      size_t size;
+    };
+
+In this case you can achieve safety in C++ code by annotating the member
+variables as unsafe and encapsulating them into safe accessor methods::
+
+    struct foo {
+      [[clang::unsafe_buffer_usage]]
+      int *pointer;
+      [[clang::unsafe_buffer_usage]]
+      size_t size;
+
+      // Avoid showing this code to clients who are unable to digest it.
+      #if __cplusplus >= 202002L
+      std::span<int> get_pointer_as_span() {
+        #pragma clang unsafe_buffer_usage begin
+        return std::span<int>(pointer, size);
+        #pragma clang unsafe_buffer_usage end
+      }
+
+      void set_pointer_from_span(std::span<int> sp) {
+        #pragma clang unsafe_buffer_usage begin
+        pointer = sp.data();
+        size = sp.size();
+        #pragma clang unsafe_buffer_usage end
+      }
+
+      // Potentially more utility functions.
+      #endif
+    };
+
+Future Work
+===========
+
+The ``-Wunsafe-buffer-usage`` technology is in active development. The warning
+is largely ready for everyday use but it is continuously improved to reduce
+unnecessary noise as well as to cover some of the trickier unsafe operations.
+
+Fix-It Hints for ``-Wunsafe-buffer-usage``
+------------------------------------------
+
+A code transformation tool is in development that can semi-automatically
+transform large bodies of code to follow the C++ Safe Buffers programming
+model. It can currently be accessed by passing the experimental flag
+``-fsafe-buffer-usage-suggestions`` in addition to ``-Wunsafe-buffer-usage``.
+
+Fix-its produced this way currently assume the default approach described
+in this document, as they suggest standard containers and views (most notably
+``std::span`` and ``std::array``) as replacements for raw buffer pointers.
+This additionally requires libc++ hardening in order to make the runtime
+bounds checks actually happen.
+
+Static Analysis to Identify Suspicious Sources of Bounds Information
+--------------------------------------------------------------------
+
+The unsafe constructor ``span(pointer, size)`` is often a necessary evil
+when it comes to interoperation with unsafe code. However, passing the
+correct bounds information to such a constructor is often difficult.
+In order to detect those ``span(target_pointer, source_size)`` anti-patterns,
+path-sensitive analysis performed by `the clang static analyzer
+<https://clang-analyzer.llvm.org>`_ can be taught to identify situations
+when the pointer and the size are coming from "suspiciously different"
+sources.
+
+Such analysis will be able to identify the source of information with
+significantly higher precision than that of the compiler, making it much
+better at identifying incorrect bounds information in your code while
+producing significantly fewer warnings. It will also need to bypass
+``#pragma clang unsafe_buffer_usage`` suppressions and "see through"
+unsafe wrappers such as ``unsafe_forge_span()`` -- something that
+the static analyzer is naturally capable of doing.
diff --git a/clang/docs/analyzer/checkers.rst b/clang/docs/analyzer/checkers.rst
index 81264428c72ed1d9b5284cd5f8a38e4701bd88e7..58dbd686a6dc9fb60eec488a991f553bdb663ca7 100644
--- a/clang/docs/analyzer/checkers.rst
+++ b/clang/docs/analyzer/checkers.rst
@@ -3371,12 +3371,23 @@ Checks for overlap in two buffer arguments.
 Applies to: ``memcpy, mempcpy, wmemcpy, wmempcpy``.
 
 alpha.unix.cstring.NotNullTerminated (C)
 """"""""""""""""""""""""""""""""""""""""
-Check for arguments which are not null-terminated strings; applies to: ``strlen, strnlen, strcpy, strncpy, strcat, strncat, wcslen, wcsnlen``.
+Check for arguments which are not null-terminated strings;
+applies to the ``strlen``, ``strcpy``, ``strcat``, ``strcmp`` family of functions.
+
+Only very fundamental cases are detected, where the passed memory block is
+clearly not a null-terminated string. This checker does not detect cases
+where a memory buffer is passed whose terminating zero character is missing.
 
 .. code-block:: c
 
-  void test() {
-    int y = strlen((char *)&test); // warn
+  void test1() {
+    int l = strlen((char *)&test); // warn
+  }
+
+  void test2() {
+  label:
+    int l = strlen((char *)&&label); // warn
   }
 
 .. _alpha-unix-cstring-OutOfBounds:
diff --git a/clang/docs/index.rst b/clang/docs/index.rst
index f4fdc93290a0d964fff4db38bebffbc52facd88b..0f6fb36c4d3352b9f9db1b39c9ab94f56730dc00 100644
--- a/clang/docs/index.rst
+++ b/clang/docs/index.rst
@@ -25,6 +25,7 @@ Using Clang as a Compiler
    CrossCompilation
    ClangStaticAnalyzer
    ThreadSafetyAnalysis
+   SafeBuffers
    DataFlowAnalysisIntro
    AddressSanitizer
    ThreadSanitizer
diff --git a/clang/include/clang/AST/DeclTemplate.h b/clang/include/clang/AST/DeclTemplate.h
index 141f58c4600af0be3c555d8c92add43d86597f00..0f0c0bf6e4ef4f5c2adbc88c4cca201ef7c0efd5 100644
--- a/clang/include/clang/AST/DeclTemplate.h
+++ b/clang/include/clang/AST/DeclTemplate.h
@@ -2085,7 +2085,11 @@ public:
 class ClassTemplatePartialSpecializationDecl
     : public ClassTemplateSpecializationDecl {
   /// The list of template parameters
-  TemplateParameterList* TemplateParams = nullptr;
+  TemplateParameterList *TemplateParams = nullptr;
+
+  /// The set of "injected" template arguments used within this
+  /// partial specialization.
+  TemplateArgument *InjectedArgs = nullptr;
 
   /// The class template partial specialization from which this
   /// class template partial specialization was instantiated.
@@ -2132,6 +2136,10 @@ public:
     return TemplateParams;
   }
 
+  /// Retrieve the template argument list of the template parameter list
+  /// of this template.
+  ArrayRef<TemplateArgument> getInjectedTemplateArgs();
+
   /// \brief All associated constraints of this partial specialization,
   /// including the requires clause and any constraints derived from
   /// constrained-parameters.
@@ -2856,6 +2864,10 @@ class VarTemplatePartialSpecializationDecl
   /// The list of template parameters
   TemplateParameterList *TemplateParams = nullptr;
 
+  /// The set of "injected" template arguments used within this
+  /// partial specialization.
+  TemplateArgument *InjectedArgs = nullptr;
+
   /// The variable template partial specialization from which this
   /// variable template partial specialization was instantiated.
   ///
@@ -2902,6 +2914,10 @@ public:
     return TemplateParams;
   }
 
+  /// Retrieve the template argument list of the template parameter list
+  /// of this template.
+  ArrayRef<TemplateArgument> getInjectedTemplateArgs();
+
   /// \brief All associated constraints of this partial specialization,
   /// including the requires clause and any constraints derived from
   /// constrained-parameters.
diff --git a/clang/include/clang/AST/Expr.h b/clang/include/clang/AST/Expr.h index cbe62411d11bffe543b582e397a0494e23c80bd8..466c65a9685ad32cf96905591f01bb8a728344f3 100644 --- a/clang/include/clang/AST/Expr.h +++ b/clang/include/clang/AST/Expr.h @@ -6777,6 +6777,17 @@ public: getOp() <= AO__opencl_atomic_store; } + bool isHIP() const { + return Op >= AO__hip_atomic_compare_exchange_strong && + Op <= AO__hip_atomic_store; + } + + /// Return true if atomics operations targeting allocations in private memory + /// are undefined. + bool threadPrivateMemoryAtomicsAreUndefined() const { + return isOpenCL() || isHIP(); + } + SourceLocation getBuiltinLoc() const { return BuiltinLoc; } SourceLocation getRParenLoc() const { return RParenLoc; } diff --git a/clang/include/clang/AST/Type.h b/clang/include/clang/AST/Type.h index 9f979218c8a1c1166baf8ea6b0724f170bede085..b57946b454df18e32ba9bb7cfab3a0c4d3f40096 100644 --- a/clang/include/clang/AST/Type.h +++ b/clang/include/clang/AST/Type.h @@ -2662,6 +2662,7 @@ public: #include "clang/Basic/HLSLIntangibleTypes.def" bool isHLSLSpecificType() const; // Any HLSL specific type bool isHLSLIntangibleType() const; // Any HLSL intangible type + bool isHLSLAttributedResourceType() const; /// Determines if this type, which must satisfy /// isObjCLifetimeType(), is implicitly __unsafe_unretained rather @@ -6273,6 +6274,14 @@ public: : ResourceClass(ResourceClass), IsROV(IsROV), RawBuffer(RawBuffer) {} Attributes() : Attributes(llvm::dxil::ResourceClass::UAV, false, false) {} + + friend bool operator==(const Attributes &LHS, const Attributes &RHS) { + return std::tie(LHS.ResourceClass, LHS.IsROV, LHS.RawBuffer) == + std::tie(RHS.ResourceClass, RHS.IsROV, RHS.RawBuffer); + } + friend bool operator!=(const Attributes &LHS, const Attributes &RHS) { + return !(LHS == RHS); + } }; private: @@ -6282,9 +6291,9 @@ private: QualType ContainedType; const Attributes Attrs; - HLSLAttributedResourceType(QualType Canon, QualType Wrapped, - QualType Contained, const Attributes &Attrs) - : Type(HLSLAttributedResource, Canon, + HLSLAttributedResourceType(QualType Wrapped, QualType Contained, + const Attributes &Attrs) + : Type(HLSLAttributedResource, QualType(), Contained.isNull() ? 
TypeDependence::None : Contained->getDependence()), WrappedType(Wrapped), ContainedType(Contained), Attrs(Attrs) {} @@ -6292,10 +6301,11 @@ private: public: QualType getWrappedType() const { return WrappedType; } QualType getContainedType() const { return ContainedType; } + bool hasContainedType() const { return !ContainedType.isNull(); } const Attributes &getAttrs() const { return Attrs; } - bool isSugared() const { return true; } - QualType desugar() const { return getWrappedType(); } + bool isSugared() const { return false; } + QualType desugar() const { return QualType(this, 0); } void Profile(llvm::FoldingSetNodeID &ID) { Profile(ID, WrappedType, ContainedType, Attrs); @@ -6313,6 +6323,10 @@ public: static bool classof(const Type *T) { return T->getTypeClass() == HLSLAttributedResource; } + + // Returns the handle type from an HLSL resource, if the type is a resource + static const HLSLAttributedResourceType * + findHandleTypeOnResource(const Type *RT); }; class TemplateTypeParmType : public Type, public llvm::FoldingSetNode { @@ -8439,17 +8453,19 @@ inline bool Type::isOpenCLSpecificType() const { } #include "clang/Basic/HLSLIntangibleTypes.def" -inline bool Type::isHLSLSpecificType() const { +inline bool Type::isHLSLIntangibleType() const { #define HLSL_INTANGIBLE_TYPE(Name, Id, SingletonId) is##Id##Type() || return #include "clang/Basic/HLSLIntangibleTypes.def" - false; // end boolean or operation + isHLSLAttributedResourceType(); } -inline bool Type::isHLSLIntangibleType() const { - // All HLSL specific types are currently intangible type as well, but that - // might change in the future. - return isHLSLSpecificType(); +inline bool Type::isHLSLSpecificType() const { + return isHLSLIntangibleType() || isa<HLSLAttributedResourceType>(this); +} + +inline bool Type::isHLSLAttributedResourceType() const { + return isa<HLSLAttributedResourceType>(this); } inline bool Type::isTemplateTypeParmType() const { diff --git a/clang/include/clang/ASTMatchers/ASTMatchers.h b/clang/include/clang/ASTMatchers/ASTMatchers.h index f1c72efc238784b116e85cdeb7708aef0e2240c4..54e484d41fb1c3eb6b5cb88001a47a6a4cf3cce6 100644 --- a/clang/include/clang/ASTMatchers/ASTMatchers.h +++ b/clang/include/clang/ASTMatchers/ASTMatchers.h @@ -4197,9 +4197,9 @@ AST_MATCHER_P_OVERLOAD(QualType, references, internal::Matcher<QualType>, /// \endcode /// cxxMemberCallExpr(onImplicitObjectArgument(hasType( /// cxxRecordDecl(hasName("Y"))))) -/// matches `y.m()`, `x.m()` and (g()).m(), but not `x.g()`. +/// matches `y.m()`, `x.m()` and `(g()).m()`, but not `x.g()`. /// cxxMemberCallExpr(on(callExpr())) -/// does not match `(g()).m()`, because the parens are not ignored. +/// only matches `(g()).m()` (the parens are ignored). /// /// FIXME: Overload to allow directly matching types? AST_MATCHER_P(CXXMemberCallExpr, onImplicitObjectArgument, diff --git a/clang/include/clang/Analysis/FlowSensitive/CachedConstAccessorsLattice.h b/clang/include/clang/Analysis/FlowSensitive/CachedConstAccessorsLattice.h new file mode 100644 index 0000000000000000000000000000000000000000..3402d105746e88f3f8ccd41b1089c76be2fd2022 --- /dev/null +++ b/clang/include/clang/Analysis/FlowSensitive/CachedConstAccessorsLattice.h @@ -0,0 +1,217 @@ +//===-- CachedConstAccessorsLattice.h ---------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the lattice mixin that additionally maintains a cache of
+// stable method call return values to model const accessor member functions.
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_ANALYSIS_FLOWSENSITIVE_CACHED_CONST_ACCESSORS_LATTICE_H
+#define LLVM_CLANG_ANALYSIS_FLOWSENSITIVE_CACHED_CONST_ACCESSORS_LATTICE_H
+
+#include "clang/AST/Expr.h"
+#include "clang/Analysis/FlowSensitive/DataflowEnvironment.h"
+#include "clang/Analysis/FlowSensitive/DataflowLattice.h"
+#include "clang/Analysis/FlowSensitive/StorageLocation.h"
+#include "clang/Analysis/FlowSensitive/Value.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/STLFunctionalExtras.h"
+
+namespace clang {
+namespace dataflow {
+
+/// A mixin for a lattice that additionally maintains a cache of stable method
+/// call return values to model const accessor methods. When a non-const
+/// method is called, the cache should be cleared, causing the next call to a
+/// const method to be considered a different value. NOTE: The user is
+/// responsible for clearing the cache.
+///
+/// For example:
+///
+/// class Bar {
+/// public:
+///   const std::optional<Foo>& getFoo() const;
+///   void clear();
+/// };
+///
+/// void func(Bar& s) {
+///   if (s.getFoo().has_value()) {
+///     use(s.getFoo().value()); // safe (checked earlier getFoo())
+///     s.clear();
+///     use(s.getFoo().value()); // unsafe (invalidated cache for s)
+///   }
+/// }
+template <typename Base>
+class CachedConstAccessorsLattice : public Base {
+public:
+  using Base::Base; // inherit all constructors
+
+  /// Creates or returns a previously created `Value` associated with a const
+  /// method call `obj.getFoo()` where `RecordLoc` is the
+  /// `RecordStorageLocation` of `obj`.
+  /// Returns nullptr if unable to find or create a value.
+  ///
+  /// Requirements:
+  ///
+  /// - `CE` should return a value (not a reference or record type)
+  Value *
+  getOrCreateConstMethodReturnValue(const RecordStorageLocation &RecordLoc,
+                                    const CallExpr *CE, Environment &Env);
+
+  /// Creates or returns a previously created `StorageLocation` associated with
+  /// a const method call `obj.getFoo()` where `RecordLoc` is the
+  /// `RecordStorageLocation` of `obj`.
+  ///
+  /// The callback `Initialize` runs on the storage location if newly created.
+  /// Returns nullptr if unable to find or create a value.
+  ///
+  /// Requirements:
+  ///
+  /// - `CE` should return a location (GLValue or a record type).
+  StorageLocation *getOrCreateConstMethodReturnStorageLocation(
+      const RecordStorageLocation &RecordLoc, const CallExpr *CE,
+      Environment &Env, llvm::function_ref<void(StorageLocation &)> Initialize);
+
+  void clearConstMethodReturnValues(const RecordStorageLocation &RecordLoc) {
+    ConstMethodReturnValues.erase(&RecordLoc);
+  }
+
+  void clearConstMethodReturnStorageLocations(
+      const RecordStorageLocation &RecordLoc) {
+    ConstMethodReturnStorageLocations.erase(&RecordLoc);
+  }
+
+  bool operator==(const CachedConstAccessorsLattice &Other) const {
+    return Base::operator==(Other);
+  }
+
+  LatticeJoinEffect join(const CachedConstAccessorsLattice &Other);
+
+private:
+  // Maps a record storage location and const method to the value to return
+  // from that const method.
+  using ConstMethodReturnValuesType =
+      llvm::SmallDenseMap<const RecordStorageLocation *,
+                          llvm::SmallDenseMap<const FunctionDecl *, Value *>>;
+  ConstMethodReturnValuesType ConstMethodReturnValues;
+
+  // Maps a record storage location and const method to the record storage
+  // location to return from that const method.
+  using ConstMethodReturnStorageLocationsType = llvm::SmallDenseMap<
+      const RecordStorageLocation *,
+      llvm::SmallDenseMap<const FunctionDecl *, StorageLocation *>>;
+  ConstMethodReturnStorageLocationsType ConstMethodReturnStorageLocations;
+};
+
+namespace internal {
+
+template <typename T>
+llvm::SmallDenseMap<const RecordStorageLocation *,
+                    llvm::SmallDenseMap<const FunctionDecl *, T *>>
+joinConstMethodMap(
+    const llvm::SmallDenseMap<const RecordStorageLocation *,
+                              llvm::SmallDenseMap<const FunctionDecl *, T *>>
+        &Map1,
+    const llvm::SmallDenseMap<const RecordStorageLocation *,
+                              llvm::SmallDenseMap<const FunctionDecl *, T *>>
+        &Map2,
+    LatticeEffect &Effect) {
+  llvm::SmallDenseMap<const RecordStorageLocation *,
+                      llvm::SmallDenseMap<const FunctionDecl *, T *>>
+      Result;
+  for (auto &[Loc, DeclToT] : Map1) {
+    auto It = Map2.find(Loc);
+    if (It == Map2.end()) {
+      Effect = LatticeJoinEffect::Changed;
+      continue;
+    }
+    const auto &OtherDeclToT = It->second;
+    auto &JoinedDeclToT = Result[Loc];
+    for (auto [Func, Var] : DeclToT) {
+      T *OtherVar = OtherDeclToT.lookup(Func);
+      if (OtherVar == nullptr || OtherVar != Var) {
+        Effect = LatticeJoinEffect::Changed;
+        continue;
+      }
+      JoinedDeclToT.insert({Func, Var});
+    }
+  }
+  return Result;
+}
+
+} // namespace internal
+
+template <typename Base>
+LatticeEffect CachedConstAccessorsLattice<Base>::join(
+    const CachedConstAccessorsLattice<Base> &Other) {
+
+  LatticeEffect Effect = Base::join(Other);
+
+  // For simplicity, we only retain values that are identical, but not ones
+  // that are non-identical but equivalent. This is likely to be sufficient in
+  // practice, and it reduces implementation complexity considerably.
+
+  ConstMethodReturnValues = internal::joinConstMethodMap<Value>(
+      ConstMethodReturnValues, Other.ConstMethodReturnValues, Effect);
+
+  ConstMethodReturnStorageLocations =
+      internal::joinConstMethodMap<StorageLocation>(
+          ConstMethodReturnStorageLocations,
+          Other.ConstMethodReturnStorageLocations, Effect);
+
+  return Effect;
+}
+
+template <typename Base>
+Value *CachedConstAccessorsLattice<Base>::getOrCreateConstMethodReturnValue(
+    const RecordStorageLocation &RecordLoc, const CallExpr *CE,
+    Environment &Env) {
+  QualType Type = CE->getType();
+  assert(!Type.isNull());
+  assert(!Type->isReferenceType());
+  assert(!Type->isRecordType());
+
+  auto &ObjMap = ConstMethodReturnValues[&RecordLoc];
+  const FunctionDecl *DirectCallee = CE->getDirectCallee();
+  if (DirectCallee == nullptr)
+    return nullptr;
+  auto it = ObjMap.find(DirectCallee);
+  if (it != ObjMap.end())
+    return it->second;
+
+  Value *Val = Env.createValue(Type);
+  if (Val != nullptr)
+    ObjMap.insert({DirectCallee, Val});
+  return Val;
+}
+
+template <typename Base>
+StorageLocation *
+CachedConstAccessorsLattice<Base>::getOrCreateConstMethodReturnStorageLocation(
+    const RecordStorageLocation &RecordLoc, const CallExpr *CE,
+    Environment &Env, llvm::function_ref<void(StorageLocation &)> Initialize) {
+  assert(!CE->getType().isNull());
+  assert(CE->isGLValue() || CE->getType()->isRecordType());
+  auto &ObjMap = ConstMethodReturnStorageLocations[&RecordLoc];
+  const FunctionDecl *DirectCallee = CE->getDirectCallee();
+  if (DirectCallee == nullptr)
+    return nullptr;
+  auto it = ObjMap.find(DirectCallee);
+  if (it != ObjMap.end())
+    return it->second;
+
+  StorageLocation &Loc =
+      Env.createStorageLocation(CE->getType().getNonReferenceType());
+  Initialize(Loc);
+
+  ObjMap.insert({DirectCallee, &Loc});
+  return &Loc;
+}
+
+} // namespace dataflow
+} // namespace clang
+
+#endif // LLVM_CLANG_ANALYSIS_FLOWSENSITIVE_CACHED_CONST_ACCESSORS_LATTICE_H
diff --git a/clang/include/clang/Basic/AArch64SVEACLETypes.def b/clang/include/clang/Basic/AArch64SVEACLETypes.def
index
55ed9c36f6c5cd3ae14646564225f25d77f08288..72df1e35aaec20ccd142406871d10f0415f40da5 100644 --- a/clang/include/clang/Basic/AArch64SVEACLETypes.def +++ b/clang/include/clang/Basic/AArch64SVEACLETypes.def @@ -115,6 +115,9 @@ SVE_VECTOR_TYPE_FLOAT("__SVFloat64_t", "__SVFloat64_t", SveFloat64, SveFloat64Ty SVE_VECTOR_TYPE_BFLOAT("__SVBfloat16_t", "__SVBfloat16_t", SveBFloat16, SveBFloat16Ty, 8, 16, 1) +// This is an 8-bit opaque type. +SVE_VECTOR_TYPE_INT("__SVMfloat8_t", "__SVMfloat8_t", SveMFloat8, SveMFloat8Ty, 16, 8, 1, false) + // // x2 // diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td index 011635d1425e6e10107cde9c5f817f2a05a00792..3d0ee01870bd67696f99288b21a9a823e176132a 100644 --- a/clang/include/clang/Basic/Attr.td +++ b/clang/include/clang/Basic/Attr.td @@ -4593,6 +4593,31 @@ def HLSLResourceBinding: InheritableAttr { let LangOpts = [HLSL]; let Args = [StringArgument<"Slot">, StringArgument<"Space", 1>]; let Documentation = [HLSLResourceBindingDocs]; + let AdditionalMembers = [{ + public: + enum class RegisterType : unsigned { SRV, UAV, CBuffer, Sampler, C, I }; + + private: + RegisterType RegType; + unsigned SlotNumber; + unsigned SpaceNumber; + + public: + void setBinding(RegisterType RT, unsigned SlotNum, unsigned SpaceNum) { + RegType = RT; + SlotNumber = SlotNum; + SpaceNumber = SpaceNum; + } + RegisterType getRegisterType() const { + return RegType; + } + unsigned getSlotNumber() const { + return SlotNumber; + } + unsigned getSpaceNumber() const { + return SpaceNumber; + } + }]; } def HLSLPackOffset: HLSLAnnotationAttr { diff --git a/clang/include/clang/Basic/Builtins.td b/clang/include/clang/Basic/Builtins.td index bda8a48be92bda81fe596f6410f651e9b8733947..382fb6b7a3c031a75ca09ada21c984a910b53e46 100644 --- a/clang/include/clang/Basic/Builtins.td +++ b/clang/include/clang/Basic/Builtins.td @@ -4761,6 +4761,12 @@ def HLSLWaveIsFirstLane : LangBuiltin<"HLSL_LANG"> { let Prototype = "bool()"; } +def HLSLWaveReadLaneAt : LangBuiltin<"HLSL_LANG"> { + let Spellings = ["__builtin_hlsl_wave_read_lane_at"]; + let Attributes = [NoThrow, Const]; + let Prototype = "void(...)"; +} + def HLSLClamp : LangBuiltin<"HLSL_LANG"> { let Spellings = ["__builtin_hlsl_elementwise_clamp"]; let Attributes = [NoThrow, Const]; diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index d084603e54612467ac18eeaff0cd5e4f1ffc83a5..1fb70cf9482cc27b4c43afc93ed227c5c0cee07e 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -2182,10 +2182,10 @@ def err_covariant_return_incomplete : Error< def err_covariant_return_type_different_qualifications : Error< "return type of virtual function %0 is not covariant with the return type of " "the function it overrides (%1 has different qualifiers than %2)">; -def err_covariant_return_type_class_type_more_qualified : Error< "return type of virtual function %0 is not covariant with the return type of " - "the function it overrides (class type %1 is more qualified than class " - "type %2">; +def err_covariant_return_type_class_type_not_same_or_less_qualified : Error< "return type of virtual function %0 is not covariant with the return type of " + "the function it overrides (class type %1 does not have the same " + "cv-qualification as or less cv-qualification than class type %2)">; // C++ implicit special member functions def note_in_declaration_of_implicit_special_member : Note< @@ -9230,6 +9230,8 @@ def err_typecheck_cond_incompatible_operands : Error< def
err_typecheck_expect_scalar_or_vector : Error< "invalid operand of type %0 where %1 or " "a vector of such type is required">; +def err_typecheck_expect_any_scalar_or_vector : Error< + "invalid operand of type %0 where a scalar or vector is required">; def err_typecheck_expect_flt_or_vector : Error< "invalid operand of type %0 where floating, complex or " "a vector of such types is required">; @@ -10124,6 +10126,8 @@ def note_lambda_capture_initializer : Note< " via initialization of lambda capture %0}1">; def note_init_with_default_member_initializer : Note< "initializing field %0 with default member initializer">; +def note_init_with_default_argument : Note< + "initializing parameter %0 with default argument">; // Check for initializing a member variable with the address or a reference to // a constructor parameter. @@ -12521,7 +12525,7 @@ def warn_unsafe_buffer_variable : Warning< InGroup<UnsafeBufferUsage>, DefaultIgnore; def warn_unsafe_buffer_operation : Warning< "%select{unsafe pointer operation|unsafe pointer arithmetic|" - "unsafe buffer access|function introduces unsafe buffer manipulation|unsafe invocation of span::data|" + "unsafe buffer access|function introduces unsafe buffer manipulation|unsafe invocation of %1|" "field %1 prone to unsafe buffer manipulation}0">, InGroup<UnsafeBufferUsage>, DefaultIgnore; def warn_unsafe_buffer_libc_call : Warning< diff --git a/clang/include/clang/Basic/LangOptions.h b/clang/include/clang/Basic/LangOptions.h index e2d4206c72cc96548efe9af2d3081b152deba5be..949c8f5d448bcf890f40f82fe75d099d15dca975 100644 --- a/clang/include/clang/Basic/LangOptions.h +++ b/clang/include/clang/Basic/LangOptions.h @@ -698,6 +698,14 @@ public: return ConvergentFunctions; } + /// Return true if atomicrmw operations targeting allocations in private + /// memory are undefined. + bool threadPrivateMemoryAtomicsAreUndefined() const { + // Should be false for OpenMP. + // TODO: Should this be true for SYCL? + return OpenCL || CUDA; + } + /// Return the OpenCL C or C++ version as a VersionTuple.
VersionTuple getOpenCLVersionTuple() const; diff --git a/clang/include/clang/Basic/TypeNodes.td b/clang/include/clang/Basic/TypeNodes.td index 8cca392cddc174ecc7d1a75d1c49ab4f6b4ce0a9..7e550ca2992f35e732103d3e967e1599f38104d6 100644 --- a/clang/include/clang/Basic/TypeNodes.td +++ b/clang/include/clang/Basic/TypeNodes.td @@ -93,7 +93,7 @@ def EnumType : TypeNode<TagType>, LeafType; def ElaboratedType : TypeNode<Type>, NeverCanonical; def AttributedType : TypeNode<Type>, NeverCanonical; def BTFTagAttributedType : TypeNode<Type>, NeverCanonical; -def HLSLAttributedResourceType : TypeNode<Type>, NeverCanonical; +def HLSLAttributedResourceType : TypeNode<Type>; def TemplateTypeParmType : TypeNode<Type>, AlwaysDependent, LeafType; def SubstTemplateTypeParmType : TypeNode<Type>, NeverCanonical; def SubstTemplateTypeParmPackType : TypeNode<Type>, AlwaysDependent; diff --git a/clang/include/clang/Basic/arm_neon.td b/clang/include/clang/Basic/arm_neon.td index 8652b5e3a9c90114a7604e1d98d1016f6be7c1f2..ec829f566ef5fc84384c26271b35fec3b78e6743 100644 --- a/clang/include/clang/Basic/arm_neon.td +++ b/clang/include/clang/Basic/arm_neon.td @@ -1968,13 +1968,16 @@ let TargetGuard = "v8.3a,neon" in { def VCADDQ_ROT90 : SInst<"vcaddq_rot90", "QQQ", "f">; def VCADDQ_ROT270 : SInst<"vcaddq_rot270", "QQQ", "f">; - defm VCMLA_F32 : VCMLA_ROTS<"f", "uint64x1_t", "uint64x2_t">; + defm VCMLA_F32 : VCMLA_ROTS<"f", "uint64x1_t", "uint64x2_t">; } let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)", TargetGuard = "v8.3a,neon" in { def VCADDQ_ROT90_FP64 : SInst<"vcaddq_rot90", "QQQ", "d">; def VCADDQ_ROT270_FP64 : SInst<"vcaddq_rot270", "QQQ", "d">; - defm VCMLA_FP64 : VCMLA_ROTS<"d", "uint64x2_t", "uint64x2_t">; + def VCMLAQ_FP64 : SInst<"vcmlaq", "QQQQ", "d">; + def VCMLAQ_ROT90_FP64 : SInst<"vcmlaq_rot90", "QQQQ", "d">; + def VCMLAQ_ROT180_FP64 : SInst<"vcmlaq_rot180", "QQQQ", "d">; + def VCMLAQ_ROT270_FP64 : SInst<"vcmlaq_rot270", "QQQQ", "d">; } // V8.2-A BFloat intrinsics diff --git a/clang/include/clang/Basic/arm_sve_sme_incl.td b/clang/include/clang/Basic/arm_sve_sme_incl.td index fdf4ba55fe9382dfdaecdcbf511d17d42b60f9aa..50911fb63e818e5f83f7f8b6238c2ea2c4786be0 100644 --- a/clang/include/clang/Basic/arm_sve_sme_incl.td +++ b/clang/include/clang/Basic/arm_sve_sme_incl.td @@ -162,6 +162,7 @@ def EltTyBool16 : EltType<10>; def EltTyBool32 : EltType<11>; def EltTyBool64 : EltType<12>; def EltTyBFloat16 : EltType<13>; +def EltTyMFloat8 : EltType<14>; class MemEltType<int val> { int Value = val; } diff --git a/clang/include/clang/CodeGen/CodeGenABITypes.h b/clang/include/clang/CodeGen/CodeGenABITypes.h index 9cbc5a8a2a3f41526da0e6b21528cd90fbcfcfe6..836fdd75477c7644200ad41d651e3cf14bd8b198 100644 --- a/clang/include/clang/CodeGen/CodeGenABITypes.h +++ b/clang/include/clang/CodeGen/CodeGenABITypes.h @@ -75,11 +75,25 @@ const CGFunctionInfo &arrangeCXXMethodType(CodeGenModule &CGM, const FunctionProtoType *FTP, const CXXMethodDecl *MD); -const CGFunctionInfo &arrangeFreeFunctionCall(CodeGenModule &CGM, - CanQualType returnType, - ArrayRef<CanQualType> argTypes, - FunctionType::ExtInfo info, - RequiredArgs args); +const CGFunctionInfo & +arrangeCXXMethodCall(CodeGenModule &CGM, CanQualType returnType, + ArrayRef<CanQualType> argTypes, FunctionType::ExtInfo info, + ArrayRef<FunctionProtoType::ExtParameterInfo> paramInfos, + RequiredArgs args); + +const CGFunctionInfo &arrangeFreeFunctionCall( + CodeGenModule &CGM, CanQualType returnType, ArrayRef<CanQualType> argTypes, + FunctionType::ExtInfo info, + ArrayRef<FunctionProtoType::ExtParameterInfo> paramInfos, + RequiredArgs args); + +// An overload with an empty `paramInfos` +inline const CGFunctionInfo &
+arrangeFreeFunctionCall(CodeGenModule &CGM, CanQualType returnType, + ArrayRef<CanQualType> argTypes, + FunctionType::ExtInfo info, RequiredArgs args) { + return arrangeFreeFunctionCall(CGM, returnType, argTypes, info, {}, args); +} /// Returns the implicit arguments to add to a complete, non-delegating C++ /// constructor call. diff --git a/clang/include/clang/Driver/Distro.h b/clang/include/clang/Driver/Distro.h index 1404e168684821e12ba396a0e1bae95ace34dfcd..b4d485dac8a269dab8f888202b62284773e6c939 100644 --- a/clang/include/clang/Driver/Distro.h +++ b/clang/include/clang/Driver/Distro.h @@ -80,6 +80,7 @@ public: UbuntuMantic, UbuntuNoble, UbuntuOracular, + UbuntuPlucky, UnknownDistro }; @@ -131,7 +132,7 @@ public: } bool IsUbuntu() const { - return DistroVal >= UbuntuHardy && DistroVal <= UbuntuOracular; + return DistroVal >= UbuntuHardy && DistroVal <= UbuntuPlucky; } bool IsAlpineLinux() const { return DistroVal == AlpineLinux; } diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 5c2204a3c4d3180ed0c41e735969da32709a4d76..9f90b39063121f2462d1473e2e5f78509b0e2254 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -5391,9 +5391,18 @@ def mlasx : Flag<["-"], "mlasx">, Group<m_loongarch_Features_Group>, HelpText<"Enable Loongson Advanced SIMD Extension (LASX).">; def mno_lasx : Flag<["-"], "mno-lasx">, Group<m_loongarch_Features_Group>, HelpText<"Disable Loongson Advanced SIMD Extension (LASX).">; +let Flags = [TargetSpecific] in { def msimd_EQ : Joined<["-"], "msimd=">, Group<m_loongarch_Features_Group>, - Flags<[TargetSpecific]>, HelpText<"Select the SIMD extension(s) to be enabled in LoongArch either 'none', 'lsx', 'lasx'.">; +def mfrecipe : Flag<["-"], "mfrecipe">, Group<m_loongarch_Features_Group>, + HelpText<"Enable frecipe.{s/d} and frsqrte.{s/d}">; +def mno_frecipe : Flag<["-"], "mno-frecipe">, Group<m_loongarch_Features_Group>, + HelpText<"Disable frecipe.{s/d} and frsqrte.{s/d}">; +def mannotate_tablejump : Flag<["-"], "mannotate-tablejump">, Group<m_loongarch_Features_Group>, + HelpText<"Enable annotate table jump instruction to correlate it with the jump table.">; +def mno_annotate_tablejump : Flag<["-"], "mno-annotate-tablejump">, Group<m_loongarch_Features_Group>, + HelpText<"Disable annotate table jump instruction to correlate it with the jump table.">; +} // let Flags = [TargetSpecific] def mnop_mcount : Flag<["-"], "mnop-mcount">, HelpText<"Generate mcount/__fentry__ calls as nops.
To activate they need to be patched in.">, Visibility<[ClangOption, CC1Option]>, Group<m_Group>, MarshallingInfoFlag<CodeGenOpts<"MNopMCount">>; @@ -8536,7 +8545,7 @@ def _SLASH_execution_charset : CLCompileJoined<"execution-charset:">, HelpText<"Set runtime encoding, supports only UTF-8">, Alias<fexec_charset_EQ>; def _SLASH_std : CLCompileJoined<"std:">, - HelpText<"Set language version (c++14,c++17,c++20,c++latest,c11,c17)">; + HelpText<"Set language version (c++14,c++17,c++20,c++23preview,c++latest,c11,c17)">; def _SLASH_U : CLJoinedOrSeparate<"U">, HelpText<"Undefine macro">, MetaVarName<"<macro>">, Alias<U>; def _SLASH_validate_charset : CLFlag<"validate-charset">, diff --git a/clang/include/clang/ExtractAPI/API.h b/clang/include/clang/ExtractAPI/API.h index 4f34fcc575e80712fb79513c2ce4f7d33cfc85c1..c30e6fac66d6ba22f55eaa8098d9523dcf61e6c9 100644 --- a/clang/include/clang/ExtractAPI/API.h +++ b/clang/include/clang/ExtractAPI/API.h @@ -26,6 +26,7 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/Support/Allocator.h" #include "llvm/Support/Casting.h" +#include "llvm/Support/Compiler.h" #include "llvm/TargetParser/Triple.h" #include #include @@ -615,7 +616,24 @@ struct TagRecord : APIRecord, RecordContext { return classofKind(Record->getKind()); } static bool classofKind(RecordKind K) { - return K == RK_Struct || K == RK_Union || K == RK_Enum; + switch (K) { + case RK_Enum: + LLVM_FALLTHROUGH; + case RK_Struct: + LLVM_FALLTHROUGH; + case RK_Union: + LLVM_FALLTHROUGH; + case RK_CXXClass: + LLVM_FALLTHROUGH; + case RK_ClassTemplate: + LLVM_FALLTHROUGH; + case RK_ClassTemplateSpecialization: + LLVM_FALLTHROUGH; + case RK_ClassTemplatePartialSpecialization: + return true; + default: + return false; + } } bool IsEmbeddedInVarDeclarator; @@ -684,7 +702,22 @@ struct RecordRecord : TagRecord { return classofKind(Record->getKind()); } static bool classofKind(RecordKind K) { - return K == RK_Struct || K == RK_Union; + switch (K) { + case RK_Struct: + LLVM_FALLTHROUGH; + case RK_Union: + LLVM_FALLTHROUGH; + case RK_CXXClass: + LLVM_FALLTHROUGH; + case RK_ClassTemplate: + LLVM_FALLTHROUGH; + case RK_ClassTemplateSpecialization: + LLVM_FALLTHROUGH; + case RK_ClassTemplatePartialSpecialization: + return true; + default: + return false; + } } bool isAnonymousWithNoTypedef() { return Name.empty(); } diff --git a/clang/include/clang/Format/Format.h b/clang/include/clang/Format/Format.h index a0762b088b68ef3d94e1415b742370a4b2bd5103..debba1c78228398cafdf5ae4cc66dc37f195e026 100644 --- a/clang/include/clang/Format/Format.h +++ b/clang/include/clang/Format/Format.h @@ -3938,6 +3938,29 @@ struct FormatStyle { /// \version 14 bool RemoveBracesLLVM; + /// Remove empty lines within unwrapped lines. + /// \code + /// false: true: + /// + /// int c vs. int c = a + b; + /// + /// = a + b; + /// + /// enum : unsigned vs. enum : unsigned { + /// AA = 0, + /// { BB + /// AA = 0, } myEnum; + /// BB + /// } myEnum; + /// + /// while ( vs. while (true) { + /// } + /// true) { + /// } + /// \endcode + /// \version 20 + bool RemoveEmptyLinesInUnwrappedLines; + /// Types of redundant parentheses to remove. enum RemoveParenthesesStyle : int8_t { /// Do not remove parentheses.
@@ -5232,6 +5255,8 @@ struct FormatStyle { RawStringFormats == R.RawStringFormats && ReferenceAlignment == R.ReferenceAlignment && RemoveBracesLLVM == R.RemoveBracesLLVM && + RemoveEmptyLinesInUnwrappedLines == + R.RemoveEmptyLinesInUnwrappedLines && RemoveParentheses == R.RemoveParentheses && RemoveSemicolon == R.RemoveSemicolon && RequiresClausePosition == R.RequiresClausePosition && diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h index 0faa5aed4eec3bc8bb36d4a6f83694179304f5a9..2c5769f8469e5bbdc61ac9676310c3ca473adb96 100644 --- a/clang/include/clang/Sema/Sema.h +++ b/clang/include/clang/Sema/Sema.h @@ -6755,7 +6755,7 @@ public: ExprResult BuildPredefinedExpr(SourceLocation Loc, PredefinedIdentKind IK); ExprResult ActOnPredefinedExpr(SourceLocation Loc, tok::TokenKind Kind); - ExprResult ActOnIntegerConstant(SourceLocation Loc, uint64_t Val); + ExprResult ActOnIntegerConstant(SourceLocation Loc, int64_t Val); bool CheckLoopHintExpr(Expr *E, SourceLocation Loc, bool AllowZero); diff --git a/clang/include/clang/Sema/SemaHLSL.h b/clang/include/clang/Sema/SemaHLSL.h index fa957abc9791af2c1b041e07907a18b7251a5755..4f1fc9a31404c6278dd2dcfd8104d9cd25a2ef47 100644 --- a/clang/include/clang/Sema/SemaHLSL.h +++ b/clang/include/clang/Sema/SemaHLSL.h @@ -28,6 +28,9 @@ class AttributeCommonInfo; class IdentifierInfo; class ParsedAttr; class Scope; +class VarDecl; + +using llvm::dxil::ResourceClass; // FIXME: This can be hidden (as static function in SemaHLSL.cpp) once we no // longer need to create builtin buffer types in HLSLExternalSemaSource. @@ -35,6 +38,50 @@ bool CreateHLSLAttributedResourceType( Sema &S, QualType Wrapped, ArrayRef<const Attr *> AttrList, QualType &ResType, HLSLAttributedResourceLocInfo *LocInfo = nullptr); +enum class BindingType : uint8_t { NotAssigned, Explicit, Implicit }; + +// DeclBindingInfo struct stores information about required/assigned resource +// binding on a declaration for a specific resource class. +struct DeclBindingInfo { + const VarDecl *Decl; + ResourceClass ResClass; + const HLSLResourceBindingAttr *Attr; + BindingType BindType; + + DeclBindingInfo(const VarDecl *Decl, ResourceClass ResClass, + BindingType BindType = BindingType::NotAssigned, + const HLSLResourceBindingAttr *Attr = nullptr) + : Decl(Decl), ResClass(ResClass), Attr(Attr), BindType(BindType) {} + + void setBindingAttribute(HLSLResourceBindingAttr *A, BindingType BT) { + assert(Attr == nullptr && BindType == BindingType::NotAssigned && + "binding attribute already assigned"); + Attr = A; + BindType = BT; + } +}; + +// ResourceBindings class stores information about all resource bindings +// in a shader. It is used for binding diagnostics and implicit binding +// assignments. +class ResourceBindings { +public: + DeclBindingInfo *addDeclBindingInfo(const VarDecl *VD, + ResourceClass ResClass); + DeclBindingInfo *getDeclBindingInfo(const VarDecl *VD, + ResourceClass ResClass); + bool hasBindingInfoForDecl(const VarDecl *VD) const; + +private: + // List of all resource bindings required by the shader. + // A global declaration can have multiple bindings for different + // resource classes. They are all stored sequentially in this list. + // The DeclToBindingListIndex hashtable maps a declaration to the + // index of the first binding info in the list.
+ llvm::SmallVector<DeclBindingInfo> BindingsList; + llvm::DenseMap<const VarDecl *, unsigned> DeclToBindingListIndex; +}; + class SemaHLSL : public SemaBase { public: SemaHLSL(Sema &S); @@ -55,6 +102,7 @@ public: mergeParamModifierAttr(Decl *D, const AttributeCommonInfo &AL, HLSLParamModifierAttr::Spelling Spelling); void ActOnTopLevelFunction(FunctionDecl *FD); + void ActOnVariableDeclarator(VarDecl *VD); void CheckEntryPoint(FunctionDecl *FD); void CheckSemanticAnnotation(FunctionDecl *EntryPoint, const Decl *Param, const HLSLAnnotationAttr *AnnotationAttr); @@ -102,6 +150,15 @@ private: llvm::DenseMap<const HLSLAttributedResourceType *, HLSLAttributedResourceLocInfo> LocsForHLSLAttributedResources; + + // List of all resource bindings + ResourceBindings Bindings; + +private: + void collectResourcesOnVarDecl(VarDecl *D); + void collectResourcesOnUserRecordDecl(const VarDecl *VD, + const RecordType *RT); + void processExplicitBindingsOnDecl(VarDecl *D); }; } // namespace clang diff --git a/clang/include/clang/Serialization/ASTBitCodes.h b/clang/include/clang/Serialization/ASTBitCodes.h index 202bc4d237c8fca7f08994297d97b54a047d5a1c..e665e5144df6157ee5e9e04a43191913bb974bbe 100644 --- a/clang/include/clang/Serialization/ASTBitCodes.h +++ b/clang/include/clang/Serialization/ASTBitCodes.h @@ -1149,7 +1149,7 @@ enum PredefinedTypeIDs { /// /// Type IDs for non-predefined types will start at /// NUM_PREDEF_TYPE_IDs. -const unsigned NUM_PREDEF_TYPE_IDS = 505; +const unsigned NUM_PREDEF_TYPE_IDS = 506; // Ensure we do not overrun the predefined types we reserved // in the enum PredefinedTypeIDs above. diff --git a/clang/include/clang/StaticAnalyzer/Core/AnalyzerOptions.def b/clang/include/clang/StaticAnalyzer/Core/AnalyzerOptions.def index 737bc8e86cfb6aaaf085b1bb4903b9d9f6f23d9b..ad2dbffe88326d41ac2868734903eaad9bcf438e 100644 --- a/clang/include/clang/StaticAnalyzer/Core/AnalyzerOptions.def +++ b/clang/include/clang/StaticAnalyzer/Core/AnalyzerOptions.def @@ -299,13 +299,12 @@ ANALYZER_OPTION( ANALYZER_OPTION( bool, ShouldEagerlyAssume, "eagerly-assume", - "Whether we should eagerly assume evaluations of conditionals, thus, " - "bifurcating the path. This indicates how the engine should handle " - "expressions such as: 'x = (y != 0)'. When this is true then the " - "subexpression 'y != 0' will be eagerly assumed to be true or false, thus " - "evaluating it to the integers 0 or 1 respectively. The upside is that " - "this can increase analysis precision until we have a better way to lazily " - "evaluate such logic. The downside is that it eagerly bifurcates paths.", + "If this is enabled (the default behavior), when the analyzer encounters " + "a comparison operator or logical negation, it immediately splits the " + "state to separate the case when the expression is true and the case when " + "it's false.
The upside is that this can increase analysis precision until " + "we have a better way to lazily evaluate such logic; the downside is that " + "it eagerly bifurcates paths.", true) ANALYZER_OPTION( diff --git a/clang/include/clang/StaticAnalyzer/Core/AnalyzerOptions.h b/clang/include/clang/StaticAnalyzer/Core/AnalyzerOptions.h index 3a3c1a13d67dd554bc39684ceeb8b6898073c5c5..2f4cd277cccdc6cf2c7c3598d55f95e4b5fe3bf3 100644 --- a/clang/include/clang/StaticAnalyzer/Core/AnalyzerOptions.h +++ b/clang/include/clang/StaticAnalyzer/Core/AnalyzerOptions.h @@ -229,8 +229,6 @@ public: unsigned AnalyzerDisplayProgress : 1; unsigned AnalyzerNoteAnalysisEntryPoints : 1; - unsigned eagerlyAssumeBinOpBifurcation : 1; - unsigned TrimGraph : 1; unsigned visualizeExplodedGraphWithGraphViz : 1; unsigned UnoptimizedCFG : 1; @@ -293,9 +291,9 @@ public: ShowConfigOptionsList(false), ShouldEmitErrorsOnInvalidConfigValue(false), AnalyzeAll(false), AnalyzerDisplayProgress(false), AnalyzerNoteAnalysisEntryPoints(false), - eagerlyAssumeBinOpBifurcation(false), TrimGraph(false), - visualizeExplodedGraphWithGraphViz(false), UnoptimizedCFG(false), - PrintStats(false), NoRetryExhausted(false), AnalyzerWerror(false) {} + TrimGraph(false), visualizeExplodedGraphWithGraphViz(false), + UnoptimizedCFG(false), PrintStats(false), NoRetryExhausted(false), + AnalyzerWerror(false) {} /// Interprets an option's string value as a boolean. The "true" string is /// interpreted as true and the "false" string is interpreted as false. diff --git a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/ExprEngine.h b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/ExprEngine.h index 04eacd1df7ffe23f2736a2c610821781cbe67421..8c7493e27fcaa633d4dad6c13bed34bc7cb732bd 100644 --- a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/ExprEngine.h +++ b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/ExprEngine.h @@ -583,14 +583,13 @@ public: ExplodedNode *Pred, ExplodedNodeSet &Dst); - /// evalEagerlyAssumeBinOpBifurcation - Given the nodes in 'Src', eagerly assume symbolic - /// expressions of the form 'x != 0' and generate new nodes (stored in Dst) - /// with those assumptions. - void evalEagerlyAssumeBinOpBifurcation(ExplodedNodeSet &Dst, ExplodedNodeSet &Src, - const Expr *Ex); + /// evalEagerlyAssumeBifurcation - Given the nodes in 'Src', eagerly assume + /// concrete boolean values for 'Ex', storing the resulting nodes in 'Dst'. 
+ void evalEagerlyAssumeBifurcation(ExplodedNodeSet &Dst, ExplodedNodeSet &Src, + const Expr *Ex); static std::pair<const ProgramPointTag *, const ProgramPointTag *> - geteagerlyAssumeBinOpBifurcationTags(); + getEagerlyAssumeBifurcationTags(); ProgramStateRef handleLValueBitCast(ProgramStateRef state, const Expr *Ex, const LocationContext *LCtx, QualType T, diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp index b9f08bf8e6c5dd94c1b189e050c0dcd21471ede0..a3d77f501ffb0f983f23f5bf86544a7993ad8e58 100644 --- a/clang/lib/AST/ASTContext.cpp +++ b/clang/lib/AST/ASTContext.cpp @@ -3447,6 +3447,9 @@ static void encodeTypeForFunctionPointerAuth(const ASTContext &Ctx, OS << II->getLength() << II->getName(); return; } + case Type::HLSLAttributedResource: + llvm_unreachable("should never get here"); + break; case Type::DeducedTemplateSpecialization: case Type::Auto: #define NON_CANONICAL_TYPE(Class, Base) case Type::Class: @@ -4118,6 +4121,7 @@ QualType ASTContext::getVariableArrayDecayedType(QualType type) const { case Type::BitInt: case Type::DependentBitInt: case Type::ArrayParameter: + case Type::HLSLAttributedResource: llvm_unreachable("type should never be variably-modified"); // These types can be variably-modified but should never need to @@ -5293,9 +5297,8 @@ QualType ASTContext::getHLSLAttributedResourceType( if (Ty) return QualType(Ty, 0); - QualType Canon = getCanonicalType(Wrapped); Ty = new (*this, alignof(HLSLAttributedResourceType)) - HLSLAttributedResourceType(Canon, Wrapped, Contained, Attrs); + HLSLAttributedResourceType(Wrapped, Contained, Attrs); Types.push_back(Ty); HLSLAttributedResourceTypes.InsertNode(Ty, InsertPos); @@ -9166,6 +9169,9 @@ void ASTContext::getObjCEncodingForTypeImpl(QualType T, std::string &S, case Type::DeducedTemplateSpecialization: return; + case Type::HLSLAttributedResource: + llvm_unreachable("unexpected type"); + case Type::ArrayParameter: case Type::Pipe: #define ABSTRACT_TYPE(KIND, BASE) @@ -11593,6 +11599,20 @@ QualType ASTContext::mergeTypes(QualType LHS, QualType RHS, bool OfBlockPointer, return {}; return LHS; } + case Type::HLSLAttributedResource: { + const HLSLAttributedResourceType *LHSTy = + LHS->castAs<HLSLAttributedResourceType>(); + const HLSLAttributedResourceType *RHSTy = + RHS->castAs<HLSLAttributedResourceType>(); + assert(LHSTy->getWrappedType() == RHSTy->getWrappedType() && + LHSTy->getWrappedType()->isHLSLResourceType() && + "HLSLAttributedResourceType should always wrap __hlsl_resource_t"); + + if (LHSTy->getAttrs() == RHSTy->getAttrs() && + LHSTy->getContainedType() == RHSTy->getContainedType()) + return LHS; + return {}; + } } llvm_unreachable("Invalid Type::Class!"); @@ -13456,6 +13476,7 @@ static QualType getCommonNonSugarTypeNode(ASTContext &Ctx, const Type *X, SUGAR_FREE_TYPE(Record) SUGAR_FREE_TYPE(SubstTemplateTypeParmPack) SUGAR_FREE_TYPE(UnresolvedUsing) + SUGAR_FREE_TYPE(HLSLAttributedResource) #undef SUGAR_FREE_TYPE #define NON_UNIQUE_TYPE(Class) UNEXPECTED_TYPE(Class, "non-unique") NON_UNIQUE_TYPE(TypeOfExpr) diff --git a/clang/lib/AST/ASTStructuralEquivalence.cpp b/clang/lib/AST/ASTStructuralEquivalence.cpp index 21f0562f9d72ae408e77237747ab1dc387fc74b0..120ddc0f26c0d7951b23aa41ff936daeafb760f6 100644 --- a/clang/lib/AST/ASTStructuralEquivalence.cpp +++ b/clang/lib/AST/ASTStructuralEquivalence.cpp @@ -802,16 +802,6 @@ static bool IsEquivalentExceptionSpec(StructuralEquivalenceContext &Context, return true; } -// Determine structural equivalence of two instances of -// HLSLAttributedResourceType::Attributes -static bool -IsStructurallyEquivalent(StructuralEquivalenceContext &Context, - const
HLSLAttributedResourceType::Attributes &Attrs1, - const HLSLAttributedResourceType::Attributes &Attrs2) { - return std::tie(Attrs1.ResourceClass, Attrs1.IsROV, Attrs1.RawBuffer) == - std::tie(Attrs2.ResourceClass, Attrs2.IsROV, Attrs2.RawBuffer); -} - /// Determine structural equivalence of two types. static bool IsStructurallyEquivalent(StructuralEquivalenceContext &Context, QualType T1, QualType T2) { @@ -1115,9 +1105,8 @@ static bool IsStructurallyEquivalent(StructuralEquivalenceContext &Context, Context, cast<HLSLAttributedResourceType>(T1)->getContainedType(), cast<HLSLAttributedResourceType>(T2)->getContainedType())) return false; - if (!IsStructurallyEquivalent( - Context, cast<HLSLAttributedResourceType>(T1)->getAttrs(), - cast<HLSLAttributedResourceType>(T2)->getAttrs())) + if (cast<HLSLAttributedResourceType>(T1)->getAttrs() != + cast<HLSLAttributedResourceType>(T2)->getAttrs()) return false; break; diff --git a/clang/lib/AST/ByteCode/Integral.h b/clang/lib/AST/ByteCode/Integral.h index e06ec1669259da46032a5e767e832cb390d71654..be537d22d5af1b5588d6f60b48685a8423b31408 100644 --- a/clang/lib/AST/ByteCode/Integral.h +++ b/clang/lib/AST/ByteCode/Integral.h @@ -122,11 +122,14 @@ public: APSInt toAPSInt() const { return APSInt(APInt(Bits, static_cast<uint64_t>(V), Signed), !Signed); } - APSInt toAPSInt(unsigned NumBits) const { + APSInt toAPSInt(unsigned BitWidth) const { return APSInt(toAPInt(BitWidth)); } + APInt toAPInt(unsigned BitWidth) const { if constexpr (Signed) - return APSInt(toAPSInt().sextOrTrunc(NumBits), !Signed); + return APInt(Bits, static_cast<uint64_t>(V), Signed) + .sextOrTrunc(BitWidth); else - return APSInt(toAPSInt().zextOrTrunc(NumBits), !Signed); + return APInt(Bits, static_cast<uint64_t>(V), Signed) + .zextOrTrunc(BitWidth); } APValue toAPValue(const ASTContext &) const { return APValue(toAPSInt()); } diff --git a/clang/lib/AST/ByteCode/IntegralAP.h b/clang/lib/AST/ByteCode/IntegralAP.h index a4d656433344b79b8b9a12a42c0e5700b0329dd3..f8aeaaca398fe87911665d549efc3675410bded4 100644 --- a/clang/lib/AST/ByteCode/IntegralAP.h +++ b/clang/lib/AST/ByteCode/IntegralAP.h @@ -61,7 +61,7 @@ public: IntegralAP(APInt V) : V(V) {} /// Arbitrary value for uninitialized variables. - IntegralAP() : IntegralAP(-1, 3) {} + IntegralAP() : IntegralAP(Signed ?
-1 : 7, 3) {} IntegralAP operator-() const { return IntegralAP(-V); } IntegralAP operator-(const IntegralAP &Other) const { @@ -112,9 +112,7 @@ public: template <unsigned Bits, bool InputSigned> static IntegralAP from(Integral<Bits, InputSigned> I, unsigned BitWidth) { - APInt Copy = APInt(BitWidth, static_cast<uint64_t>(I), InputSigned); - - return IntegralAP(Copy); + return IntegralAP(I.toAPInt(BitWidth)); } static IntegralAP zero(int32_t BitWidth) { diff --git a/clang/lib/AST/ByteCode/Interp.cpp b/clang/lib/AST/ByteCode/Interp.cpp index 40137de19c4e1b2d81c3d26c9f512985969d3dbf..fdc4b38b8aa6dcda53805bd02dd0140fababb22a 100644 --- a/clang/lib/AST/ByteCode/Interp.cpp +++ b/clang/lib/AST/ByteCode/Interp.cpp @@ -1040,7 +1040,6 @@ bool Free(InterpState &S, CodePtr OpPC, bool DeleteIsArrayForm, return nullptr; }; - AllocType->dump(); if (const FunctionDecl *VirtualDelete = getVirtualOperatorDelete(AllocType); VirtualDelete && diff --git a/clang/lib/AST/ByteCode/Interp.h b/clang/lib/AST/ByteCode/Interp.h index dece95971b761707d863eb202e70ea6f1881f902..f034bde309035f035ad894480e752b171e20d137 100644 --- a/clang/lib/AST/ByteCode/Interp.h +++ b/clang/lib/AST/ByteCode/Interp.h @@ -1841,6 +1841,7 @@ bool Init(InterpState &S, CodePtr OpPC) { assert(false); return false; } + Ptr.activate(); Ptr.initialize(); new (&Ptr.deref<T>()) T(Value); return true; @@ -1852,6 +1853,7 @@ bool InitPop(InterpState &S, CodePtr OpPC) { const Pointer &Ptr = S.Stk.pop<Pointer>(); if (!CheckInit(S, OpPC, Ptr)) return false; + Ptr.activate(); Ptr.initialize(); new (&Ptr.deref<T>()) T(Value); return true; @@ -1863,13 +1865,24 @@ bool InitPop(InterpState &S, CodePtr OpPC) { template <PrimType Name, class T = typename PrimConv<Name>::T> bool InitElem(InterpState &S, CodePtr OpPC, uint32_t Idx) { const T &Value = S.Stk.pop<T>(); - const Pointer &Ptr = S.Stk.peek<Pointer>().atIndex(Idx); + const Pointer &Ptr = S.Stk.peek<Pointer>(); + if (Ptr.isUnknownSizeArray()) return false; - if (!CheckInit(S, OpPC, Ptr)) + + // In the unlikely event that we're initializing the first item of + // a non-array, skip the atIndex(). + if (Idx == 0 && !Ptr.getFieldDesc()->isArray()) { + Ptr.initialize(); + new (&Ptr.deref<T>()) T(Value); + return true; + } + + const Pointer &ElemPtr = Ptr.atIndex(Idx); + if (!CheckInit(S, OpPC, ElemPtr)) return false; - Ptr.initialize(); - new (&Ptr.deref<T>()) T(Value); + ElemPtr.initialize(); + new (&ElemPtr.deref<T>()) T(Value); return true; } @@ -1877,13 +1890,23 @@ bool InitElem(InterpState &S, CodePtr OpPC, uint32_t Idx) { template <PrimType Name, class T = typename PrimConv<Name>::T> bool InitElemPop(InterpState &S, CodePtr OpPC, uint32_t Idx) { const T &Value = S.Stk.pop<T>(); - const Pointer &Ptr = S.Stk.pop<Pointer>().atIndex(Idx); + const Pointer &Ptr = S.Stk.pop<Pointer>(); if (Ptr.isUnknownSizeArray()) return false; - if (!CheckInit(S, OpPC, Ptr)) + + // In the unlikely event that we're initializing the first item of + // a non-array, skip the atIndex(). + if (Idx == 0 && !Ptr.getFieldDesc()->isArray()) { + Ptr.initialize(); + new (&Ptr.deref<T>()) T(Value); + return true; + } + + const Pointer &ElemPtr = Ptr.atIndex(Idx); + if (!CheckInit(S, OpPC, ElemPtr)) return false; - Ptr.initialize(); - new (&Ptr.deref<T>()) T(Value); + ElemPtr.initialize(); + new (&ElemPtr.deref<T>()) T(Value); return true; } diff --git a/clang/lib/AST/DeclCXX.cpp b/clang/lib/AST/DeclCXX.cpp index 407ec14bbc00d554048f3b4880247b37c6308032..08615d4393f5d132a48fb570a12a6bc20d62bbfc 100644 --- a/clang/lib/AST/DeclCXX.cpp +++ b/clang/lib/AST/DeclCXX.cpp @@ -1411,10 +1411,10 @@ void CXXRecordDecl::addedMember(Decl *D) { Ty = Ty->getArrayElementTypeNoTypeQual(); Ty = Ty->getUnqualifiedDesugaredType(); - if (Ty->isBuiltinType()) - data().IsHLSLIntangible |= Ty->isHLSLIntangibleType(); - else if (const RecordType *RT = dyn_cast<RecordType>(Ty)) + if (const RecordType *RT = dyn_cast<RecordType>(Ty)) data().IsHLSLIntangible |= RT->getAsCXXRecordDecl()->isHLSLIntangible(); + else + data().IsHLSLIntangible |= Ty->isHLSLIntangibleType(); } } diff --git a/clang/lib/AST/DeclTemplate.cpp b/clang/lib/AST/DeclTemplate.cpp index d9b67b7bedf5a5d9a800ffbc4752b0d47e877da9..d2d8907b884ec88386a8a602b54446fb96b08a40 100644 --- a/clang/lib/AST/DeclTemplate.cpp +++ b/clang/lib/AST/DeclTemplate.cpp @@ -1185,6 +1185,20 @@ SourceRange ClassTemplatePartialSpecializationDecl::getSourceRange() const { return Range; } +ArrayRef<TemplateArgument> +ClassTemplatePartialSpecializationDecl::getInjectedTemplateArgs() { + TemplateParameterList *Params = getTemplateParameters(); + auto *First = cast<ClassTemplatePartialSpecializationDecl>(getFirstDecl()); + if (!First->InjectedArgs) { + auto &Context = getASTContext(); + SmallVector<TemplateArgument> TemplateArgs; + Context.getInjectedTemplateArgs(Params, TemplateArgs); + First->InjectedArgs = new (Context) TemplateArgument[TemplateArgs.size()]; + std::copy(TemplateArgs.begin(), TemplateArgs.end(), First->InjectedArgs); + } + return llvm::ArrayRef(First->InjectedArgs, Params->size()); +} + //===----------------------------------------------------------------------===// // FriendTemplateDecl Implementation //===----------------------------------------------------------------------===// @@ -1535,6 +1549,20 @@ SourceRange VarTemplatePartialSpecializationDecl::getSourceRange() const { return Range; } +ArrayRef<TemplateArgument> +VarTemplatePartialSpecializationDecl::getInjectedTemplateArgs() { + TemplateParameterList *Params = getTemplateParameters(); + auto *First = cast<VarTemplatePartialSpecializationDecl>(getFirstDecl()); + if (!First->InjectedArgs) { + auto &Context = getASTContext(); + SmallVector<TemplateArgument> TemplateArgs; + Context.getInjectedTemplateArgs(Params, TemplateArgs); + First->InjectedArgs = new (Context) TemplateArgument[TemplateArgs.size()]; + std::copy(TemplateArgs.begin(), TemplateArgs.end(), First->InjectedArgs); + } + return llvm::ArrayRef(First->InjectedArgs, Params->size()); +} + static TemplateParameterList * createMakeIntegerSeqParameterList(const ASTContext &C, DeclContext *DC) { // typename T diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp index 51956c631786b533c7ca87c9a4aa01cafbebdf7e..8544052d5e4924d48a67e18223525d751cab053b 100644 --- a/clang/lib/AST/ExprConstant.cpp +++ b/clang/lib/AST/ExprConstant.cpp @@ -7237,6 +7237,7 @@ class APValueToBufferConverter { case APValue::ComplexInt: case APValue::ComplexFloat: + return visitComplex(Val, Ty, Offset); case APValue::FixedPoint: // FIXME: We should support these.
@@ -7323,6 +7324,31 @@ class APValueToBufferConverter { return true; } + bool visitComplex(const APValue &Val, QualType Ty, CharUnits Offset) { + const ComplexType *ComplexTy = Ty->castAs<ComplexType>(); + QualType EltTy = ComplexTy->getElementType(); + CharUnits EltSizeChars = Info.Ctx.getTypeSizeInChars(EltTy); + bool IsInt = Val.isComplexInt(); + + if (IsInt) { + if (!visitInt(Val.getComplexIntReal(), EltTy, + Offset + (0 * EltSizeChars))) + return false; + if (!visitInt(Val.getComplexIntImag(), EltTy, + Offset + (1 * EltSizeChars))) + return false; + } else { + if (!visitFloat(Val.getComplexFloatReal(), EltTy, + Offset + (0 * EltSizeChars))) + return false; + if (!visitFloat(Val.getComplexFloatImag(), EltTy, + Offset + (1 * EltSizeChars))) + return false; + } + + return true; + } + bool visitVector(const APValue &Val, QualType Ty, CharUnits Offset) { const VectorType *VTy = Ty->castAs<VectorType>(); QualType EltTy = VTy->getElementType(); @@ -7595,6 +7621,23 @@ class BufferToAPValueConverter { return ArrayValue; } + std::optional<APValue> visit(const ComplexType *Ty, CharUnits Offset) { + QualType ElementType = Ty->getElementType(); + CharUnits ElementWidth = Info.Ctx.getTypeSizeInChars(ElementType); + bool IsInt = ElementType->isIntegerType(); + + std::optional<APValue> Values[2]; + for (unsigned I = 0; I != 2; ++I) { + Values[I] = visitType(Ty->getElementType(), Offset + I * ElementWidth); + if (!Values[I]) + return std::nullopt; + } + + if (IsInt) + return APValue(Values[0]->getInt(), Values[1]->getInt()); + return APValue(Values[0]->getFloat(), Values[1]->getFloat()); + } + std::optional<APValue> visit(const VectorType *VTy, CharUnits Offset) { QualType EltTy = VTy->getElementType(); unsigned NElts = VTy->getNumElements(); @@ -12167,6 +12210,7 @@ GCCTypeClass EvaluateBuiltinClassifyType(QualType T, case Type::ObjCInterface: case Type::ObjCObjectPointer: case Type::Pipe: + case Type::HLSLAttributedResource: // Classify all other types that don't fit into the regular // classification the same way.
return GCCTypeClass::None; diff --git a/clang/lib/AST/ItaniumMangle.cpp b/clang/lib/AST/ItaniumMangle.cpp index 777cdca1a0c0d766e4c63572c5b5159ccd3d5043..d3ed35deb2b1d255f063109a99222e253a93b952 100644 --- a/clang/lib/AST/ItaniumMangle.cpp +++ b/clang/lib/AST/ItaniumMangle.cpp @@ -4512,6 +4512,38 @@ void CXXNameMangler::mangleType(const ArrayParameterType *T) { mangleType(cast<ConstantArrayType>(T)); } +void CXXNameMangler::mangleType(const HLSLAttributedResourceType *T) { + llvm::SmallString<64> Str("_Res"); + const HLSLAttributedResourceType::Attributes &Attrs = T->getAttrs(); + // map resource class to HLSL virtual register letter + switch (Attrs.ResourceClass) { + case llvm::dxil::ResourceClass::UAV: + Str += "_u"; + break; + case llvm::dxil::ResourceClass::SRV: + Str += "_t"; + break; + case llvm::dxil::ResourceClass::CBuffer: + Str += "_b"; + break; + case llvm::dxil::ResourceClass::Sampler: + Str += "_s"; + break; + } + if (Attrs.IsROV) + Str += "_ROV"; + if (Attrs.RawBuffer) + Str += "_Raw"; + if (T->hasContainedType()) + Str += "_CT"; + mangleVendorQualifier(Str); + + if (T->hasContainedType()) { + mangleType(T->getContainedType()); + } + mangleType(T->getWrappedType()); +} + void CXXNameMangler::mangleIntegerLiteral(QualType T, const llvm::APSInt &Value) { // <expr-primary> ::= L <type> <value number> E # integer literal diff --git a/clang/lib/AST/MicrosoftMangle.cpp b/clang/lib/AST/MicrosoftMangle.cpp index 4ccf3f76bf0ce2eab0afd13e11229bf1097014d5..3931fcaa3529610ae5b0c63e1c17b919a8328834 100644 --- a/clang/lib/AST/MicrosoftMangle.cpp +++ b/clang/lib/AST/MicrosoftMangle.cpp @@ -3754,6 +3754,11 @@ void MicrosoftCXXNameMangler::mangleType(const DependentBitIntType *T, Error(Range.getBegin(), "DependentBitInt type") << Range; } +void MicrosoftCXXNameMangler::mangleType(const HLSLAttributedResourceType *T, + Qualifiers, SourceRange Range) { + llvm_unreachable("HLSL uses Itanium name mangling"); +} + // ::= | | // // ::= A # private near diff --git a/clang/lib/AST/Type.cpp b/clang/lib/AST/Type.cpp index 6f4958801cfe82b9f5026442cb2682a441e4103b..5232efae4e36300204732c8b9720143b1847368c 100644 --- a/clang/lib/AST/Type.cpp +++ b/clang/lib/AST/Type.cpp @@ -2525,6 +2525,7 @@ bool Type::isSveVLSBuiltinType() const { case BuiltinType::SveBool: case BuiltinType::SveBoolx2: case BuiltinType::SveBoolx4: + case BuiltinType::SveMFloat8: return true; default: return false; @@ -4575,6 +4576,8 @@ static CachedProperties computeCachedProperties(const Type *T) { return Cache::get(cast<AtomicType>(T)->getValueType()); case Type::Pipe: return Cache::get(cast<PipeType>(T)->getElementType()); + case Type::HLSLAttributedResource: + return Cache::get(cast<HLSLAttributedResourceType>(T)->getWrappedType()); } llvm_unreachable("unhandled type class"); @@ -4664,6 +4667,8 @@ LinkageInfo LinkageComputer::computeTypeLinkageInfo(const Type *T) { return computeTypeLinkageInfo(cast<AtomicType>(T)->getValueType()); case Type::Pipe: return computeTypeLinkageInfo(cast<PipeType>(T)->getElementType()); + case Type::HLSLAttributedResource: + llvm_unreachable("not yet implemented"); } llvm_unreachable("unhandled type class"); @@ -4846,6 +4851,7 @@ bool Type::canHaveNullability(bool ResultIfUnknown) const { case Type::BitInt: case Type::DependentBitInt: case Type::ArrayParameter: + case Type::HLSLAttributedResource: return false; } llvm_unreachable("bad type kind!"); @@ -5329,3 +5335,18 @@ std::string FunctionEffectWithCondition::description() const { Result += "(expr)"; return Result; } + +const HLSLAttributedResourceType * +HLSLAttributedResourceType::findHandleTypeOnResource(const Type *RT) { + // If the type RT is an HLSL resource class, the
first field must + // be the resource handle of type HLSLAttributedResourceType + const clang::Type *Ty = RT->getUnqualifiedDesugaredType(); + if (const RecordDecl *RD = Ty->getAsCXXRecordDecl()) { + if (!RD->fields().empty()) { + const auto &FirstFD = RD->fields().begin(); + return dyn_cast( + FirstFD->getType().getTypePtr()); + } + } + return nullptr; +} diff --git a/clang/lib/Analysis/UnsafeBufferUsage.cpp b/clang/lib/Analysis/UnsafeBufferUsage.cpp index 97f1c4f16b8f4cb0f804d34adcf9d11701086149..5e0ec9ecc92ea4332eaffeb6793947bc721d732b 100644 --- a/clang/lib/Analysis/UnsafeBufferUsage.cpp +++ b/clang/lib/Analysis/UnsafeBufferUsage.cpp @@ -1499,8 +1499,11 @@ public: } static Matcher matcher() { - Matcher callExpr = cxxMemberCallExpr( - callee(cxxMethodDecl(hasName("data"), ofClass(hasName("std::span"))))); + + Matcher callExpr = cxxMemberCallExpr(callee( + cxxMethodDecl(hasName("data"), + ofClass(anyOf(hasName("std::span"), hasName("std::array"), + hasName("std::vector")))))); return stmt( explicitCastExpr(anyOf(has(callExpr), has(parenExpr(has(callExpr))))) .bind(OpTag)); diff --git a/clang/lib/Basic/Targets/AArch64.cpp b/clang/lib/Basic/Targets/AArch64.cpp index 61889861c9c803a558d97376ebcf53ebbf78a49a..b96fab978a3fcb989d409ed8b83ea1031ff9a5c2 100644 --- a/clang/lib/Basic/Targets/AArch64.cpp +++ b/clang/lib/Basic/Targets/AArch64.cpp @@ -143,6 +143,8 @@ AArch64TargetInfo::AArch64TargetInfo(const llvm::Triple &Triple, IntMaxType = SignedLong; } + AddrSpaceMap = &ARM64AddrSpaceMap; + // All AArch64 implementations support ARMv8 FP, which makes half a legal type. HasLegalHalfType = true; HalfArgsAndReturns = true; @@ -1533,11 +1535,16 @@ AArch64leTargetInfo::AArch64leTargetInfo(const llvm::Triple &Triple, void AArch64leTargetInfo::setDataLayout() { if (getTriple().isOSBinFormatMachO()) { if(getTriple().isArch32Bit()) - resetDataLayout("e-m:o-p:32:32-i64:64-i128:128-n32:64-S128-Fn32", "_"); + resetDataLayout("e-m:o-p:32:32-p270:32:32-p271:32:32-p272:64:64-i64:64-" + "i128:128-n32:64-S128-Fn32", + "_"); else - resetDataLayout("e-m:o-i64:64-i128:128-n32:64-S128-Fn32", "_"); + resetDataLayout("e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-" + "n32:64-S128-Fn32", + "_"); } else - resetDataLayout("e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32"); + resetDataLayout("e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-" + "i64:64-i128:128-n32:64-S128-Fn32"); } void AArch64leTargetInfo::getTargetDefines(const LangOptions &Opts, @@ -1560,7 +1567,8 @@ void AArch64beTargetInfo::getTargetDefines(const LangOptions &Opts, void AArch64beTargetInfo::setDataLayout() { assert(!getTriple().isOSBinFormatMachO()); - resetDataLayout("E-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32"); + resetDataLayout("E-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-" + "i64:64-i128:128-n32:64-S128-Fn32"); } WindowsARM64TargetInfo::WindowsARM64TargetInfo(const llvm::Triple &Triple, @@ -1583,8 +1591,10 @@ WindowsARM64TargetInfo::WindowsARM64TargetInfo(const llvm::Triple &Triple, void WindowsARM64TargetInfo::setDataLayout() { resetDataLayout(Triple.isOSBinFormatMachO() - ? "e-m:o-i64:64-i128:128-n32:64-S128-Fn32" - : "e-m:w-p:64:64-i32:32-i64:64-i128:128-n32:64-S128-Fn32", + ? "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:" + "128-n32:64-S128-Fn32" + : "e-m:w-p270:32:32-p271:32:32-p272:64:64-p:64:64-i32:32-" + "i64:64-i128:128-n32:64-S128-Fn32", Triple.isOSBinFormatMachO() ? 
"_" : ""); } diff --git a/clang/lib/Basic/Targets/AArch64.h b/clang/lib/Basic/Targets/AArch64.h index 1226ce4d4355c20d735dbac46e35a0cd33120f7f..16a02e102e045d644d2b26e3ecd30663f6b7e317 100644 --- a/clang/lib/Basic/Targets/AArch64.h +++ b/clang/lib/Basic/Targets/AArch64.h @@ -21,6 +21,34 @@ namespace clang { namespace targets { +enum AArch64AddrSpace { ptr32_sptr = 270, ptr32_uptr = 271, ptr64 = 272 }; + +static const unsigned ARM64AddrSpaceMap[] = { + 0, // Default + 0, // opencl_global + 0, // opencl_local + 0, // opencl_constant + 0, // opencl_private + 0, // opencl_generic + 0, // opencl_global_device + 0, // opencl_global_host + 0, // cuda_device + 0, // cuda_constant + 0, // cuda_shared + 0, // sycl_global + 0, // sycl_global_device + 0, // sycl_global_host + 0, // sycl_local + 0, // sycl_private + static_cast(AArch64AddrSpace::ptr32_sptr), + static_cast(AArch64AddrSpace::ptr32_uptr), + static_cast(AArch64AddrSpace::ptr64), + 0, // hlsl_groupshared + // Wasm address space values for this target are dummy values, + // as it is only enabled for Wasm targets. + 20, // wasm_funcref +}; + class LLVM_LIBRARY_VISIBILITY AArch64TargetInfo : public TargetInfo { virtual void setDataLayout() = 0; static const TargetInfo::GCCRegAlias GCCRegAliases[]; @@ -207,6 +235,18 @@ public: bool validateGlobalRegisterVariable(StringRef RegName, unsigned RegSize, bool &HasSizeMismatch) const override; + + uint64_t getPointerWidthV(LangAS AddrSpace) const override { + if (AddrSpace == LangAS::ptr32_sptr || AddrSpace == LangAS::ptr32_uptr) + return 32; + if (AddrSpace == LangAS::ptr64) + return 64; + return PointerWidth; + } + + uint64_t getPointerAlignV(LangAS AddrSpace) const override { + return getPointerWidthV(AddrSpace); + } }; class LLVM_LIBRARY_VISIBILITY AArch64leTargetInfo : public AArch64TargetInfo { diff --git a/clang/lib/Basic/Targets/OSTargets.cpp b/clang/lib/Basic/Targets/OSTargets.cpp index b56e2c7ca9c494aeddb0be59679d5e51fa9e7b5a..88c054150ab224cecaa05bca55a662e9a9f71af1 100644 --- a/clang/lib/Basic/Targets/OSTargets.cpp +++ b/clang/lib/Basic/Targets/OSTargets.cpp @@ -214,9 +214,11 @@ static void addVisualCDefines(const LangOptions &Opts, MacroBuilder &Builder) { Builder.defineMacro("_HAS_CHAR16_T_LANGUAGE_SUPPORT", Twine(1)); if (Opts.isCompatibleWithMSVC(LangOptions::MSVC2015)) { - if (Opts.CPlusPlus23) + if (Opts.CPlusPlus26) // TODO update to the proper value. - Builder.defineMacro("_MSVC_LANG", "202004L"); + Builder.defineMacro("_MSVC_LANG", "202400L"); + else if (Opts.CPlusPlus23) + Builder.defineMacro("_MSVC_LANG", "202302L"); else if (Opts.CPlusPlus20) Builder.defineMacro("_MSVC_LANG", "202002L"); else if (Opts.CPlusPlus17) diff --git a/clang/lib/CodeGen/CGAtomic.cpp b/clang/lib/CodeGen/CGAtomic.cpp index a2a87e012b8b274f6812903d53f361b3e6f17eb6..f8736695acf18772303ea3e218f06ee859e376fe 100644 --- a/clang/lib/CodeGen/CGAtomic.cpp +++ b/clang/lib/CodeGen/CGAtomic.cpp @@ -389,6 +389,7 @@ static void emitAtomicCmpXchg(CodeGenFunction &CGF, AtomicExpr *E, bool IsWeak, Ptr, Expected, Desired, SuccessOrder, FailureOrder, Scope); Pair->setVolatile(E->isVolatile()); Pair->setWeak(IsWeak); + CGF.getTargetHooks().setTargetAtomicMetadata(CGF, *Pair, E); // Cmp holds the result of the compare-exchange operation: true on success, // false on failure. 
@@ -727,7 +728,7 @@ static void EmitAtomicOp(CodeGenFunction &CGF, AtomicExpr *E, Address Dest,
   llvm::Value *LoadVal1 = CGF.Builder.CreateLoad(Val1);
   llvm::AtomicRMWInst *RMWI =
-      CGF.emitAtomicRMWInst(Op, Ptr, LoadVal1, Order, Scope);
+      CGF.emitAtomicRMWInst(Op, Ptr, LoadVal1, Order, Scope, E);
   RMWI->setVolatile(E->isVolatile());
 
   // For __atomic_*_fetch operations, perform the operation again to
@@ -2048,11 +2049,11 @@ std::pair<RValue, llvm::Value *> CodeGenFunction::EmitAtomicCompareExchange(
 llvm::AtomicRMWInst *
 CodeGenFunction::emitAtomicRMWInst(llvm::AtomicRMWInst::BinOp Op, Address Addr,
                                    llvm::Value *Val, llvm::AtomicOrdering Order,
-                                   llvm::SyncScope::ID SSID) {
-
+                                   llvm::SyncScope::ID SSID,
+                                   const AtomicExpr *AE) {
   llvm::AtomicRMWInst *RMW =
       Builder.CreateAtomicRMW(Op, Addr, Val, Order, SSID);
-  getTargetHooks().setTargetAtomicMetadata(*this, *RMW);
+  getTargetHooks().setTargetAtomicMetadata(*this, *RMW, AE);
   return RMW;
 }
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 6b5514df6632e6fb7dcd0c1af9b19b339e34d865..65fdcdd9281ee3d6dcf7ed8b3682de32c3a487b8 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -1015,6 +1015,24 @@ CodeGenFunction::emitFlexibleArrayMemberSize(const Expr *E, unsigned Type,
     // Can't find the field referenced by the "counted_by" attribute.
     return nullptr;
 
+  if (isa<DeclRefExpr>(Base))
+    // The whole struct is specified in the __bdos. The calculation of the
+    // whole size of the structure can be done in two ways:
+    //
+    //   1) sizeof(struct S) + count * sizeof(typeof(fam))
+    //   2) offsetof(struct S, fam) + count * sizeof(typeof(fam))
+    //
+    // The first will add additional padding after the end of the array
+    // allocation, while the second method is more precise, but not quite
+    // expected from programmers. See
+    // https://lore.kernel.org/lkml/ZvV6X5FPBBW7CO1f@archlinux/ for a
+    // discussion of the topic.
+    //
+    // GCC isn't (currently) able to calculate __bdos on a pointer to the whole
+    // structure. Therefore, because of the above issue, we'll choose to match
+    // what GCC does for consistency's sake.
+    return nullptr;
+
   // Build a load of the counted_by field.
   bool IsSigned = CountedByFD->getType()->isSignedIntegerType();
   Value *CountedByInst = EmitLoadOfCountedByField(Base, FAMDecl, CountedByFD);
@@ -1045,32 +1063,9 @@ CodeGenFunction::emitFlexibleArrayMemberSize(const Expr *E, unsigned Type,
   CharUnits Size = Ctx.getTypeSizeInChars(ArrayTy->getElementType());
   llvm::Constant *ElemSize =
       llvm::ConstantInt::get(ResType, Size.getQuantity(), IsSigned);
-  Value *FAMSize =
+  Value *Res =
       Builder.CreateMul(CountedByInst, ElemSize, "", !IsSigned, IsSigned);
-  FAMSize = Builder.CreateIntCast(FAMSize, ResType, IsSigned);
-  Value *Res = FAMSize;
-
-  if (isa<DeclRefExpr>(Base)) {
-    // The whole struct is specificed in the __bdos.
-    const ASTRecordLayout &Layout = Ctx.getASTRecordLayout(OuterRD);
-
-    // Get the offset of the FAM.
-    llvm::Constant *FAMOffset = ConstantInt::get(ResType, Offset, IsSigned);
-    Value *OffsetAndFAMSize =
-        Builder.CreateAdd(FAMOffset, Res, "", !IsSigned, IsSigned);
-
-    // Get the full size of the struct.
-    llvm::Constant *SizeofStruct =
-        ConstantInt::get(ResType, Layout.getSize().getQuantity(), IsSigned);
-
-    // max(sizeof(struct s),
-    //     offsetof(struct s, array) + p->count * sizeof(*p->array))
-    Res = IsSigned
-              ?
Builder.CreateBinaryIntrinsic(llvm::Intrinsic::smax, - OffsetAndFAMSize, SizeofStruct) - : Builder.CreateBinaryIntrinsic(llvm::Intrinsic::umax, - OffsetAndFAMSize, SizeofStruct); - } + Res = Builder.CreateIntCast(Res, ResType, IsSigned); // A negative \p IdxInst or \p CountedByInst means that the index lands // outside of the flexible array member. If that's the case, we want to @@ -1290,9 +1285,8 @@ static llvm::Value *EmitBitTestIntrinsic(CodeGenFunction &CGF, // Bit = BitBaseI8[BitPos >> 3] & (1 << (BitPos & 0x7)) != 0; Value *ByteIndex = CGF.Builder.CreateAShr( BitPos, llvm::ConstantInt::get(BitPos->getType(), 3), "bittest.byteidx"); - Value *BitBaseI8 = CGF.Builder.CreatePointerCast(BitBase, CGF.Int8PtrTy); - Address ByteAddr(CGF.Builder.CreateInBoundsGEP(CGF.Int8Ty, BitBaseI8, - ByteIndex, "bittest.byteaddr"), + Address ByteAddr(CGF.Builder.CreateInBoundsGEP(CGF.Int8Ty, BitBase, ByteIndex, + "bittest.byteaddr"), CGF.Int8Ty, CharUnits::One()); Value *PosLow = CGF.Builder.CreateAnd(CGF.Builder.CreateTrunc(BitPos, CGF.Int8Ty), @@ -5660,14 +5654,14 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, *Arg3 = EmitScalarExpr(E->getArg(3)); llvm::FunctionType *FTy = llvm::FunctionType::get( Int32Ty, llvm::ArrayRef(ArgTys), false); - Value *BCast = Builder.CreatePointerCast(Arg3, I8PTy); + Value *ACast = Builder.CreateAddrSpaceCast(Arg3, I8PTy); // We know the third argument is an integer type, but we may need to cast // it to i32. if (Arg2->getType() != Int32Ty) Arg2 = Builder.CreateZExtOrTrunc(Arg2, Int32Ty); return RValue::get( EmitRuntimeCall(CGM.CreateRuntimeFunction(FTy, Name), - {Arg0, Arg1, Arg2, BCast, PacketSize, PacketAlign})); + {Arg0, Arg1, Arg2, ACast, PacketSize, PacketAlign})); } } // OpenCL v2.0 s6.13.16 ,s9.17.3.5 - Built-in pipe reserve read and write @@ -11319,7 +11313,6 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID, Value *Dst = EmitScalarExpr(E->getArg(0)); Value *Val = EmitScalarExpr(E->getArg(1)); Value *Size = EmitScalarExpr(E->getArg(2)); - Dst = Builder.CreatePointerCast(Dst, Int8PtrTy); Val = Builder.CreateTrunc(Val, Int8Ty); Size = Builder.CreateIntCast(Size, Int64Ty, false); return Builder.CreateCall( @@ -11344,34 +11337,27 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID, } if (MTEIntrinsicID != Intrinsic::not_intrinsic) { - llvm::Type *T = ConvertType(E->getType()); - if (MTEIntrinsicID == Intrinsic::aarch64_irg) { Value *Pointer = EmitScalarExpr(E->getArg(0)); Value *Mask = EmitScalarExpr(E->getArg(1)); - Pointer = Builder.CreatePointerCast(Pointer, Int8PtrTy); Mask = Builder.CreateZExt(Mask, Int64Ty); - Value *RV = Builder.CreateCall( - CGM.getIntrinsic(MTEIntrinsicID), {Pointer, Mask}); - return Builder.CreatePointerCast(RV, T); + return Builder.CreateCall(CGM.getIntrinsic(MTEIntrinsicID), + {Pointer, Mask}); } if (MTEIntrinsicID == Intrinsic::aarch64_addg) { Value *Pointer = EmitScalarExpr(E->getArg(0)); Value *TagOffset = EmitScalarExpr(E->getArg(1)); - Pointer = Builder.CreatePointerCast(Pointer, Int8PtrTy); TagOffset = Builder.CreateZExt(TagOffset, Int64Ty); - Value *RV = Builder.CreateCall( - CGM.getIntrinsic(MTEIntrinsicID), {Pointer, TagOffset}); - return Builder.CreatePointerCast(RV, T); + return Builder.CreateCall(CGM.getIntrinsic(MTEIntrinsicID), + {Pointer, TagOffset}); } if (MTEIntrinsicID == Intrinsic::aarch64_gmi) { Value *Pointer = EmitScalarExpr(E->getArg(0)); Value *ExcludedMask = EmitScalarExpr(E->getArg(1)); ExcludedMask = Builder.CreateZExt(ExcludedMask, Int64Ty); 
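// [Editor's sketch, not part of the patch] The __builtin_dynamic_object_size
// change above, restated in C; under the new behavior the whole-struct query
// is expected to fall back to the generic (usually unknown) answer:
//
//   struct S { int count; int fam[] __attribute__((counted_by(count))); };
//   struct S *p = make_s(42);                 // make_s is hypothetical
//   __builtin_dynamic_object_size(p->fam, 1); // still p->count * sizeof(int)
//   __builtin_dynamic_object_size(p, 1);      // no longer max(sizeof(S),
//                                             // offsetof + count * size);
//                                             // matches GCC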
-    Pointer = Builder.CreatePointerCast(Pointer, Int8PtrTy);
     return Builder.CreateCall(
         CGM.getIntrinsic(MTEIntrinsicID), {Pointer, ExcludedMask});
   }
@@ -11380,25 +11366,20 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
   // return address same as input address.
   if (MTEIntrinsicID == Intrinsic::aarch64_ldg) {
     Value *TagAddress = EmitScalarExpr(E->getArg(0));
-    TagAddress = Builder.CreatePointerCast(TagAddress, Int8PtrTy);
-    Value *RV = Builder.CreateCall(
-        CGM.getIntrinsic(MTEIntrinsicID), {TagAddress, TagAddress});
-    return Builder.CreatePointerCast(RV, T);
+    return Builder.CreateCall(CGM.getIntrinsic(MTEIntrinsicID),
+                              {TagAddress, TagAddress});
   }
 
   // Although it is possible to supply a different tag (to set)
   // to this intrinsic (as first arg), for now we supply
   // the tag that is in input address arg (common use case).
   if (MTEIntrinsicID == Intrinsic::aarch64_stg) {
-    Value *TagAddress = EmitScalarExpr(E->getArg(0));
-    TagAddress = Builder.CreatePointerCast(TagAddress, Int8PtrTy);
-    return Builder.CreateCall(
-        CGM.getIntrinsic(MTEIntrinsicID), {TagAddress, TagAddress});
+    Value *TagAddress = EmitScalarExpr(E->getArg(0));
+    return Builder.CreateCall(CGM.getIntrinsic(MTEIntrinsicID),
+                              {TagAddress, TagAddress});
   }
   if (MTEIntrinsicID == Intrinsic::aarch64_subp) {
     Value *PointerA = EmitScalarExpr(E->getArg(0));
     Value *PointerB = EmitScalarExpr(E->getArg(1));
-    PointerA = Builder.CreatePointerCast(PointerA, Int8PtrTy);
-    PointerB = Builder.CreatePointerCast(PointerB, Int8PtrTy);
     return Builder.CreateCall(
         CGM.getIntrinsic(MTEIntrinsicID), {PointerA, PointerB});
   }
@@ -18922,6 +18903,24 @@ case Builtin::BI__builtin_hlsl_elementwise_isinf: {
     return EmitRuntimeCall(
         Intrinsic::getOrInsertDeclaration(&CGM.getModule(), ID));
   }
+  case Builtin::BI__builtin_hlsl_wave_read_lane_at: {
+    // Due to the use of variadic arguments, we must explicitly retrieve them
+    // and create our function type.
+ Value *OpExpr = EmitScalarExpr(E->getArg(0)); + Value *OpIndex = EmitScalarExpr(E->getArg(1)); + llvm::FunctionType *FT = llvm::FunctionType::get( + OpExpr->getType(), ArrayRef{OpExpr->getType(), OpIndex->getType()}, + false); + + // Get overloaded name + std::string Name = + Intrinsic::getName(CGM.getHLSLRuntime().getWaveReadLaneAtIntrinsic(), + ArrayRef{OpExpr->getType()}, &CGM.getModule()); + return EmitRuntimeCall(CGM.CreateRuntimeFunction(FT, Name, {}, + /*Local=*/false, + /*AssumeConvergent=*/true), + ArrayRef{OpExpr, OpIndex}, "hlsl.wave.readlane"); + } case Builtin::BI__builtin_hlsl_elementwise_sign: { auto *Arg0 = E->getArg(0); Value *Op0 = EmitScalarExpr(Arg0); diff --git a/clang/lib/CodeGen/CGDeclCXX.cpp b/clang/lib/CodeGen/CGDeclCXX.cpp index 8dcb5f6100619647b8e7e4c7babbbbf94438aab1..b4f1a68cfe87f4b8987dd1914d68d36dd02c1975 100644 --- a/clang/lib/CodeGen/CGDeclCXX.cpp +++ b/clang/lib/CodeGen/CGDeclCXX.cpp @@ -1121,6 +1121,14 @@ CodeGenFunction::GenerateCXXGlobalInitFunc(llvm::Function *Fn, if (Decls[i]) EmitRuntimeCall(Decls[i]); + if (getLangOpts().HLSL) { + CGHLSLRuntime &CGHLSL = CGM.getHLSLRuntime(); + if (CGHLSL.needsResourceBindingInitFn()) { + llvm::Function *ResInitFn = CGHLSL.createResourceBindingInitFn(); + Builder.CreateCall(llvm::FunctionCallee(ResInitFn), {}); + } + } + Scope.ForceCleanup(); if (ExitBlock) { diff --git a/clang/lib/CodeGen/CGExprCXX.cpp b/clang/lib/CodeGen/CGExprCXX.cpp index 1214bb054fb8df402cfb4dc82c98501067e41bd1..648b9b9ed98063d450264e41e42add7410c164ab 100644 --- a/clang/lib/CodeGen/CGExprCXX.cpp +++ b/clang/lib/CodeGen/CGExprCXX.cpp @@ -1771,14 +1771,6 @@ llvm::Value *CodeGenFunction::EmitCXXNewExpr(const CXXNewExpr *E) { EmitNewInitializer(*this, E, allocType, elementTy, result, numElements, allocSizeWithoutCookie); llvm::Value *resultPtr = result.emitRawPointer(*this); - if (E->isArray()) { - // NewPtr is a pointer to the base element type. If we're - // allocating an array of arrays, we'll need to cast back to the - // array pointer type. - llvm::Type *resultType = ConvertTypeForMem(E->getType()); - if (resultPtr->getType() != resultType) - resultPtr = Builder.CreateBitCast(resultPtr, resultType); - } // Deactivate the 'operator delete' cleanup if we finished // initialization. @@ -1805,7 +1797,7 @@ llvm::Value *CodeGenFunction::EmitCXXNewExpr(const CXXNewExpr *E) { } void CodeGenFunction::EmitDeleteCall(const FunctionDecl *DeleteFD, - llvm::Value *Ptr, QualType DeleteTy, + llvm::Value *DeletePtr, QualType DeleteTy, llvm::Value *NumElements, CharUnits CookieSize) { assert((!NumElements && CookieSize.isZero()) || @@ -1819,7 +1811,6 @@ void CodeGenFunction::EmitDeleteCall(const FunctionDecl *DeleteFD, // Pass the pointer itself. QualType ArgTy = *ParamTypeIt++; - llvm::Value *DeletePtr = Builder.CreateBitCast(Ptr, ConvertType(ArgTy)); DeleteArgs.add(RValue::get(DeletePtr), ArgTy); // Pass the std::destroying_delete tag if present. 
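// [Editor's sketch, not part of the patch] With the CGDeclCXX hook above, a
// module that declares bindable HLSL resources gets a global-init function
// that calls the generated binder, roughly:
//
//   define internal void @_GLOBAL__sub_I_example.hlsl() {
//   entry:
//     call void @_init_resource_bindings()
//     ret void
//   }
//
// where @_init_resource_bindings is created in CGHLSLRuntime (next file).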
diff --git a/clang/lib/CodeGen/CGHLSLRuntime.cpp b/clang/lib/CodeGen/CGHLSLRuntime.cpp index 3237d93ca31ceb38ffefb59623a96ca10e919774..2cce2936fe5aeefa5bcd82a2561dc993a3bb92d9 100644 --- a/clang/lib/CodeGen/CGHLSLRuntime.cpp +++ b/clang/lib/CodeGen/CGHLSLRuntime.cpp @@ -18,8 +18,13 @@ #include "TargetInfo.h" #include "clang/AST/Decl.h" #include "clang/Basic/TargetOptions.h" +#include "llvm/IR/GlobalVariable.h" +#include "llvm/IR/LLVMContext.h" #include "llvm/IR/Metadata.h" #include "llvm/IR/Module.h" +#include "llvm/IR/Value.h" +#include "llvm/Support/Alignment.h" + #include "llvm/Support/FormatVariadic.h" using namespace clang; @@ -489,3 +494,88 @@ void CGHLSLRuntime::generateGlobalCtorDtorCalls() { GV->eraseFromParent(); } } + +void CGHLSLRuntime::handleGlobalVarDefinition(const VarDecl *VD, + llvm::GlobalVariable *GV) { + // If the global variable has resource binding, add it to the list of globals + // that need resource binding initialization. + const HLSLResourceBindingAttr *RBA = VD->getAttr(); + if (!RBA) + return; + + if (!HLSLAttributedResourceType::findHandleTypeOnResource( + VD->getType().getTypePtr())) + // FIXME: Only simple declarations of resources are supported for now. + // Arrays of resources or resources in user defined classes are + // not implemented yet. + return; + + ResourcesToBind.emplace_back(VD, GV); +} + +bool CGHLSLRuntime::needsResourceBindingInitFn() { + return !ResourcesToBind.empty(); +} + +llvm::Function *CGHLSLRuntime::createResourceBindingInitFn() { + // No resources to bind + assert(needsResourceBindingInitFn() && "no resources to bind"); + + LLVMContext &Ctx = CGM.getLLVMContext(); + llvm::Type *Int1Ty = llvm::Type::getInt1Ty(Ctx); + + llvm::Function *InitResBindingsFunc = + llvm::Function::Create(llvm::FunctionType::get(CGM.VoidTy, false), + llvm::GlobalValue::InternalLinkage, + "_init_resource_bindings", CGM.getModule()); + + llvm::BasicBlock *EntryBB = + llvm::BasicBlock::Create(Ctx, "entry", InitResBindingsFunc); + CGBuilderTy Builder(CGM, Ctx); + const DataLayout &DL = CGM.getModule().getDataLayout(); + Builder.SetInsertPoint(EntryBB); + + for (const auto &[VD, GV] : ResourcesToBind) { + for (Attr *A : VD->getAttrs()) { + HLSLResourceBindingAttr *RBA = dyn_cast(A); + if (!RBA) + continue; + + const HLSLAttributedResourceType *AttrResType = + HLSLAttributedResourceType::findHandleTypeOnResource( + VD->getType().getTypePtr()); + + // FIXME: Only simple declarations of resources are supported for now. + // Arrays of resources or resources in user defined classes are + // not implemented yet. 
+ assert(AttrResType != nullptr && + "Resource class must have a handle of HLSLAttributedResourceType"); + + llvm::Type *TargetTy = + CGM.getTargetCodeGenInfo().getHLSLType(CGM, AttrResType); + assert(TargetTy != nullptr && + "Failed to convert resource handle to target type"); + + auto *Space = llvm::ConstantInt::get(CGM.IntTy, RBA->getSpaceNumber()); + auto *Slot = llvm::ConstantInt::get(CGM.IntTy, RBA->getSlotNumber()); + // FIXME: resource arrays are not yet implemented + auto *Range = llvm::ConstantInt::get(CGM.IntTy, 1); + auto *Index = llvm::ConstantInt::get(CGM.IntTy, 0); + // FIXME: NonUniformResourceIndex bit is not yet implemented + auto *NonUniform = llvm::ConstantInt::get(Int1Ty, false); + llvm::Value *Args[] = {Space, Slot, Range, Index, NonUniform}; + + llvm::Value *CreateHandle = Builder.CreateIntrinsic( + /*ReturnType=*/TargetTy, getCreateHandleFromBindingIntrinsic(), Args, + nullptr, Twine(VD->getName()).concat("_h")); + + llvm::Value *HandleRef = + Builder.CreateStructGEP(GV->getValueType(), GV, 0); + Builder.CreateAlignedStore(CreateHandle, HandleRef, + HandleRef->getPointerAlignment(DL)); + } + } + + Builder.CreateRetVoid(); + return InitResBindingsFunc; +} diff --git a/clang/lib/CodeGen/CGHLSLRuntime.h b/clang/lib/CodeGen/CGHLSLRuntime.h index 282fa44af212fbd029ad232d1b9c7892eca84868..ff7df41b5c62e711b474ea65d8bfbdf2043c6adb 100644 --- a/clang/lib/CodeGen/CGHLSLRuntime.h +++ b/clang/lib/CodeGen/CGHLSLRuntime.h @@ -90,6 +90,9 @@ public: GENERATE_HLSL_INTRINSIC_FUNCTION(SDot, sdot) GENERATE_HLSL_INTRINSIC_FUNCTION(UDot, udot) GENERATE_HLSL_INTRINSIC_FUNCTION(WaveIsFirstLane, wave_is_first_lane) + GENERATE_HLSL_INTRINSIC_FUNCTION(WaveReadLaneAt, wave_readlane) + + GENERATE_HLSL_INTRINSIC_FUNCTION(CreateHandleFromBinding, handle_fromBinding) //===----------------------------------------------------------------------===// // End of reserved area for HLSL intrinsic getters. 
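// [Editor's sketch, not part of the patch] For a declaration such as
//   RWBuffer<float4> Buf : register(u5, space3);
// the initializer built above would emit approximately:
//   %Buf_h = call target("dx.TypedBuffer", <4 x float>, 1, 0, 0)
//       @llvm.dx.handle.fromBinding(i32 3, i32 5, i32 1, i32 0, i1 false)
// followed by a store of %Buf_h into the handle field of the global @Buf.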
@@ -136,6 +139,10 @@ public: void emitEntryFunction(const FunctionDecl *FD, llvm::Function *Fn); void setHLSLFunctionAttributes(const FunctionDecl *FD, llvm::Function *Fn); + void handleGlobalVarDefinition(const VarDecl *VD, llvm::GlobalVariable *Var); + + bool needsResourceBindingInitFn(); + llvm::Function *createResourceBindingInitFn(); private: void addBufferResourceAnnotation(llvm::GlobalVariable *GV, @@ -147,6 +154,9 @@ private: void addBufferDecls(const DeclContext *DC, Buffer &CB); llvm::Triple::ArchType getArch(); llvm::SmallVector Buffers; + + llvm::SmallVector> + ResourcesToBind; }; } // namespace CodeGen diff --git a/clang/lib/CodeGen/CGVTT.cpp b/clang/lib/CodeGen/CGVTT.cpp index 20bd2c2fc2c6420357cfbdda6679d4547e5d1fd6..989a07d09d50ee843da3f7f08db5e1c1fee4fcfa 100644 --- a/clang/lib/CodeGen/CGVTT.cpp +++ b/clang/lib/CodeGen/CGVTT.cpp @@ -85,8 +85,9 @@ CodeGenVTables::EmitVTTDefinition(llvm::GlobalVariable *VTT, cast(VTable->getValueType()) ->getElementType(AddressPoint.VTableIndex)); unsigned Offset = ComponentSize * AddressPoint.AddressPointIndex; - llvm::ConstantRange InRange(llvm::APInt(32, -Offset, true), - llvm::APInt(32, VTableSize - Offset, true)); + llvm::ConstantRange InRange( + llvm::APInt(32, (int)-Offset, true), + llvm::APInt(32, (int)(VTableSize - Offset), true)); llvm::Constant *Init = llvm::ConstantExpr::getGetElementPtr( VTable->getValueType(), VTable, Idxs, /*InBounds=*/true, InRange); diff --git a/clang/lib/CodeGen/CodeGenABITypes.cpp b/clang/lib/CodeGen/CodeGenABITypes.cpp index a6073e1188d6fa284085f0cb7f819efca0d796ca..3f10d68f8c5d45fc93b9be59159819c4fabfe989 100644 --- a/clang/lib/CodeGen/CodeGenABITypes.cpp +++ b/clang/lib/CodeGen/CodeGenABITypes.cpp @@ -59,14 +59,23 @@ CodeGen::arrangeCXXMethodType(CodeGenModule &CGM, return CGM.getTypes().arrangeCXXMethodType(RD, FTP, MD); } -const CGFunctionInfo & -CodeGen::arrangeFreeFunctionCall(CodeGenModule &CGM, - CanQualType returnType, - ArrayRef argTypes, - FunctionType::ExtInfo info, - RequiredArgs args) { - return CGM.getTypes().arrangeLLVMFunctionInfo(returnType, FnInfoOpts::None, - argTypes, info, {}, args); +const CGFunctionInfo &CodeGen::arrangeCXXMethodCall( + CodeGenModule &CGM, CanQualType returnType, ArrayRef argTypes, + FunctionType::ExtInfo info, + ArrayRef paramInfos, + RequiredArgs args) { + return CGM.getTypes().arrangeLLVMFunctionInfo( + returnType, FnInfoOpts::IsInstanceMethod, argTypes, info, paramInfos, + args); +} + +const CGFunctionInfo &CodeGen::arrangeFreeFunctionCall( + CodeGenModule &CGM, CanQualType returnType, ArrayRef argTypes, + FunctionType::ExtInfo info, + ArrayRef paramInfos, + RequiredArgs args) { + return CGM.getTypes().arrangeLLVMFunctionInfo( + returnType, FnInfoOpts::None, argTypes, info, paramInfos, args); } ImplicitCXXConstructorArgs diff --git a/clang/lib/CodeGen/CodeGenFunction.cpp b/clang/lib/CodeGen/CodeGenFunction.cpp index f3023c7a20c4050f2a66740b5d747f23a705b48a..2306043c90f406233ef5432c3c98f17890ba63b7 100644 --- a/clang/lib/CodeGen/CodeGenFunction.cpp +++ b/clang/lib/CodeGen/CodeGenFunction.cpp @@ -282,6 +282,7 @@ TypeEvaluationKind CodeGenFunction::getEvaluationKind(QualType type) { case Type::ObjCObjectPointer: case Type::Pipe: case Type::BitInt: + case Type::HLSLAttributedResource: return TEK_Scalar; // Complexes. 
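// [Editor's note, not part of the patch] On the (int) casts added to the
// ConstantRange constructions above: Offset is unsigned, so -Offset is a
// large value once promoted to uint64_t, and APInt's signed constructor may
// assert that the input is a sign-extended 32-bit quantity. A minimal
// illustration:
//
//   unsigned Offset = 8;
//   llvm::APInt Bad(32, -Offset, /*isSigned=*/true);       // can trip the
//                                                          // truncation check
//   llvm::APInt Good(32, (int)-Offset, /*isSigned=*/true); // -8, as intended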
diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h index cf802f12bfa47f500bba42326c9914b6d963f9e1..b9e28efde81c1a6165fbbec155a41deeec241678 100644 --- a/clang/lib/CodeGen/CodeGenFunction.h +++ b/clang/lib/CodeGen/CodeGenFunction.h @@ -4166,7 +4166,8 @@ public: llvm::AtomicRMWInst *emitAtomicRMWInst( llvm::AtomicRMWInst::BinOp Op, Address Addr, llvm::Value *Val, llvm::AtomicOrdering Order = llvm::AtomicOrdering::SequentiallyConsistent, - llvm::SyncScope::ID SSID = llvm::SyncScope::System); + llvm::SyncScope::ID SSID = llvm::SyncScope::System, + const AtomicExpr *AE = nullptr); void EmitAtomicUpdate(LValue LVal, llvm::AtomicOrdering AO, const llvm::function_ref &UpdateOp, diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp index b05ab3606a698ba37fdf26a91eabbd3ab388c8f1..9a84a11973b1a93f96160d80f975f20652f81e50 100644 --- a/clang/lib/CodeGen/CodeGenModule.cpp +++ b/clang/lib/CodeGen/CodeGenModule.cpp @@ -295,6 +295,7 @@ createTargetCodeGenInfo(CodeGenModule &CGM) { return createCommonSPIRTargetCodeGenInfo(CGM); case llvm::Triple::spirv32: case llvm::Triple::spirv64: + case llvm::Triple::spirv: return createSPIRVTargetCodeGenInfo(CGM); case llvm::Triple::dxil: return createDirectXTargetCodeGenInfo(CGM); @@ -5633,6 +5634,9 @@ void CodeGenModule::EmitGlobalVarDefinition(const VarDecl *D, getCUDARuntime().handleVarRegistration(D, *GV); } + if (LangOpts.HLSL) + getHLSLRuntime().handleGlobalVarDefinition(D, GV); + GV->setInitializer(Init); if (emitter) emitter->finalize(GV); diff --git a/clang/lib/CodeGen/CodeGenTypes.cpp b/clang/lib/CodeGen/CodeGenTypes.cpp index 82eb7234c98c7af0db3a1eca66960bb5d0a5c7af..9bfb4c7fce37d3a2caab9fc7a2082f0e09ddd9dc 100644 --- a/clang/lib/CodeGen/CodeGenTypes.cpp +++ b/clang/lib/CodeGen/CodeGenTypes.cpp @@ -750,6 +750,9 @@ llvm::Type *CodeGenTypes::ConvertType(QualType T) { ResultType = llvm::Type::getIntNTy(getLLVMContext(), EIT->getNumBits()); break; } + case Type::HLSLAttributedResource: + ResultType = CGM.getHLSLRuntime().convertHLSLSpecificType(Ty); + break; } assert(ResultType && "Didn't convert a type?"); diff --git a/clang/lib/CodeGen/CoverageMappingGen.cpp b/clang/lib/CodeGen/CoverageMappingGen.cpp index 07015834bc84f35f2fe0cf25831ebe75226afaae..577a0f571e16ea81eb14b5cd099306dcd77eff6e 100644 --- a/clang/lib/CodeGen/CoverageMappingGen.cpp +++ b/clang/lib/CodeGen/CoverageMappingGen.cpp @@ -2066,7 +2066,7 @@ struct CounterCoverageMappingBuilder GapRegionCounter = OutCount; } - if (!S->isConsteval() && !llvm::EnableSingleByteCoverage) + if (!llvm::EnableSingleByteCoverage) // Create Branch Region around condition. 
createBranchRegion(S->getCond(), ThenCount, subtractCounters(ParentCount, ThenCount)); diff --git a/clang/lib/CodeGen/ItaniumCXXABI.cpp b/clang/lib/CodeGen/ItaniumCXXABI.cpp index 75dab596e1b2c4f69f045819a0d56c5bd8c7034a..89f9457523824a60371689bddc01699967bef449 100644 --- a/clang/lib/CodeGen/ItaniumCXXABI.cpp +++ b/clang/lib/CodeGen/ItaniumCXXABI.cpp @@ -2099,8 +2099,9 @@ ItaniumCXXABI::getVTableAddressPoint(BaseSubobject Base, unsigned VTableSize = ComponentSize * Layout.getVTableSize(AddressPoint.VTableIndex); unsigned Offset = ComponentSize * AddressPoint.AddressPointIndex; - llvm::ConstantRange InRange(llvm::APInt(32, -Offset, true), - llvm::APInt(32, VTableSize - Offset, true)); + llvm::ConstantRange InRange( + llvm::APInt(32, (int)-Offset, true), + llvm::APInt(32, (int)(VTableSize - Offset), true)); return llvm::ConstantExpr::getGetElementPtr( VTable->getValueType(), VTable, Indices, /*InBounds=*/true, InRange); } @@ -3436,7 +3437,7 @@ class ItaniumRTTIBuilder { llvm::Constant *GetAddrOfExternalRTTIDescriptor(QualType Ty); /// BuildVTablePointer - Build the vtable pointer for the given type. - void BuildVTablePointer(const Type *Ty); + void BuildVTablePointer(const Type *Ty, llvm::Constant *StorageAddress); /// BuildSIClassTypeInfo - Build an abi::__si_class_type_info, used for single /// inheritance, according to the Itanium C++ ABI, 2.9.5p6b. @@ -3833,7 +3834,8 @@ static bool CanUseSingleInheritance(const CXXRecordDecl *RD) { return true; } -void ItaniumRTTIBuilder::BuildVTablePointer(const Type *Ty) { +void ItaniumRTTIBuilder::BuildVTablePointer(const Type *Ty, + llvm::Constant *StorageAddress) { // abi::__class_type_info. static const char * const ClassTypeInfo = "_ZTVN10__cxxabiv117__class_type_infoE"; @@ -3947,6 +3949,9 @@ void ItaniumRTTIBuilder::BuildVTablePointer(const Type *Ty) { // abi::__pointer_to_member_type_info. VTableName = "_ZTVN10__cxxabiv129__pointer_to_member_type_infoE"; break; + + case Type::HLSLAttributedResource: + llvm_unreachable("HLSL doesn't support virtual functions"); } llvm::Constant *VTable = nullptr; @@ -3977,9 +3982,12 @@ void ItaniumRTTIBuilder::BuildVTablePointer(const Type *Ty) { VTable, Two); } - if (auto &Schema = CGM.getCodeGenOpts().PointerAuth.CXXTypeInfoVTablePointer) - VTable = CGM.getConstantSignedPointer(VTable, Schema, nullptr, GlobalDecl(), - QualType(Ty, 0)); + if (const auto &Schema = + CGM.getCodeGenOpts().PointerAuth.CXXTypeInfoVTablePointer) + VTable = CGM.getConstantSignedPointer( + VTable, Schema, + Schema.isAddressDiscriminated() ? StorageAddress : nullptr, + GlobalDecl(), QualType(Ty, 0)); Fields.push_back(VTable); } @@ -4095,8 +4103,18 @@ llvm::Constant *ItaniumRTTIBuilder::BuildTypeInfo( llvm::GlobalVariable::LinkageTypes Linkage, llvm::GlobalValue::VisibilityTypes Visibility, llvm::GlobalValue::DLLStorageClassTypes DLLStorageClass) { + SmallString<256> Name; + llvm::raw_svector_ostream Out(Name); + CGM.getCXXABI().getMangleContext().mangleCXXRTTI(Ty, Out); + llvm::Module &M = CGM.getModule(); + llvm::GlobalVariable *OldGV = M.getNamedGlobal(Name); + // int8 is an arbitrary type to be replaced later with replaceInitializer. + llvm::GlobalVariable *GV = + new llvm::GlobalVariable(M, CGM.Int8Ty, /*isConstant=*/true, Linkage, + /*Initializer=*/nullptr, Name); + // Add the vtable pointer. - BuildVTablePointer(cast(Ty)); + BuildVTablePointer(cast(Ty), GV); // And the name. 
llvm::GlobalVariable *TypeName = GetAddrOfTypeName(Ty, Linkage); @@ -4209,18 +4227,12 @@ llvm::Constant *ItaniumRTTIBuilder::BuildTypeInfo( case Type::Atomic: // No fields, at least for the moment. break; - } - llvm::Constant *Init = llvm::ConstantStruct::getAnon(Fields); + case Type::HLSLAttributedResource: + llvm_unreachable("HLSL doesn't support RTTI"); + } - SmallString<256> Name; - llvm::raw_svector_ostream Out(Name); - CGM.getCXXABI().getMangleContext().mangleCXXRTTI(Ty, Out); - llvm::Module &M = CGM.getModule(); - llvm::GlobalVariable *OldGV = M.getNamedGlobal(Name); - llvm::GlobalVariable *GV = - new llvm::GlobalVariable(M, Init->getType(), - /*isConstant=*/true, Linkage, Init, Name); + GV->replaceInitializer(llvm::ConstantStruct::getAnon(Fields)); // Export the typeinfo in the same circumstances as the vtable is exported. auto GVDLLStorageClass = DLLStorageClass; diff --git a/clang/lib/CodeGen/TargetInfo.h b/clang/lib/CodeGen/TargetInfo.h index 3e503538b2b14d23e9059ec922b789e8a3e11b57..373f8b8a80fdb113d5631e021cf5aeaaecdba846 100644 --- a/clang/lib/CodeGen/TargetInfo.h +++ b/clang/lib/CodeGen/TargetInfo.h @@ -336,7 +336,9 @@ public: /// Allow the target to apply other metadata to an atomic instruction virtual void setTargetAtomicMetadata(CodeGenFunction &CGF, - llvm::AtomicRMWInst &RMW) const {} + llvm::Instruction &AtomicInst, + const AtomicExpr *Expr = nullptr) const { + } /// Interface class for filling custom fields of a block literal for OpenCL. class TargetOpenCLBlockHelper { diff --git a/clang/lib/CodeGen/Targets/AMDGPU.cpp b/clang/lib/CodeGen/Targets/AMDGPU.cpp index b852dcffb295c9d4717316605d14eac1fcce4f96..56ad0503a11ab2b7ae372ec94b559fdb54abaead 100644 --- a/clang/lib/CodeGen/Targets/AMDGPU.cpp +++ b/clang/lib/CodeGen/Targets/AMDGPU.cpp @@ -9,6 +9,7 @@ #include "ABIInfoImpl.h" #include "TargetInfo.h" #include "clang/Basic/TargetOptions.h" +#include "llvm/Support/AMDGPUAddrSpace.h" using namespace clang; using namespace clang::CodeGen; @@ -312,7 +313,8 @@ public: llvm::AtomicOrdering Ordering, llvm::LLVMContext &Ctx) const override; void setTargetAtomicMetadata(CodeGenFunction &CGF, - llvm::AtomicRMWInst &RMW) const override; + llvm::Instruction &AtomicInst, + const AtomicExpr *Expr = nullptr) const override; llvm::Value *createEnqueuedBlockKernel(CodeGenFunction &CGF, llvm::Function *BlockInvokeFunc, llvm::Type *BlockTy) const override; @@ -546,19 +548,39 @@ AMDGPUTargetCodeGenInfo::getLLVMSyncScopeID(const LangOptions &LangOpts, } void AMDGPUTargetCodeGenInfo::setTargetAtomicMetadata( - CodeGenFunction &CGF, llvm::AtomicRMWInst &RMW) const { - if (!CGF.getTarget().allowAMDGPUUnsafeFPAtomics()) + CodeGenFunction &CGF, llvm::Instruction &AtomicInst, + const AtomicExpr *AE) const { + auto *RMW = dyn_cast(&AtomicInst); + auto *CmpX = dyn_cast(&AtomicInst); + + // OpenCL and old style HIP atomics consider atomics targeting thread private + // memory to be undefined. + // + // TODO: This is probably undefined for atomic load/store, but there's not + // much direct codegen benefit to knowing this. 
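// [Editor's sketch, not part of the patch] At the IR level, the block below
// turns a flat-address-space atomic into, e.g.:
//   %old = atomicrmw add ptr %p, i32 1 monotonic, !noalias.addrspace !0
//   !0 = !{i32 5, i32 6} ; excludes AMDGPU private (address space 5)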
+ if (((RMW && RMW->getPointerAddressSpace() == llvm::AMDGPUAS::FLAT_ADDRESS) || + (CmpX && + CmpX->getPointerAddressSpace() == llvm::AMDGPUAS::FLAT_ADDRESS)) && + AE && AE->threadPrivateMemoryAtomicsAreUndefined()) { + llvm::MDBuilder MDHelper(CGF.getLLVMContext()); + llvm::MDNode *ASRange = MDHelper.createRange( + llvm::APInt(32, llvm::AMDGPUAS::PRIVATE_ADDRESS), + llvm::APInt(32, llvm::AMDGPUAS::PRIVATE_ADDRESS + 1)); + AtomicInst.setMetadata(llvm::LLVMContext::MD_noalias_addrspace, ASRange); + } + + if (!RMW || !CGF.getTarget().allowAMDGPUUnsafeFPAtomics()) return; // TODO: Introduce new, more controlled options that also work for integers, // and deprecate allowAMDGPUUnsafeFPAtomics. - llvm::AtomicRMWInst::BinOp RMWOp = RMW.getOperation(); + llvm::AtomicRMWInst::BinOp RMWOp = RMW->getOperation(); if (llvm::AtomicRMWInst::isFPOperation(RMWOp)) { llvm::MDNode *Empty = llvm::MDNode::get(CGF.getLLVMContext(), {}); - RMW.setMetadata("amdgpu.no.fine.grained.memory", Empty); + RMW->setMetadata("amdgpu.no.fine.grained.memory", Empty); - if (RMWOp == llvm::AtomicRMWInst::FAdd && RMW.getType()->isFloatTy()) - RMW.setMetadata("amdgpu.ignore.denormal.mode", Empty); + if (RMWOp == llvm::AtomicRMWInst::FAdd && RMW->getType()->isFloatTy()) + RMW->setMetadata("amdgpu.ignore.denormal.mode", Empty); } } diff --git a/clang/lib/CodeGen/Targets/DirectX.cpp b/clang/lib/CodeGen/Targets/DirectX.cpp index 13da2c630629d7cf375a0c18e1fba046796ecb4c..7935f7ae3700476e437e8b703ec38bcb30b5f931 100644 --- a/clang/lib/CodeGen/Targets/DirectX.cpp +++ b/clang/lib/CodeGen/Targets/DirectX.cpp @@ -29,19 +29,41 @@ public: llvm::Type *DirectXTargetCodeGenInfo::getHLSLType(CodeGenModule &CGM, const Type *Ty) const { - auto *BuiltinTy = dyn_cast(Ty); - if (!BuiltinTy || BuiltinTy->getKind() != BuiltinType::HLSLResource) + auto *ResType = dyn_cast(Ty); + if (!ResType) return nullptr; llvm::LLVMContext &Ctx = CGM.getLLVMContext(); - // FIXME: translate __hlsl_resource_t to target("dx.TypedBuffer", <4 x float>, - // 1, 0, 0) only for now (RWBuffer); more work us needed to determine - // the target ext type and its parameters based on the handle type - // attributes (not yet implemented) - llvm::FixedVectorType *ElemType = - llvm::FixedVectorType::get(llvm::Type::getFloatTy(Ctx), 4); - unsigned Flags[] = {/*IsWriteable*/ 1, /*IsROV*/ 0, /*IsSigned*/ 0}; - return llvm::TargetExtType::get(Ctx, "dx.TypedBuffer", {ElemType}, Flags); + const HLSLAttributedResourceType::Attributes &ResAttrs = ResType->getAttrs(); + switch (ResAttrs.ResourceClass) { + case llvm::dxil::ResourceClass::UAV: + case llvm::dxil::ResourceClass::SRV: { + // TypedBuffer and RawBuffer both need element type + QualType ContainedTy = ResType->getContainedType(); + if (ContainedTy.isNull()) + return nullptr; + + // convert element type + llvm::Type *ElemType = CGM.getTypes().ConvertType(ContainedTy); + + llvm::StringRef TypeName = + ResAttrs.RawBuffer ? 
"dx.RawBuffer" : "dx.TypedBuffer"; + SmallVector Ints = {/*IsWriteable*/ ResAttrs.ResourceClass == + llvm::dxil::ResourceClass::UAV, + /*IsROV*/ ResAttrs.IsROV}; + if (!ResAttrs.RawBuffer) + Ints.push_back(/*IsSigned*/ ContainedTy->isSignedIntegerType()); + + return llvm::TargetExtType::get(Ctx, TypeName, {ElemType}, Ints); + } + case llvm::dxil::ResourceClass::CBuffer: + llvm_unreachable("dx.CBuffer handles are not implemented yet"); + break; + case llvm::dxil::ResourceClass::Sampler: + llvm_unreachable("dx.Sampler handles are not implemented yet"); + break; + } + llvm_unreachable("Unknown llvm::dxil::ResourceClass enum"); } } // namespace diff --git a/clang/lib/Driver/Distro.cpp b/clang/lib/Driver/Distro.cpp index 6f49e641104ccd3b9a533f7483b5f76dad3a9263..3d1bce027f664d9dfd717e29af9641e3400e2652 100644 --- a/clang/lib/Driver/Distro.cpp +++ b/clang/lib/Driver/Distro.cpp @@ -96,6 +96,7 @@ static Distro::DistroType DetectLsbRelease(llvm::vfs::FileSystem &VFS) { .Case("mantic", Distro::UbuntuMantic) .Case("noble", Distro::UbuntuNoble) .Case("oracular", Distro::UbuntuOracular) + .Case("plucky", Distro::UbuntuPlucky) .Default(Distro::UnknownDistro); return Version; } diff --git a/clang/lib/Driver/ToolChains/Arch/LoongArch.cpp b/clang/lib/Driver/ToolChains/Arch/LoongArch.cpp index 771adade93813fd516e8957e6b88c213d6917868..355253e4b3b07c0a0ea2522163f4b757aa1a8dba 100644 --- a/clang/lib/Driver/ToolChains/Arch/LoongArch.cpp +++ b/clang/lib/Driver/ToolChains/Arch/LoongArch.cpp @@ -251,6 +251,15 @@ void loongarch::getLoongArchTargetFeatures(const Driver &D, } else /*-mno-lasx*/ Features.push_back("-lasx"); } + + // Select frecipe feature determined by -m[no-]frecipe. + if (const Arg *A = + Args.getLastArg(options::OPT_mfrecipe, options::OPT_mno_frecipe)) { + if (A->getOption().matches(options::OPT_mfrecipe)) + Features.push_back("+frecipe"); + else + Features.push_back("-frecipe"); + } } std::string loongarch::postProcessTargetCPUString(const std::string &CPU, diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index 02b3453a4f6c7361d933814d645b54c99071a4cb..9d8561a656c7b6f18afc3bd45b286e20842a426c 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -1870,6 +1870,14 @@ void Clang::AddLoongArchTargetArgs(const ArgList &Args, CmdArgs.push_back("-tune-cpu"); CmdArgs.push_back(Args.MakeArgString(TuneCPU)); } + + if (Arg *A = Args.getLastArg(options::OPT_mannotate_tablejump, + options::OPT_mno_annotate_tablejump)) { + if (A->getOption().matches(options::OPT_mannotate_tablejump)) { + CmdArgs.push_back("-mllvm"); + CmdArgs.push_back("-loongarch-annotate-tablejump"); + } + } } void Clang::AddMIPSTargetArgs(const ArgList &Args, @@ -3615,7 +3623,7 @@ static void RenderSSPOptions(const Driver &D, const ToolChain &TC, StringRef Value = A->getValue(); if (!EffectiveTriple.isX86() && !EffectiveTriple.isAArch64() && !EffectiveTriple.isARM() && !EffectiveTriple.isThumb() && - !EffectiveTriple.isRISCV()) + !EffectiveTriple.isRISCV() && !EffectiveTriple.isPPC()) D.Diag(diag::err_drv_unsupported_opt_for_target) << A->getAsString(Args) << TripleStr; if ((EffectiveTriple.isX86() || EffectiveTriple.isARM() || @@ -3655,7 +3663,7 @@ static void RenderSSPOptions(const Driver &D, const ToolChain &TC, << A->getOption().getName() << Value << "sysreg global"; return; } - if (EffectiveTriple.isRISCV()) { + if (EffectiveTriple.isRISCV() || EffectiveTriple.isPPC()) { if (Value != "tls" && Value != "global") { 
D.Diag(diag::err_drv_invalid_value_with_suggestion) << A->getOption().getName() << Value << "tls global"; @@ -3676,7 +3684,7 @@ static void RenderSSPOptions(const Driver &D, const ToolChain &TC, StringRef Value = A->getValue(); if (!EffectiveTriple.isX86() && !EffectiveTriple.isAArch64() && !EffectiveTriple.isARM() && !EffectiveTriple.isThumb() && - !EffectiveTriple.isRISCV()) + !EffectiveTriple.isRISCV() && !EffectiveTriple.isPPC()) D.Diag(diag::err_drv_unsupported_opt_for_target) << A->getAsString(Args) << TripleStr; int Offset; @@ -3696,7 +3704,7 @@ static void RenderSSPOptions(const Driver &D, const ToolChain &TC, if (Arg *A = Args.getLastArg(options::OPT_mstack_protector_guard_reg_EQ)) { StringRef Value = A->getValue(); if (!EffectiveTriple.isX86() && !EffectiveTriple.isAArch64() && - !EffectiveTriple.isRISCV()) + !EffectiveTriple.isRISCV() && !EffectiveTriple.isPPC()) D.Diag(diag::err_drv_unsupported_opt_for_target) << A->getAsString(Args) << TripleStr; if (EffectiveTriple.isX86() && (Value != "fs" && Value != "gs")) { @@ -3713,6 +3721,16 @@ static void RenderSSPOptions(const Driver &D, const ToolChain &TC, << A->getOption().getName() << Value << "tp"; return; } + if (EffectiveTriple.isPPC64() && Value != "r13") { + D.Diag(diag::err_drv_invalid_value_with_suggestion) + << A->getOption().getName() << Value << "r13"; + return; + } + if (EffectiveTriple.isPPC32() && Value != "r2") { + D.Diag(diag::err_drv_invalid_value_with_suggestion) + << A->getOption().getName() << Value << "r2"; + return; + } A->render(Args, CmdArgs); } @@ -7254,6 +7272,7 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, .Case("c++17", "-std=c++17") .Case("c++20", "-std=c++20") // TODO add c++23 and c++26 when MSVC supports it. + .Case("c++23preview", "-std=c++23") .Case("c++latest", "-std=c++26") .Default(""); if (LanguageStandard.empty()) diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp index 7841f8b7125236e13437aa3d334652b496de0c51..bbd3ff296c713f70d413f1a581add425ef753d14 100644 --- a/clang/lib/Driver/ToolChains/CommonArgs.cpp +++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp @@ -1294,6 +1294,16 @@ void tools::addFortranRuntimeLibs(const ToolChain &TC, const ArgList &Args, CmdArgs.push_back("-lFortranRuntime"); CmdArgs.push_back("-lFortranDecimal"); } + + // libomp needs libatomic for atomic operations if using libgcc + if (Args.hasFlag(options::OPT_fopenmp, options::OPT_fopenmp_EQ, + options::OPT_fno_openmp, false)) { + Driver::OpenMPRuntimeKind OMPRuntime = + TC.getDriver().getOpenMPRuntime(Args); + ToolChain::RuntimeLibType RuntimeLib = TC.GetRuntimeLibType(Args); + if (OMPRuntime == Driver::OMPRT_OMP && RuntimeLib == ToolChain::RLT_Libgcc) + CmdArgs.push_back("-latomic"); + } } void tools::addFortranRuntimeLibraryPath(const ToolChain &TC, diff --git a/clang/lib/Format/Format.cpp b/clang/lib/Format/Format.cpp index 148270795c562f81e4f869f9ddfa512dc5b69a6f..c612960ff37ac80cee4014e95e4bf7a475a796c3 100644 --- a/clang/lib/Format/Format.cpp +++ b/clang/lib/Format/Format.cpp @@ -1104,6 +1104,8 @@ template <> struct MappingTraits { IO.mapOptional("ReferenceAlignment", Style.ReferenceAlignment); IO.mapOptional("ReflowComments", Style.ReflowComments); IO.mapOptional("RemoveBracesLLVM", Style.RemoveBracesLLVM); + IO.mapOptional("RemoveEmptyLinesInUnwrappedLines", + Style.RemoveEmptyLinesInUnwrappedLines); IO.mapOptional("RemoveParentheses", Style.RemoveParentheses); IO.mapOptional("RemoveSemicolon", Style.RemoveSemicolon); 
IO.mapOptional("RequiresClausePosition", Style.RequiresClausePosition); @@ -1582,6 +1584,7 @@ FormatStyle getLLVMStyle(FormatStyle::LanguageKind Language) { LLVMStyle.ReferenceAlignment = FormatStyle::RAS_Pointer; LLVMStyle.ReflowComments = FormatStyle::RCS_Always; LLVMStyle.RemoveBracesLLVM = false; + LLVMStyle.RemoveEmptyLinesInUnwrappedLines = false; LLVMStyle.RemoveParentheses = FormatStyle::RPS_Leave; LLVMStyle.RemoveSemicolon = false; LLVMStyle.RequiresClausePosition = FormatStyle::RCPS_OwnLine; diff --git a/clang/lib/Format/TokenAnnotator.cpp b/clang/lib/Format/TokenAnnotator.cpp index fcefaa7bb298eabd7358749d63a7c1ce772d5e13..13037b6d00604b8c7e65785f1ae3bc5b56c0f726 100644 --- a/clang/lib/Format/TokenAnnotator.cpp +++ b/clang/lib/Format/TokenAnnotator.cpp @@ -5509,8 +5509,10 @@ static bool isAllmanLambdaBrace(const FormatToken &Tok) { bool TokenAnnotator::mustBreakBefore(const AnnotatedLine &Line, const FormatToken &Right) const { const FormatToken &Left = *Right.Previous; - if (Right.NewlinesBefore > 1 && Style.MaxEmptyLinesToKeep > 0) + if (Right.NewlinesBefore > 1 && Style.MaxEmptyLinesToKeep > 0 && + (!Style.RemoveEmptyLinesInUnwrappedLines || &Right == Line.First)) { return true; + } if (Style.BreakFunctionDefinitionParameters && Line.MightBeFunctionDecl && Line.mightBeFunctionDefinition() && Left.MightBeFunctionDeclParen && diff --git a/clang/lib/Format/UnwrappedLineParser.cpp b/clang/lib/Format/UnwrappedLineParser.cpp index c9625c39e527b4ac9670169a9c94164359b77ffc..bda9850670ab0653139a80138c90980163df73a9 100644 --- a/clang/lib/Format/UnwrappedLineParser.cpp +++ b/clang/lib/Format/UnwrappedLineParser.cpp @@ -2504,6 +2504,11 @@ bool UnwrappedLineParser::parseBracedList(bool IsAngleBracket, bool IsEnum) { // Assume there are no blocks inside a braced init list apart // from the ones we explicitly parse out (like lambdas). FormatTok->setBlockKind(BK_BracedInit); + if (!IsAngleBracket) { + auto *Prev = FormatTok->Previous; + if (Prev && Prev->is(tok::greater)) + Prev->setFinalizedType(TT_TemplateCloser); + } nextToken(); parseBracedList(); break; diff --git a/clang/lib/Headers/hlsl/hlsl_intrinsics.h b/clang/lib/Headers/hlsl/hlsl_intrinsics.h index 137467e5a782ce5fec487d7ba4a83c5db2f02971..30dce60b3ff7029d5c6c787150a535566e1405e1 100644 --- a/clang/lib/Headers/hlsl/hlsl_intrinsics.h +++ b/clang/lib/Headers/hlsl/hlsl_intrinsics.h @@ -2097,6 +2097,86 @@ _HLSL_AVAILABILITY(shadermodel, 6.0) _HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_is_first_lane) __attribute__((convergent)) bool WaveIsFirstLane(); +//===----------------------------------------------------------------------===// +// WaveReadLaneAt builtins +//===----------------------------------------------------------------------===// + +// \brief Returns the value of the expression for the given lane index within +// the specified wave. 
+ +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_read_lane_at) +__attribute__((convergent)) bool WaveReadLaneAt(bool, int32_t); +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_read_lane_at) +__attribute__((convergent)) bool2 WaveReadLaneAt(bool2, int32_t); +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_read_lane_at) +__attribute__((convergent)) bool3 WaveReadLaneAt(bool3, int32_t); +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_read_lane_at) +__attribute__((convergent)) bool4 WaveReadLaneAt(bool4, int32_t); + +#ifdef __HLSL_ENABLE_16_BIT +_HLSL_AVAILABILITY(shadermodel, 6.0) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_read_lane_at) +__attribute__((convergent)) int16_t WaveReadLaneAt(int16_t, int32_t); +_HLSL_AVAILABILITY(shadermodel, 6.0) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_read_lane_at) +__attribute__((convergent)) int16_t2 WaveReadLaneAt(int16_t2, int32_t); +_HLSL_AVAILABILITY(shadermodel, 6.0) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_read_lane_at) +__attribute__((convergent)) int16_t3 WaveReadLaneAt(int16_t3, int32_t); +_HLSL_AVAILABILITY(shadermodel, 6.0) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_read_lane_at) +__attribute__((convergent)) int16_t4 WaveReadLaneAt(int16_t4, int32_t); +#endif + +_HLSL_16BIT_AVAILABILITY(shadermodel, 6.0) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_read_lane_at) +__attribute__((convergent)) half WaveReadLaneAt(half, int32_t); +_HLSL_16BIT_AVAILABILITY(shadermodel, 6.0) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_read_lane_at) +__attribute__((convergent)) half2 WaveReadLaneAt(half2, int32_t); +_HLSL_16BIT_AVAILABILITY(shadermodel, 6.0) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_read_lane_at) +__attribute__((convergent)) half3 WaveReadLaneAt(half3, int32_t); +_HLSL_16BIT_AVAILABILITY(shadermodel, 6.0) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_read_lane_at) +__attribute__((convergent)) half4 WaveReadLaneAt(half4, int32_t); + +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_read_lane_at) +__attribute__((convergent)) int WaveReadLaneAt(int, int32_t); +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_read_lane_at) +__attribute__((convergent)) int2 WaveReadLaneAt(int2, int32_t); +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_read_lane_at) +__attribute__((convergent)) int3 WaveReadLaneAt(int3, int32_t); +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_read_lane_at) +__attribute__((convergent)) int4 WaveReadLaneAt(int4, int32_t); + +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_read_lane_at) +__attribute__((convergent)) float WaveReadLaneAt(float, int32_t); +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_read_lane_at) +__attribute__((convergent)) float2 WaveReadLaneAt(float2, int32_t); +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_read_lane_at) +__attribute__((convergent)) float3 WaveReadLaneAt(float3, int32_t); +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_read_lane_at) +__attribute__((convergent)) float4 WaveReadLaneAt(float4, int32_t); + +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_read_lane_at) +__attribute__((convergent)) int64_t WaveReadLaneAt(int64_t, int32_t); +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_read_lane_at) +__attribute__((convergent)) int64_t2 WaveReadLaneAt(int64_t2, int32_t); +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_read_lane_at) +__attribute__((convergent)) int64_t3 WaveReadLaneAt(int64_t3, int32_t); +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_read_lane_at) +__attribute__((convergent)) int64_t4 WaveReadLaneAt(int64_t4, int32_t); + +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_read_lane_at) +__attribute__((convergent)) double WaveReadLaneAt(double, int32_t); +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_read_lane_at) +__attribute__((convergent)) 
double2 WaveReadLaneAt(double2, int32_t); +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_read_lane_at) +__attribute__((convergent)) double3 WaveReadLaneAt(double3, int32_t); +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_read_lane_at) +__attribute__((convergent)) double4 WaveReadLaneAt(double4, int32_t); + //===----------------------------------------------------------------------===// // sign builtins //===----------------------------------------------------------------------===// diff --git a/clang/lib/Parse/ParseInit.cpp b/clang/lib/Parse/ParseInit.cpp index 0a9a359cdaf9790d1b6839e77067a96eb564e2ea..dd59cb23236d74b3fae9dd7411d2451a172bf3b7 100644 --- a/clang/lib/Parse/ParseInit.cpp +++ b/clang/lib/Parse/ParseInit.cpp @@ -436,9 +436,9 @@ ExprResult Parser::createEmbedExpr() { ASTContext &Context = Actions.getASTContext(); SourceLocation StartLoc = ConsumeAnnotationToken(); if (Data->BinaryData.size() == 1) { - Res = IntegerLiteral::Create(Context, - llvm::APInt(CHAR_BIT, Data->BinaryData.back()), - Context.UnsignedCharTy, StartLoc); + Res = IntegerLiteral::Create( + Context, llvm::APInt(CHAR_BIT, (unsigned char)Data->BinaryData.back()), + Context.UnsignedCharTy, StartLoc); } else { auto CreateStringLiteralFromStringRef = [&](StringRef Str, QualType Ty) { llvm::APSInt ArraySize = diff --git a/clang/lib/Parse/ParseStmt.cpp b/clang/lib/Parse/ParseStmt.cpp index 6480e88316a7d50af2b8bbe0537e3939f83f805b..60d647da48f0533a9eaf1f236b0ec34a1dd565b3 100644 --- a/clang/lib/Parse/ParseStmt.cpp +++ b/clang/lib/Parse/ParseStmt.cpp @@ -1518,10 +1518,13 @@ StmtResult Parser::ParseIfStatement(SourceLocation *TrailingElseLoc) { SourceLocation ConstevalLoc; if (Tok.is(tok::kw_constexpr)) { - Diag(Tok, getLangOpts().CPlusPlus17 ? diag::warn_cxx14_compat_constexpr_if - : diag::ext_constexpr_if); - IsConstexpr = true; - ConsumeToken(); + // C23 supports constexpr keyword, but only for object definitions. + if (getLangOpts().CPlusPlus) { + Diag(Tok, getLangOpts().CPlusPlus17 ? 
diag::warn_cxx14_compat_constexpr_if + : diag::ext_constexpr_if); + IsConstexpr = true; + ConsumeToken(); + } } else { if (Tok.is(tok::exclaim)) { NotLocation = ConsumeToken(); diff --git a/clang/lib/Sema/AnalysisBasedWarnings.cpp b/clang/lib/Sema/AnalysisBasedWarnings.cpp index 6496a33b8f5a50b5bdd3b010c8586bb8e9b841f2..c76733e9a774b683c275ccf1f038e119c9883c79 100644 --- a/clang/lib/Sema/AnalysisBasedWarnings.cpp +++ b/clang/lib/Sema/AnalysisBasedWarnings.cpp @@ -2279,9 +2279,18 @@ public: QualType srcType = ECE->getSubExpr()->getType(); const uint64_t sSize = Ctx.getTypeSize(srcType.getTypePtr()->getPointeeType()); + if (sSize >= dSize) return; + if (const auto *CE = dyn_cast( + ECE->getSubExpr()->IgnoreParens())) { + D = CE->getMethodDecl(); + } + + if (!D) + return; + MsgParam = 4; } Loc = Operation->getBeginLoc(); diff --git a/clang/lib/Sema/CheckExprLifetime.cpp b/clang/lib/Sema/CheckExprLifetime.cpp index 9b3894767d86291a4fd38dbf9df42267bc3ba344..8caeae5fcf9f8effb29db68bc5a1db13f807273f 100644 --- a/clang/lib/Sema/CheckExprLifetime.cpp +++ b/clang/lib/Sema/CheckExprLifetime.cpp @@ -198,6 +198,7 @@ struct IndirectLocalPathEntry { GslReferenceInit, GslPointerInit, GslPointerAssignment, + DefaultArg, } Kind; Expr *E; union { @@ -609,15 +610,22 @@ static void visitFunctionCallArguments(IndirectLocalPath &Path, Expr *Call, for (unsigned I = 0, N = std::min(Callee->getNumParams(), Args.size()); I != N; ++I) { + Expr *Arg = Args[I]; + RevertToOldSizeRAII RAII(Path); + if (auto *DAE = dyn_cast(Arg)) { + Path.push_back( + {IndirectLocalPathEntry::DefaultArg, DAE, DAE->getParam()}); + Arg = DAE->getExpr(); + } if (CheckCoroCall || Callee->getParamDecl(I)->hasAttr()) - VisitLifetimeBoundArg(Callee->getParamDecl(I), Args[I]); + VisitLifetimeBoundArg(Callee->getParamDecl(I), Arg); else if (EnableGSLAnalysis && I == 0) { // Perform GSL analysis for the first argument if (shouldTrackFirstArgument(Callee)) { - VisitGSLPointerArg(Callee, Args[0]); + VisitGSLPointerArg(Callee, Arg); } else if (auto *Ctor = dyn_cast(Call); Ctor && shouldTrackFirstArgumentForConstructor(Ctor)) { - VisitGSLPointerArg(Ctor->getConstructor(), Args[0]); + VisitGSLPointerArg(Ctor->getConstructor(), Arg); } } } @@ -1060,6 +1068,9 @@ static SourceRange nextPathEntryRange(const IndirectLocalPath &Path, unsigned I, if (!Path[I].Capture->capturesVariable()) continue; return Path[I].E->getSourceRange(); + + case IndirectLocalPathEntry::DefaultArg: + return cast(Path[I].E)->getUsedLocation(); } } return E->getSourceRange(); @@ -1370,7 +1381,7 @@ static void checkExprLifetimeImpl(Sema &SemaRef, break; } - case IndirectLocalPathEntry::LambdaCaptureInit: + case IndirectLocalPathEntry::LambdaCaptureInit: { if (!Elem.Capture->capturesVariable()) break; // FIXME: We can't easily tell apart an init-capture from a nested @@ -1383,6 +1394,16 @@ static void checkExprLifetimeImpl(Sema &SemaRef, << nextPathEntryRange(Path, I + 1, L); break; } + + case IndirectLocalPathEntry::DefaultArg: { + const auto *DAE = cast(Elem.E); + const ParmVarDecl *Param = DAE->getParam(); + SemaRef.Diag(Param->getDefaultArgRange().getBegin(), + diag::note_init_with_default_argument) + << Param << nextPathEntryRange(Path, I + 1, L); + break; + } + } } // We didn't lifetime-extend, so don't go any further; we don't need more diff --git a/clang/lib/Sema/HLSLExternalSemaSource.cpp b/clang/lib/Sema/HLSLExternalSemaSource.cpp index 2913d16fca48235ac892201806bb2ba8999c7002..5f51047b4d7b125244d11deb7cae541854d53bca 100644 --- a/clang/lib/Sema/HLSLExternalSemaSource.cpp +++ 
b/clang/lib/Sema/HLSLExternalSemaSource.cpp @@ -208,8 +208,6 @@ struct BuiltinTypeDeclBuilder { BuiltinTypeDeclBuilder &addArraySubscriptOperator(bool IsConst) { if (Record->isCompleteDefinition()) return *this; - assert(Fields.count("h") > 0 && - "Subscript operator must be added after the handle."); ASTContext &AST = Record->getASTContext(); QualType ElemTy = AST.Char8Ty; diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp index 5642c821f4ad7cb986f45eae8ee0d47501bdbe17..cb90263ae767298ffe2646210de78899361aee7a 100644 --- a/clang/lib/Sema/SemaDecl.cpp +++ b/clang/lib/Sema/SemaDecl.cpp @@ -7883,6 +7883,9 @@ NamedDecl *Sema::ActOnVariableDeclarator( // Handle attributes prior to checking for duplicates in MergeVarDecl ProcessDeclAttributes(S, NewVD, D); + if (getLangOpts().HLSL) + HLSL().ActOnVariableDeclarator(NewVD); + // FIXME: This is probably the wrong location to be doing this and we should // probably be doing this for more attributes (especially for function // pointer attributes such as format, warn_unused_result, etc.). Ideally diff --git a/clang/lib/Sema/SemaDeclCXX.cpp b/clang/lib/Sema/SemaDeclCXX.cpp index c7f2fe08bc1d3e9fdac56101ff4136db6a8341b2..7a9b653a96eb047cd74273ea941cc4f80b808bac 100644 --- a/clang/lib/Sema/SemaDeclCXX.cpp +++ b/clang/lib/Sema/SemaDeclCXX.cpp @@ -18283,7 +18283,7 @@ bool Sema::CheckOverridingFunctionReturnType(const CXXMethodDecl *New, } // The return types aren't either both pointers or references to a class type. - if (NewClassTy.isNull()) { + if (NewClassTy.isNull() || !NewClassTy->isStructureOrClassType()) { Diag(New->getLocation(), diag::err_different_return_type_for_overriding_virtual_function) << New->getDeclName() << NewTy << OldTy @@ -18348,9 +18348,9 @@ bool Sema::CheckOverridingFunctionReturnType(const CXXMethodDecl *New, // The new class type must have the same or less qualifiers as the old type. 
- if (NewClassTy.isMoreQualifiedThan(OldClassTy)) { + if (!OldClassTy.isAtLeastAsQualifiedAs(NewClassTy)) { Diag(New->getLocation(), - diag::err_covariant_return_type_class_type_more_qualified) + diag::err_covariant_return_type_class_type_not_same_or_less_qualified) << New->getDeclName() << NewTy << OldTy << New->getReturnTypeSourceRange(); Diag(Old->getLocation(), diag::note_overridden_virtual_function) diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp index 06a03ee11cc29022117b518254bb751f4010c1b5..f6071fba3742663a78047b793e3dba5a38c3ae7b 100644 --- a/clang/lib/Sema/SemaExpr.cpp +++ b/clang/lib/Sema/SemaExpr.cpp @@ -3598,9 +3598,10 @@ ExprResult Sema::ActOnCharacterConstant(const Token &Tok, Scope *UDLScope) { Lit, Tok.getLocation()); } -ExprResult Sema::ActOnIntegerConstant(SourceLocation Loc, uint64_t Val) { +ExprResult Sema::ActOnIntegerConstant(SourceLocation Loc, int64_t Val) { unsigned IntSize = Context.getTargetInfo().getIntWidth(); - return IntegerLiteral::Create(Context, llvm::APInt(IntSize, Val), + return IntegerLiteral::Create(Context, + llvm::APInt(IntSize, Val, /*isSigned=*/true), Context.IntTy, Loc); } diff --git a/clang/lib/Sema/SemaHLSL.cpp b/clang/lib/Sema/SemaHLSL.cpp index 137b15c8fcfe98729299ddce6b64ba126920b8ae..1d18a6308e2a509677e24e02578dd1c0a086483d 100644 --- a/clang/lib/Sema/SemaHLSL.cpp +++ b/clang/lib/Sema/SemaHLSL.cpp @@ -40,9 +40,7 @@ #include using namespace clang; -using llvm::dxil::ResourceClass; - -enum class RegisterType { SRV, UAV, CBuffer, Sampler, C, I, Invalid }; +using RegisterType = HLSLResourceBindingAttr::RegisterType; static RegisterType getRegisterType(ResourceClass RC) { switch (RC) { @@ -58,31 +56,87 @@ static RegisterType getRegisterType(ResourceClass RC) { llvm_unreachable("unexpected ResourceClass value"); } -static RegisterType getRegisterType(StringRef Slot) { +// Converts the first letter of string Slot to RegisterType. +// Returns false if the letter does not correspond to a valid register type. +static bool convertToRegisterType(StringRef Slot, RegisterType *RT) { + assert(RT != nullptr); switch (Slot[0]) { case 't': case 'T': - return RegisterType::SRV; + *RT = RegisterType::SRV; + return true; case 'u': case 'U': - return RegisterType::UAV; + *RT = RegisterType::UAV; + return true; case 'b': case 'B': - return RegisterType::CBuffer; + *RT = RegisterType::CBuffer; + return true; case 's': case 'S': - return RegisterType::Sampler; + *RT = RegisterType::Sampler; + return true; case 'c': case 'C': - return RegisterType::C; + *RT = RegisterType::C; + return true; case 'i': case 'I': - return RegisterType::I; + *RT = RegisterType::I; + return true; default: - return RegisterType::Invalid; + return false; } } +static ResourceClass getResourceClass(RegisterType RT) { + switch (RT) { + case RegisterType::SRV: + return ResourceClass::SRV; + case RegisterType::UAV: + return ResourceClass::UAV; + case RegisterType::CBuffer: + return ResourceClass::CBuffer; + case RegisterType::Sampler: + return ResourceClass::Sampler; + case RegisterType::C: + case RegisterType::I: + llvm_unreachable("unexpected RegisterType value"); + } +} + +DeclBindingInfo *ResourceBindings::addDeclBindingInfo(const VarDecl *VD, + ResourceClass ResClass) { + assert(getDeclBindingInfo(VD, ResClass) == nullptr && + "DeclBindingInfo already added"); + assert(!hasBindingInfoForDecl(VD) || BindingsList.back().Decl == VD); + // VarDecl may have multiple entries for different resource classes. 
+ // DeclToBindingListIndex stores the index of the first binding we saw + // for this decl. If there are any additional ones then that index + // shouldn't be updated. + DeclToBindingListIndex.try_emplace(VD, BindingsList.size()); + return &BindingsList.emplace_back(VD, ResClass); +} + +DeclBindingInfo *ResourceBindings::getDeclBindingInfo(const VarDecl *VD, + ResourceClass ResClass) { + auto Entry = DeclToBindingListIndex.find(VD); + if (Entry != DeclToBindingListIndex.end()) { + for (unsigned Index = Entry->getSecond(); + Index < BindingsList.size() && BindingsList[Index].Decl == VD; + ++Index) { + if (BindingsList[Index].ResClass == ResClass) + return &BindingsList[Index]; + } + } + return nullptr; +} + +bool ResourceBindings::hasBindingInfoForDecl(const VarDecl *VD) const { + return DeclToBindingListIndex.contains(VD); +} + SemaHLSL::SemaHLSL(Sema &S) : SemaBase(S) {} Decl *SemaHLSL::ActOnStartBuffer(Scope *BufferScope, bool CBuffer, @@ -985,88 +1039,70 @@ SemaHLSL::TakeLocForHLSLAttribute(const HLSLAttributedResourceType *RT) { return LocInfo; } -// get the record decl from a var decl that we expect -// represents a resource -static CXXRecordDecl *getRecordDeclFromVarDecl(VarDecl *VD) { - const Type *Ty = VD->getType()->getPointeeOrArrayElementType(); - assert(Ty && "Resource must have an element type."); - - if (Ty->isBuiltinType()) - return nullptr; - - CXXRecordDecl *TheRecordDecl = Ty->getAsCXXRecordDecl(); - assert(TheRecordDecl && "Resource should have a resource type declaration."); - return TheRecordDecl; -} - -static const HLSLAttributedResourceType * -findAttributedResourceTypeOnField(VarDecl *VD) { - assert(VD != nullptr && "expected VarDecl"); - if (RecordDecl *RD = getRecordDeclFromVarDecl(VD)) { - for (auto *FD : RD->fields()) { - if (const HLSLAttributedResourceType *AttrResType = - dyn_cast<HLSLAttributedResourceType>(FD->getType().getTypePtr())) - return AttrResType; +// Walks through the global variable declaration, collects all resource binding +// requirements and adds them to Bindings +void SemaHLSL::collectResourcesOnUserRecordDecl(const VarDecl *VD, + const RecordType *RT) { + const RecordDecl *RD = RT->getDecl(); + for (FieldDecl *FD : RD->fields()) { + const Type *Ty = FD->getType()->getUnqualifiedDesugaredType(); + + // Unwrap arrays + // FIXME: Calculate array size while unwrapping + assert(!Ty->isIncompleteArrayType() && + "incomplete arrays inside user defined types are not supported"); + while (Ty->isConstantArrayType()) { + const ConstantArrayType *CAT = cast<ConstantArrayType>(Ty); + Ty = CAT->getElementType()->getUnqualifiedDesugaredType(); } - } - return nullptr; -} - -// Iterate over RecordType fields and return true if any of them matched the -// register type -static bool ContainsResourceForRegisterType(Sema &S, const RecordType *RT, - RegisterType RegType) { - llvm::SmallVector<const Type *> TypesToScan; - TypesToScan.emplace_back(RT); - while (!TypesToScan.empty()) { - const Type *T = TypesToScan.pop_back_val(); - while (T->isArrayType()) - T = T->getArrayElementTypeNoTypeQual(); - if (T->isIntegralOrEnumerationType() || T->isFloatingType()) { - if (RegType == RegisterType::C) - return true; - } - const RecordType *RT = T->getAs<RecordType>(); - if (!RT) + if (!Ty->isRecordType()) continue; - const RecordDecl *RD = RT->getDecl(); - for (FieldDecl *FD : RD->fields()) { - const Type *FieldTy = FD->getType().getTypePtr(); - if (const HLSLAttributedResourceType *AttrResType = - dyn_cast<HLSLAttributedResourceType>(FieldTy)) { - ResourceClass RC = AttrResType->getAttrs().ResourceClass; - if (getRegisterType(RC) == RegType) - return true; - } else { -
TypesToScan.emplace_back(FD->getType().getTypePtr()); - } + if (const HLSLAttributedResourceType *AttrResType = + HLSLAttributedResourceType::findHandleTypeOnResource(Ty)) { + // Add a new DeclBindingInfo to Bindings if it does not already exist + ResourceClass RC = AttrResType->getAttrs().ResourceClass; + DeclBindingInfo *DBI = Bindings.getDeclBindingInfo(VD, RC); + if (!DBI) + Bindings.addDeclBindingInfo(VD, RC); + } else if (const RecordType *RT = dyn_cast<RecordType>(Ty)) { + // Recursively scan embedded struct or class; it would be nice to do this + // without recursion, but tricky to correctly calculate the size of the + // binding, which is something we are probably going to need to do later + // on. Hopefully nesting of structs in structs too many levels is + // unlikely. + collectResourcesOnUserRecordDecl(VD, RT); + } } - return false; } -static void CheckContainsResourceForRegisterType(Sema &S, - SourceLocation &ArgLoc, - Decl *D, RegisterType RegType, - bool SpecifiedSpace) { +// Diagnose localized register binding errors for a single binding; does not +// diagnose resource binding on user record types; that will be done later +// in processResourceBindingOnDecl based on the information collected in +// collectResourcesOnVarDecl. +// Returns false if the register binding is not valid. +static bool DiagnoseLocalRegisterBinding(Sema &S, SourceLocation &ArgLoc, + Decl *D, RegisterType RegType, + bool SpecifiedSpace) { int RegTypeNum = static_cast<int>(RegType); // check if the decl type is groupshared if (D->hasAttr<HLSLGroupSharedAddressSpaceAttr>()) { S.Diag(ArgLoc, diag::err_hlsl_binding_type_mismatch) << RegTypeNum; - return; + return false; } // Cbuffers and Tbuffers are HLSLBufferDecl types if (HLSLBufferDecl *CBufferOrTBuffer = dyn_cast<HLSLBufferDecl>(D)) { ResourceClass RC = CBufferOrTBuffer->isCBuffer() ?
ResourceClass::CBuffer : ResourceClass::SRV; - if (RegType != getRegisterType(RC)) - S.Diag(D->getLocation(), diag::err_hlsl_binding_type_mismatch) - << RegTypeNum; - return; + if (RegType == getRegisterType(RC)) + return true; + + S.Diag(D->getLocation(), diag::err_hlsl_binding_type_mismatch) + << RegTypeNum; + return false; } // Samplers, UAVs, and SRVs are VarDecl types @@ -1075,11 +1111,14 @@ static void CheckContainsResourceForRegisterType(Sema &S, // Resource if (const HLSLAttributedResourceType *AttrResType = - findAttributedResourceTypeOnField(VD)) { - if (RegType != getRegisterType(AttrResType->getAttrs().ResourceClass)) - S.Diag(D->getLocation(), diag::err_hlsl_binding_type_mismatch) - << RegTypeNum; - return; + HLSLAttributedResourceType::findHandleTypeOnResource( + VD->getType().getTypePtr())) { + if (RegType == getRegisterType(AttrResType->getAttrs().ResourceClass)) + return true; + + S.Diag(D->getLocation(), diag::err_hlsl_binding_type_mismatch) + << RegTypeNum; + return false; } const clang::Type *Ty = VD->getType().getTypePtr(); @@ -1105,51 +1144,44 @@ static void CheckContainsResourceForRegisterType(Sema &S, else S.Diag(ArgLoc, diag::err_hlsl_binding_type_mismatch) << RegTypeNum; } - } else if (Ty->isRecordType()) { - // Class/struct types - walk the declaration and check each field and - // subclass - if (!ContainsResourceForRegisterType(S, Ty->getAs<RecordType>(), RegType)) - S.Diag(D->getLocation(), diag::warn_hlsl_user_defined_type_missing_member) - << RegTypeNum; - } else { - // Anything else is an error - S.Diag(ArgLoc, diag::err_hlsl_binding_type_mismatch) << RegTypeNum; + return false; } + if (Ty->isRecordType()) + // RecordTypes will be diagnosed in processResourceBindingOnDecl + // that is called from ActOnVariableDeclarator + return true; + + // Anything else is an error + S.Diag(ArgLoc, diag::err_hlsl_binding_type_mismatch) << RegTypeNum; + return false; } -static void ValidateMultipleRegisterAnnotations(Sema &S, Decl *TheDecl, +static bool ValidateMultipleRegisterAnnotations(Sema &S, Decl *TheDecl, RegisterType regType) { // make sure that there are no two register annotations // applied to the decl with the same register type bool RegisterTypesDetected[5] = {false}; - RegisterTypesDetected[static_cast<int>(regType)] = true; - // we need a static map to keep track of previous conflicts - // so that we don't emit the same error multiple times - static std::map<Decl *, std::set<RegisterType>> PreviousConflicts; - for (auto it = TheDecl->attr_begin(); it != TheDecl->attr_end(); ++it) { if (HLSLResourceBindingAttr *attr = dyn_cast<HLSLResourceBindingAttr>(*it)) { - RegisterType otherRegType = getRegisterType(attr->getSlot()); + RegisterType otherRegType = attr->getRegisterType(); if (RegisterTypesDetected[static_cast<int>(otherRegType)]) { - if (PreviousConflicts[TheDecl].count(otherRegType)) - continue; int otherRegTypeNum = static_cast<int>(otherRegType); S.Diag(TheDecl->getLocation(), diag::err_hlsl_duplicate_register_annotation) << otherRegTypeNum; - PreviousConflicts[TheDecl].insert(otherRegType); - } else { - RegisterTypesDetected[static_cast<int>(otherRegType)] = true; + return false; } + RegisterTypesDetected[static_cast<int>(otherRegType)] = true; } } + return true; } -static void DiagnoseHLSLRegisterAttribute(Sema &S, SourceLocation &ArgLoc, +static bool DiagnoseHLSLRegisterAttribute(Sema &S, SourceLocation &ArgLoc, Decl *D, RegisterType RegType, bool SpecifiedSpace) { @@ -1159,10 +1191,11 @@ static void DiagnoseHLSLRegisterAttribute(Sema &S, SourceLocation &ArgLoc, "expecting VarDecl or HLSLBufferDecl"); // check if the declaration contains
resource matching the register type - CheckContainsResourceForRegisterType(S, ArgLoc, D, RegType, SpecifiedSpace); + if (!DiagnoseLocalRegisterBinding(S, ArgLoc, D, RegType, SpecifiedSpace)) + return false; // next, if multiple register annotations exist, check that none conflict. - ValidateMultipleRegisterAnnotations(S, D, RegType); + return ValidateMultipleRegisterAnnotations(S, D, RegType); } void SemaHLSL::handleResourceBindingAttr(Decl *TheDecl, const ParsedAttr &AL) { @@ -1203,23 +1236,23 @@ void SemaHLSL::handleResourceBindingAttr(Decl *TheDecl, const ParsedAttr &AL) { Slot = Str; } - RegisterType regType; + RegisterType RegType; + unsigned SlotNum = 0; + unsigned SpaceNum = 0; // Validate. if (!Slot.empty()) { - regType = getRegisterType(Slot); - if (regType == RegisterType::I) { - Diag(ArgLoc, diag::warn_hlsl_deprecated_register_type_i); + if (!convertToRegisterType(Slot, &RegType)) { + Diag(ArgLoc, diag::err_hlsl_binding_type_invalid) << Slot.substr(0, 1); return; } - if (regType == RegisterType::Invalid) { - Diag(ArgLoc, diag::err_hlsl_binding_type_invalid) << Slot.substr(0, 1); + if (RegType == RegisterType::I) { + Diag(ArgLoc, diag::warn_hlsl_deprecated_register_type_i); return; } - StringRef SlotNum = Slot.substr(1); - unsigned Num = 0; - if (SlotNum.getAsInteger(10, Num)) { + StringRef SlotNumStr = Slot.substr(1); + if (SlotNumStr.getAsInteger(10, SlotNum)) { Diag(ArgLoc, diag::err_hlsl_unsupported_register_number); return; } @@ -1229,20 +1262,22 @@ void SemaHLSL::handleResourceBindingAttr(Decl *TheDecl, const ParsedAttr &AL) { Diag(SpaceArgLoc, diag::err_hlsl_expected_space) << Space; return; } - StringRef SpaceNum = Space.substr(5); - unsigned Num = 0; - if (SpaceNum.getAsInteger(10, Num)) { + StringRef SpaceNumStr = Space.substr(5); + if (SpaceNumStr.getAsInteger(10, SpaceNum)) { Diag(SpaceArgLoc, diag::err_hlsl_expected_space) << Space; return; } - DiagnoseHLSLRegisterAttribute(SemaRef, ArgLoc, TheDecl, regType, - SpecifiedSpace); + if (!DiagnoseHLSLRegisterAttribute(SemaRef, ArgLoc, TheDecl, RegType, + SpecifiedSpace)) + return; HLSLResourceBindingAttr *NewAttr = HLSLResourceBindingAttr::Create(getASTContext(), Slot, Space, AL); - if (NewAttr) + if (NewAttr) { + NewAttr->setBinding(RegType, SlotNum, SpaceNum); TheDecl->addAttr(NewAttr); + } } void SemaHLSL::handleParamModifierAttr(Decl *D, const ParsedAttr &AL) { @@ -1751,6 +1786,22 @@ static bool CheckScalarOrVector(Sema *S, CallExpr *TheCall, QualType Scalar, return false; } +static bool CheckAnyScalarOrVector(Sema *S, CallExpr *TheCall, + unsigned ArgIndex) { + assert(TheCall->getNumArgs() >= ArgIndex); + QualType ArgType = TheCall->getArg(ArgIndex)->getType(); + auto *VTy = ArgType->getAs<VectorType>(); + // not a scalar or vector + if (!(ArgType->isScalarType() || + (VTy && VTy->getElementType()->isScalarType()))) { + S->Diag(TheCall->getArg(0)->getBeginLoc(), + diag::err_typecheck_expect_any_scalar_or_vector) + << ArgType; + return true; + } + return false; +} + static bool CheckBoolSelect(Sema *S, CallExpr *TheCall) { assert(TheCall->getNumArgs() == 3); Expr *Arg1 = TheCall->getArg(1); @@ -1993,6 +2044,29 @@ bool SemaHLSL::CheckBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall) { return true; break; } + case Builtin::BI__builtin_hlsl_wave_read_lane_at: { + if (SemaRef.checkArgCount(TheCall, 2)) + return true; + + // Ensure index parameter type can be interpreted as a uint + ExprResult Index = TheCall->getArg(1); + QualType ArgTyIndex = Index.get()->getType(); + if (!ArgTyIndex->isIntegerType()) { +
SemaRef.Diag(TheCall->getArg(1)->getBeginLoc(), + diag::err_typecheck_convert_incompatible) + << ArgTyIndex << SemaRef.Context.UnsignedIntTy << 1 << 0 << 0; + return true; + } + + // Ensure input expr type is a scalar/vector and the same as the return type + if (CheckAnyScalarOrVector(&SemaRef, TheCall, 0)) + return true; + + ExprResult Expr = TheCall->getArg(0); + QualType ArgTyExpr = Expr.get()->getType(); + TheCall->setType(ArgTyExpr); + break; + } case Builtin::BI__builtin_hlsl_wave_get_lane_index: { if (SemaRef.checkArgCount(TheCall, 0)) return true; @@ -2050,6 +2124,7 @@ bool SemaHLSL::IsIntangibleType(clang::QualType QT) { CXXRecordDecl *RD = RT->getAsCXXRecordDecl(); assert(RD != nullptr && "all HLSL struct and classes should be CXXRecordDecl"); + assert(RD->isCompleteDefinition() && "expecting complete type"); return RD->isHLSLIntangible(); } @@ -2235,3 +2310,95 @@ QualType SemaHLSL::getInoutParameterType(QualType Ty) { Ty.addRestrict(); return Ty; } + +void SemaHLSL::ActOnVariableDeclarator(VarDecl *VD) { + if (VD->hasGlobalStorage()) { + // make sure the declaration has a complete type + if (SemaRef.RequireCompleteType( + VD->getLocation(), + SemaRef.getASTContext().getBaseElementType(VD->getType()), + diag::err_typecheck_decl_incomplete_type)) { + VD->setInvalidDecl(); + return; + } + + // find all resources on decl + if (IsIntangibleType(VD->getType())) + collectResourcesOnVarDecl(VD); + + // process explicit bindings + processExplicitBindingsOnDecl(VD); + } +} + +// Walks through the global variable declaration, collects all resource binding +// requirements and adds them to Bindings +void SemaHLSL::collectResourcesOnVarDecl(VarDecl *VD) { + assert(VD->hasGlobalStorage() && IsIntangibleType(VD->getType()) && + "expected global variable that contains HLSL resource"); + + // Cbuffers and Tbuffers are HLSLBufferDecl types + if (const HLSLBufferDecl *CBufferOrTBuffer = dyn_cast<HLSLBufferDecl>(VD)) { + Bindings.addDeclBindingInfo(VD, CBufferOrTBuffer->isCBuffer() + ?
ResourceClass::CBuffer + : ResourceClass::SRV); + return; + } + + // Unwrap arrays + // FIXME: Calculate array size while unwrapping + const Type *Ty = VD->getType()->getUnqualifiedDesugaredType(); + while (Ty->isConstantArrayType()) { + const ConstantArrayType *CAT = cast<ConstantArrayType>(Ty); + Ty = CAT->getElementType()->getUnqualifiedDesugaredType(); + } + + // Resource (or array of resources) + if (const HLSLAttributedResourceType *AttrResType = + HLSLAttributedResourceType::findHandleTypeOnResource(Ty)) { + Bindings.addDeclBindingInfo(VD, AttrResType->getAttrs().ResourceClass); + return; + } + + // User defined record type + if (const RecordType *RT = dyn_cast<RecordType>(Ty)) + collectResourcesOnUserRecordDecl(VD, RT); +} + +// Walks through the explicit resource binding attributes on the declaration, +// and makes sure there is a resource that matches the binding and updates +// DeclBindingInfoLists +void SemaHLSL::processExplicitBindingsOnDecl(VarDecl *VD) { + assert(VD->hasGlobalStorage() && "expected global variable"); + + for (Attr *A : VD->attrs()) { + HLSLResourceBindingAttr *RBA = dyn_cast<HLSLResourceBindingAttr>(A); + if (!RBA) + continue; + + RegisterType RT = RBA->getRegisterType(); + assert(RT != RegisterType::I && "invalid or obsolete register type should " + "never have an attribute created"); + + if (RT == RegisterType::C) { + if (Bindings.hasBindingInfoForDecl(VD)) + SemaRef.Diag(VD->getLocation(), + diag::warn_hlsl_user_defined_type_missing_member) + << static_cast<int>(RT); + continue; + } + + // Find DeclBindingInfo for this binding and update it, or report error + // if it does not exist (user type does not contain resources with the + // expected resource class). + ResourceClass RC = getResourceClass(RT); + if (DeclBindingInfo *BI = Bindings.getDeclBindingInfo(VD, RC)) { + // update binding info + BI->setBindingAttribute(RBA, BindingType::Explicit); + } else { + SemaRef.Diag(VD->getLocation(), + diag::warn_hlsl_user_defined_type_missing_member) + << static_cast<int>(RT); + } + } +} diff --git a/clang/lib/Sema/SemaLookup.cpp b/clang/lib/Sema/SemaLookup.cpp index f3f62474d06441585d8928d67142a427239b759f..e5db11369221a4556eab2819515c542304d52f4e 100644 --- a/clang/lib/Sema/SemaLookup.cpp +++ b/clang/lib/Sema/SemaLookup.cpp @@ -3215,6 +3215,9 @@ addAssociatedClassesAndNamespaces(AssociatedLookup &Result, QualType Ty) { // Array parameter types are treated as fundamental types.
case Type::ArrayParameter: break; + + case Type::HLSLAttributedResource: + T = cast<HLSLAttributedResourceType>(T)->getWrappedType().getTypePtr(); } if (Queue.empty()) diff --git a/clang/lib/Sema/SemaOpenACC.cpp b/clang/lib/Sema/SemaOpenACC.cpp index 22aedbc70df8cceb2b19c718169f2a0e03532790..d33b0d0c1c30186e0f0f0f9b8ebd3e7416657bed 100644 --- a/clang/lib/Sema/SemaOpenACC.cpp +++ b/clang/lib/Sema/SemaOpenACC.cpp @@ -2216,7 +2216,7 @@ ExprResult SemaOpenACC::CheckGangExpr(OpenACCGangKind GK, Expr *E) { case OpenACCGangKind::Static: return CheckGangStaticExpr(*this, E); } - } + } break; default: llvm_unreachable("Non compute construct in active compute construct?"); } diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp index 11ea4f3a8b1697b492432a29d9cc27cf6c443258..6f42a4344389345e6d758bfcded769b66caa2fe3 100644 --- a/clang/lib/Sema/SemaOpenMP.cpp +++ b/clang/lib/Sema/SemaOpenMP.cpp @@ -5697,7 +5697,9 @@ StmtResult SemaOpenMP::ActOnOpenMPCanonicalLoop(Stmt *AStmt) { llvm_unreachable("unhandled unary increment operator"); } Step = IntegerLiteral::Create( - Ctx, llvm::APInt(Ctx.getIntWidth(LogicalTy), Direction), LogicalTy, {}); + Ctx, + llvm::APInt(Ctx.getIntWidth(LogicalTy), Direction, /*isSigned=*/true), + LogicalTy, {}); } else if (auto *IncBin = dyn_cast<BinaryOperator>(Inc)) { if (IncBin->getOpcode() == BO_AddAssign) { Step = IncBin->getRHS(); diff --git a/clang/lib/Sema/SemaOverload.cpp b/clang/lib/Sema/SemaOverload.cpp index 1205e85b4e6f531a01af00a3ea4ba89f263f6e9b..7b86299561a3653592aca9f6512d15cea5736012 100644 --- a/clang/lib/Sema/SemaOverload.cpp +++ b/clang/lib/Sema/SemaOverload.cpp @@ -1798,6 +1798,23 @@ TryImplicitConversion(Sema &S, Expr *From, QualType ToType, return ICS; } + if (S.getLangOpts().HLSL && ToType->isHLSLAttributedResourceType() && + FromType->isHLSLAttributedResourceType()) { + auto *ToResType = cast<HLSLAttributedResourceType>(ToType); + auto *FromResType = cast<HLSLAttributedResourceType>(FromType); + if (S.Context.hasSameUnqualifiedType(ToResType->getWrappedType(), + FromResType->getWrappedType()) && + S.Context.hasSameUnqualifiedType(ToResType->getContainedType(), + FromResType->getContainedType()) && + ToResType->getAttrs() == FromResType->getAttrs()) { + ICS.setStandard(); + ICS.Standard.setAsIdentityConversion(); + ICS.Standard.setFromType(FromType); + ICS.Standard.setAllToTypes(ToType); + return ICS; + } + } + return TryUserDefinedConversion(S, From, ToType, SuppressUserConversions, AllowExplicit, InOverloadResolution, CStyle, AllowObjCWritebackConversion, diff --git a/clang/lib/Sema/SemaTemplate.cpp b/clang/lib/Sema/SemaTemplate.cpp index 294eb8e3353cde75d39b0a9c3e32a1c9357b9d0a..62f13610b5285c2bf418b7cd7474785547286046 100644 --- a/clang/lib/Sema/SemaTemplate.cpp +++ b/clang/lib/Sema/SemaTemplate.cpp @@ -6074,6 +6074,13 @@ bool UnnamedLocalNoLinkageFinder::VisitNestedNameSpecifier( llvm_unreachable("Invalid NestedNameSpecifier::Kind!"); } +bool UnnamedLocalNoLinkageFinder::VisitHLSLAttributedResourceType( + const HLSLAttributedResourceType *T) { + if (T->hasContainedType() && Visit(T->getContainedType())) + return true; + return Visit(T->getWrappedType()); +} + bool Sema::CheckTemplateArgument(TypeSourceInfo *ArgInfo) { assert(ArgInfo && "invalid TypeSourceInfo"); QualType Arg = ArgInfo->getType(); diff --git a/clang/lib/Sema/SemaTemplateDeduction.cpp b/clang/lib/Sema/SemaTemplateDeduction.cpp index 7cfb8d687c796a92fe117352d597c0c53515b774..db1d7fa237131a8441189a8091011f26f6d2ce5b 100644 --- a/clang/lib/Sema/SemaTemplateDeduction.cpp +++ b/clang/lib/Sema/SemaTemplateDeduction.cpp @@ -2449,6 +2449,7 @@ static
TemplateDeductionResult DeduceTemplateArgumentsByTypeMatch( case Type::PackExpansion: case Type::Pipe: case Type::ArrayParameter: + case Type::HLSLAttributedResource: // No template argument deduction for these types return TemplateDeductionResult::Success; @@ -6844,6 +6845,16 @@ MarkUsedTemplateParameters(ASTContext &Ctx, QualType T, OnlyDeduced, Depth, Used); break; + case Type::HLSLAttributedResource: + MarkUsedTemplateParameters( + Ctx, cast<HLSLAttributedResourceType>(T)->getWrappedType(), OnlyDeduced, + Depth, Used); + if (cast<HLSLAttributedResourceType>(T)->hasContainedType()) + MarkUsedTemplateParameters( + Ctx, cast<HLSLAttributedResourceType>(T)->getContainedType(), + OnlyDeduced, Depth, Used); + break; + // None of these types have any template parameters in them. case Type::Builtin: case Type::VariableArray: diff --git a/clang/lib/Sema/SemaTemplateInstantiate.cpp b/clang/lib/Sema/SemaTemplateInstantiate.cpp index 8c7f694c09042e6b1f515638d1fa1a3e2500b0e2..8665c099903dc3df85a47691a51f1bd93755f195 100644 --- a/clang/lib/Sema/SemaTemplateInstantiate.cpp +++ b/clang/lib/Sema/SemaTemplateInstantiate.cpp @@ -237,7 +237,7 @@ struct TemplateInstantiationArgumentCollecter if (Innermost) AddInnermostTemplateArguments(VTPSD); else if (ForConstraintInstantiation) - AddOuterTemplateArguments(VTPSD, VTPSD->getTemplateArgs().asArray(), + AddOuterTemplateArguments(VTPSD, VTPSD->getInjectedTemplateArgs(), /*Final=*/false); if (VTPSD->isMemberSpecialization()) @@ -274,7 +274,7 @@ struct TemplateInstantiationArgumentCollecter if (Innermost) AddInnermostTemplateArguments(CTPSD); else if (ForConstraintInstantiation) - AddOuterTemplateArguments(CTPSD, CTPSD->getTemplateArgs().asArray(), + AddOuterTemplateArguments(CTPSD, CTPSD->getInjectedTemplateArgs(), /*Final=*/false); if (CTPSD->isMemberSpecialization()) diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp b/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp index 2298fe39850de5e517adafcfa6a2dac473b67dfd..e043806eadd6ac7dd69a952a658d702c5542d28b 100644 --- a/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp @@ -273,16 +273,29 @@ class TrivialFunctionAnalysisVisitor return true; } - template <typename CheckFunction> - bool WithCachedResult(const Stmt *S, CheckFunction Function) { - // If the statement isn't in the cache, conservatively assume that - // it's not trivial until analysis completes. Insert false to the cache - // first to avoid infinite recursion. - auto [It, IsNew] = Cache.insert(std::make_pair(S, false)); + template <typename StmtOrDecl, typename CheckFunction> + bool WithCachedResult(const StmtOrDecl *S, CheckFunction Function) { + auto CacheIt = Cache.find(S); + if (CacheIt != Cache.end()) + return CacheIt->second; + + // Treat a recursive statement to be trivial until proven otherwise. + auto [RecursiveIt, IsNew] = RecursiveFn.insert(std::make_pair(S, true)); if (!IsNew) - return It->second; + return RecursiveIt->second; + bool Result = Function(); + + if (!Result) { + for (auto &It : RecursiveFn) + It.second = false; + } + RecursiveIt = RecursiveFn.find(S); + assert(RecursiveIt != RecursiveFn.end()); + Result = RecursiveIt->second; + RecursiveFn.erase(RecursiveIt); Cache[S] = Result; + return Result; } @@ -292,16 +305,7 @@ public: TrivialFunctionAnalysisVisitor(CacheTy &Cache) : Cache(Cache) {} bool IsFunctionTrivial(const Decl *D) { - auto CacheIt = Cache.find(D); - if (CacheIt != Cache.end()) - return CacheIt->second; - - // Treat a recursive function call to be trivial until proven otherwise.
- auto [RecursiveIt, IsNew] = RecursiveFn.insert(std::make_pair(D, true)); - if (!IsNew) - return RecursiveIt->second; - - bool Result = [&]() { + return WithCachedResult(D, [&]() { if (auto *CtorDecl = dyn_cast<CXXConstructorDecl>(D)) { for (auto *CtorInit : CtorDecl->inits()) { if (!Visit(CtorInit->getInit())) @@ -312,20 +316,7 @@ public: if (!Body) return false; return Visit(Body); - }(); - - if (!Result) { - // D and its mutually recursive callers are all non-trivial. - for (auto &It : RecursiveFn) - It.second = false; - } - RecursiveIt = RecursiveFn.find(D); - assert(RecursiveIt != RecursiveFn.end()); - Result = RecursiveIt->second; - RecursiveFn.erase(RecursiveIt); - Cache[D] = Result; - - return Result; + }); } bool VisitStmt(const Stmt *S) { @@ -586,11 +577,6 @@ bool TrivialFunctionAnalysis::isTrivialImpl( bool TrivialFunctionAnalysis::isTrivialImpl( const Stmt *S, TrivialFunctionAnalysis::CacheTy &Cache) { - // If the statement isn't in the cache, conservatively assume that - // it's not trivial until analysis completes. Unlike a function case, - // we don't insert an entry into the cache until Visit returns - // since Visit* functions themselves make use of the cache. - TrivialFunctionAnalysisVisitor V(Cache); bool Result = V.Visit(S); assert(Cache.contains(S) && "Top-level statement not properly cached!"); diff --git a/clang/lib/StaticAnalyzer/Core/BugReporterVisitors.cpp b/clang/lib/StaticAnalyzer/Core/BugReporterVisitors.cpp index 68c8a8dc6825076bf78e3e74f05ecb2fb4c065ff..c4479db14b791d173e6de9a1804cca01f3037f34 100644 --- a/clang/lib/StaticAnalyzer/Core/BugReporterVisitors.cpp +++ b/clang/lib/StaticAnalyzer/Core/BugReporterVisitors.cpp @@ -2695,7 +2695,7 @@ ConditionBRVisitor::VisitNodeImpl(const ExplodedNode *N, PathSensitiveBugReport &BR) { ProgramPoint ProgPoint = N->getLocation(); const std::pair<const ProgramPointTag *, const ProgramPointTag *> &Tags = - ExprEngine::geteagerlyAssumeBinOpBifurcationTags(); + ExprEngine::getEagerlyAssumeBifurcationTags(); // If an assumption was made on a branch, it should be caught // here by looking at the state transition.
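The PtrTypesSemantics change above generalizes the recursion-aware memoization that IsFunctionTrivial previously applied only to declarations, so statements now go through the same WithCachedResult helper. A minimal standalone sketch of that caching pattern — hypothetical names and key type, not the checker's actual API — might look like this:

#include <cassert>
#include <functional>
#include <map>

// Sketch of the assumed pattern: while a key is being evaluated it sits in
// Pending with an optimistic "true"; re-entering the same key during its own
// evaluation returns that entry instead of recursing forever. A failed
// evaluation poisons every in-flight entry, since all mutually recursive
// callers are then non-trivial as well.
class TrivialityCache {
  std::map<const void *, bool> Cache;   // finished results
  std::map<const void *, bool> Pending; // results still being computed
public:
  bool withCachedResult(const void *Key, const std::function<bool()> &Eval) {
    if (auto It = Cache.find(Key); It != Cache.end())
      return It->second;
    auto [PendingIt, IsNew] = Pending.insert({Key, true});
    if (!IsNew) // recursive re-entry: report the optimistic assumption
      return PendingIt->second;
    bool Result = Eval();
    if (!Result)
      for (auto &Entry : Pending)
        Entry.second = false; // poison all in-flight evaluations
    PendingIt = Pending.find(Key);
    assert(PendingIt != Pending.end());
    Result = PendingIt->second;
    Pending.erase(PendingIt);
    Cache[Key] = Result;
    return Result;
  }
};

Under those assumptions, a function-level entry point reduces to withCachedResult(D, [&] { /* visit the body */ return true; }), and the statement-level entry point can share the exact same helper, which is what the refactor accomplishes.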
diff --git a/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp b/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp index 24c5a01c9759b44c44ce1aadb3913d1ad566c982..5ff692ef562d693d320f3d89f12e0cc70a919e76 100644 --- a/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp +++ b/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp @@ -2137,7 +2137,7 @@ void ExprEngine::Visit(const Stmt *S, ExplodedNode *Pred, (B->isRelationalOp() || B->isEqualityOp())) { ExplodedNodeSet Tmp; VisitBinaryOperator(cast<BinaryOperator>(S), Pred, Tmp); - evalEagerlyAssumeBinOpBifurcation(Dst, Tmp, cast<BinaryOperator>(S)); + evalEagerlyAssumeBifurcation(Dst, Tmp, cast<BinaryOperator>(S)); } else VisitBinaryOperator(cast<BinaryOperator>(S), Pred, Dst); @@ -2410,7 +2410,7 @@ void ExprEngine::Visit(const Stmt *S, ExplodedNode *Pred, if (AMgr.options.ShouldEagerlyAssume && (U->getOpcode() == UO_LNot)) { ExplodedNodeSet Tmp; VisitUnaryOperator(U, Pred, Tmp); - evalEagerlyAssumeBinOpBifurcation(Dst, Tmp, U); + evalEagerlyAssumeBifurcation(Dst, Tmp, U); } else VisitUnaryOperator(U, Pred, Dst); @@ -3750,23 +3750,20 @@ void ExprEngine::evalLocation(ExplodedNodeSet &Dst, BldrTop.addNodes(Tmp); } -std::pair<const ProgramPointTag *, const ProgramPointTag *> -ExprEngine::geteagerlyAssumeBinOpBifurcationTags() { - static SimpleProgramPointTag - eagerlyAssumeBinOpBifurcationTrue(TagProviderName, - "Eagerly Assume True"), - eagerlyAssumeBinOpBifurcationFalse(TagProviderName, - "Eagerly Assume False"); - return std::make_pair(&eagerlyAssumeBinOpBifurcationTrue, - &eagerlyAssumeBinOpBifurcationFalse); +std::pair<const ProgramPointTag *, const ProgramPointTag *> +ExprEngine::getEagerlyAssumeBifurcationTags() { + static SimpleProgramPointTag TrueTag(TagProviderName, "Eagerly Assume True"), + FalseTag(TagProviderName, "Eagerly Assume False"); + + return std::make_pair(&TrueTag, &FalseTag); } -void ExprEngine::evalEagerlyAssumeBinOpBifurcation(ExplodedNodeSet &Dst, - ExplodedNodeSet &Src, - const Expr *Ex) { +void ExprEngine::evalEagerlyAssumeBifurcation(ExplodedNodeSet &Dst, + ExplodedNodeSet &Src, + const Expr *Ex) { StmtNodeBuilder Bldr(Src, Dst, *currBldrCtx); - for (const auto Pred : Src) { + for (ExplodedNode *Pred : Src) { // Test if the previous node was as the same expression. This can happen // when the expression fails to evaluate to anything meaningful and // (as an optimization) we don't generate a node. @@ -3775,28 +3772,26 @@ void ExprEngine::evalEagerlyAssumeBinOpBifurcation(ExplodedNodeSet &Dst, continue; } - ProgramStateRef state = Pred->getState(); - SVal V = state->getSVal(Ex, Pred->getLocationContext()); + ProgramStateRef State = Pred->getState(); + SVal V = State->getSVal(Ex, Pred->getLocationContext()); std::optional<nonloc::SymbolVal> SEV = V.getAs<nonloc::SymbolVal>(); if (SEV && SEV->isExpression()) { - const std::pair<const ProgramPointTag *, const ProgramPointTag *> &tags = - geteagerlyAssumeBinOpBifurcationTags(); + const auto &[TrueTag, FalseTag] = getEagerlyAssumeBifurcationTags(); - ProgramStateRef StateTrue, StateFalse; - std::tie(StateTrue, StateFalse) = state->assume(*SEV); + auto [StateTrue, StateFalse] = State->assume(*SEV); // First assume that the condition is true. if (StateTrue) { SVal Val = svalBuilder.makeIntVal(1U, Ex->getType()); StateTrue = StateTrue->BindExpr(Ex, Pred->getLocationContext(), Val); - Bldr.generateNode(Ex, Pred, StateTrue, tags.first); + Bldr.generateNode(Ex, Pred, StateTrue, TrueTag); } // Next, assume that the condition is false.
if (StateFalse) { SVal Val = svalBuilder.makeIntVal(0U, Ex->getType()); StateFalse = StateFalse->BindExpr(Ex, Pred->getLocationContext(), Val); - Bldr.generateNode(Ex, Pred, StateFalse, tags.second); + Bldr.generateNode(Ex, Pred, StateFalse, FalseTag); } } } diff --git a/clang/test/.gitattributes b/clang/test/.gitattributes new file mode 100644 index 0000000000000000000000000000000000000000..160fc6cf561751d5f02e0e70f635b41459a1df80 --- /dev/null +++ b/clang/test/.gitattributes @@ -0,0 +1,4 @@ +FixIt/fixit-newline-style.c text eol=crlf +Frontend/system-header-line-directive-ms-lineendings.c text eol=crlf +Frontend/rewrite-includes-mixed-eol-crlf.* text eol=crlf +Frontend/rewrite-includes-mixed-eol-lf.h text eol=lf diff --git a/clang/test/AST/ByteCode/placement-new.cpp b/clang/test/AST/ByteCode/placement-new.cpp index caf3ac97fd1c045e217861bc010f6f25e988fc2a..8e6d802e93295c3747a800102787b4b3ff58729e 100644 --- a/clang/test/AST/ByteCode/placement-new.cpp +++ b/clang/test/AST/ByteCode/placement-new.cpp @@ -52,6 +52,20 @@ consteval auto ok4() { } static_assert(ok4() == 37); +consteval int ok5() { + int i; + new (&i) int[1]{1}; + + struct S { + int a; int b; + } s; + new (&s) S[1]{{12, 13}}; + + return 25; + // return s.a + s.b; FIXME: Broken in the current interpreter. +} +static_assert(ok5() == 25); + /// FIXME: Broken in both interpreters. #if 0 consteval int ok5() { @@ -286,3 +300,14 @@ namespace UsedToCrash { } int alloc1 = (alloc(), 0); } + +constexpr bool change_union_member() { + union U { + int a; + int b; + }; + U u = {.a = 1}; + std::construct_at(&u.b, 2); + return u.b == 2; +} +static_assert(change_union_member()); diff --git a/clang/test/AST/HLSL/RWBuffer-AST.hlsl b/clang/test/AST/HLSL/RWBuffer-AST.hlsl index 55c0dfa2eaa533707fc5bef70aa66f21612f8643..e6ce73dbd962f7b2636a47ed1b984fb4e6445023 100644 --- a/clang/test/AST/HLSL/RWBuffer-AST.hlsl +++ b/clang/test/AST/HLSL/RWBuffer-AST.hlsl @@ -32,7 +32,6 @@ RWBuffer<float> Buffer; // CHECK-NEXT: FieldDecl 0x{{[0-9A-Fa-f]+}} <> implicit h '__hlsl_resource_t // CHECK-SAME{LITERAL}: [[hlsl::resource_class(UAV)]] // CHECK-SAME{LITERAL}: [[hlsl::contained_type(element_type)]] -// CHECK-SAME: ':'__hlsl_resource_t' // CHECK-NEXT: HLSLResourceAttr 0x{{[0-9A-Fa-f]+}} <> Implicit TypedBuffer // CHECK: CXXMethodDecl 0x{{[0-9A-Fa-f]+}} <> operator[] 'element_type &const (unsigned int) const' @@ -59,5 +58,4 @@ RWBuffer<float> Buffer; // CHECK-NEXT: FieldDecl 0x{{[0-9A-Fa-f]+}} <> implicit h '__hlsl_resource_t // CHECK-SAME{LITERAL}: [[hlsl::resource_class(UAV)]] // CHECK-SAME{LITERAL}: [[hlsl::contained_type(float)]] -// CHECK-SAME: ':'__hlsl_resource_t' // CHECK-NEXT: HLSLResourceAttr 0x{{[0-9A-Fa-f]+}} <> Implicit TypedBuffer diff --git a/clang/test/AST/HLSL/StructuredBuffer-AST.hlsl b/clang/test/AST/HLSL/StructuredBuffer-AST.hlsl index b31db8ce59f22475295464011e4d853db0c13d4a..9c1630f6f570aaa29662dd7ce4fa625581cb9ad6 100644 --- a/clang/test/AST/HLSL/StructuredBuffer-AST.hlsl +++ b/clang/test/AST/HLSL/StructuredBuffer-AST.hlsl @@ -1,66 +1,64 @@ -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-library -x hlsl -ast-dump -DEMPTY %s | FileCheck -check-prefix=EMPTY %s -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-library -x hlsl -ast-dump %s | FileCheck %s - - -// This test tests two different AST generations. The "EMPTY" test mode verifies -// the AST generated by forward declaration of the HLSL types which happens on -// initializing the HLSL external AST with an AST Context.
- -// The non-empty mode has a use that requires the StructuredBuffer type be complete, -// which results in the AST being populated by the external AST source. That -// case covers the full implementation of the template declaration and the -// instantiated specialization. - -// EMPTY: ClassTemplateDecl 0x{{[0-9A-Fa-f]+}} <> implicit StructuredBuffer -// EMPTY-NEXT: TemplateTypeParmDecl 0x{{[0-9A-Fa-f]+}} <> class depth 0 index 0 element_type -// EMPTY-NEXT: CXXRecordDecl 0x{{[0-9A-Fa-f]+}} <> implicit class StructuredBuffer -// EMPTY-NEXT: FinalAttr 0x{{[0-9A-Fa-f]+}} <> Implicit final - -// There should be no more occurrances of StructuredBuffer -// EMPTY-NOT: StructuredBuffer - -#ifndef EMPTY - -StructuredBuffer<float> Buffer; - -#endif - -// CHECK: ClassTemplateDecl 0x{{[0-9A-Fa-f]+}} <> implicit StructuredBuffer -// CHECK-NEXT: TemplateTypeParmDecl 0x{{[0-9A-Fa-f]+}} <> class depth 0 index 0 element_type -// CHECK-NEXT: CXXRecordDecl 0x{{[0-9A-Fa-f]+}} <> implicit class StructuredBuffer definition - -// CHECK: FinalAttr 0x{{[0-9A-Fa-f]+}} <> Implicit final -// CHECK-NEXT: FieldDecl 0x{{[0-9A-Fa-f]+}} <> implicit h '__hlsl_resource_t -// CHECK-SAME{LITERAL}: [[hlsl::resource_class(UAV)]] -// CHECK-SAME{LITERAL}: [[hlsl::raw_buffer]] -// CHECK-SAME{LITERAL}: [[hlsl::contained_type(element_type)]] -// CHECK-SAME: ':'__hlsl_resource_t' -// CHECK-NEXT: HLSLResourceAttr 0x{{[0-9A-Fa-f]+}} <> Implicit TypedBuffer - -// CHECK: CXXMethodDecl 0x{{[0-9A-Fa-f]+}} <> operator[] 'element_type &const (unsigned int) const' -// CHECK-NEXT: ParmVarDecl 0x{{[0-9A-Fa-f]+}} <> Idx 'unsigned int' -// CHECK-NEXT: CompoundStmt 0x{{[0-9A-Fa-f]+}} <> -// CHECK-NEXT: ReturnStmt 0x{{[0-9A-Fa-f]+}} <> -// CHECK-NEXT: MemberExpr 0x{{[0-9A-Fa-f]+}} <> 'element_type' lvalue .e 0x{{[0-9A-Fa-f]+}} -// CHECK-NEXT: CXXThisExpr 0x{{[0-9A-Fa-f]+}} <> 'const StructuredBuffer<element_type>' lvalue implicit this -// CHECK-NEXT: AlwaysInlineAttr 0x{{[0-9A-Fa-f]+}} <> Implicit always_inline - -// CHECK-NEXT: CXXMethodDecl 0x{{[0-9A-Fa-f]+}} <> operator[] 'element_type &(unsigned int)' -// CHECK-NEXT: ParmVarDecl 0x{{[0-9A-Fa-f]+}} <> Idx 'unsigned int' -// CHECK-NEXT: CompoundStmt 0x{{[0-9A-Fa-f]+}} <> -// CHECK-NEXT: ReturnStmt 0x{{[0-9A-Fa-f]+}} <> -// CHECK-NEXT: MemberExpr 0x{{[0-9A-Fa-f]+}} <> 'element_type' lvalue .e 0x{{[0-9A-Fa-f]+}} -// CHECK-NEXT: CXXThisExpr 0x{{[0-9A-Fa-f]+}} <> 'StructuredBuffer<element_type>' lvalue implicit this -// CHECK-NEXT: AlwaysInlineAttr 0x{{[0-9A-Fa-f]+}} <> Implicit always_inline - -// CHECK: ClassTemplateSpecializationDecl 0x{{[0-9A-Fa-f]+}} <> class StructuredBuffer<float> definition - -// CHECK: TemplateArgument type 'float' -// CHECK-NEXT: BuiltinType 0x{{[0-9A-Fa-f]+}} 'float' -// CHECK-NEXT: FinalAttr 0x{{[0-9A-Fa-f]+}} <> Implicit final -// CHECK-NEXT: FieldDecl 0x{{[0-9A-Fa-f]+}} <> implicit h '__hlsl_resource_t -// CHECK-SAME{LITERAL}: [[hlsl::resource_class(UAV)]] -// CHECK-SAME{LITERAL}: [[hlsl::raw_buffer]] -// CHECK-SAME{LITERAL}: [[hlsl::contained_type(float)]] -// CHECK-SAME: ':'__hlsl_resource_t' -// CHECK-NEXT: HLSLResourceAttr 0x{{[0-9A-Fa-f]+}} <> Implicit TypedBuffer +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-library -x hlsl -ast-dump -DEMPTY %s | FileCheck -check-prefix=EMPTY %s +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-library -x hlsl -ast-dump %s | FileCheck %s + + +// This test tests two different AST generations. The "EMPTY" test mode verifies +// the AST generated by forward declaration of the HLSL types which happens on +// initializing the HLSL external AST with an AST Context.
+ +// The non-empty mode has a use that requires the StructuredBuffer type be complete, +// which results in the AST being populated by the external AST source. That +// case covers the full implementation of the template declaration and the +// instantiated specialization. + +// EMPTY: ClassTemplateDecl 0x{{[0-9A-Fa-f]+}} <> implicit StructuredBuffer +// EMPTY-NEXT: TemplateTypeParmDecl 0x{{[0-9A-Fa-f]+}} <> class depth 0 index 0 element_type +// EMPTY-NEXT: CXXRecordDecl 0x{{[0-9A-Fa-f]+}} <> implicit class StructuredBuffer +// EMPTY-NEXT: FinalAttr 0x{{[0-9A-Fa-f]+}} <> Implicit final + +// There should be no more occurrences of StructuredBuffer +// EMPTY-NOT: StructuredBuffer + +#ifndef EMPTY + +StructuredBuffer<float> Buffer; + +#endif + +// CHECK: ClassTemplateDecl 0x{{[0-9A-Fa-f]+}} <> implicit StructuredBuffer +// CHECK-NEXT: TemplateTypeParmDecl 0x{{[0-9A-Fa-f]+}} <> class depth 0 index 0 element_type +// CHECK-NEXT: CXXRecordDecl 0x{{[0-9A-Fa-f]+}} <> implicit class StructuredBuffer definition + +// CHECK: FinalAttr 0x{{[0-9A-Fa-f]+}} <> Implicit final +// CHECK-NEXT: FieldDecl 0x{{[0-9A-Fa-f]+}} <> implicit h '__hlsl_resource_t +// CHECK-SAME{LITERAL}: [[hlsl::resource_class(UAV)]] +// CHECK-SAME{LITERAL}: [[hlsl::raw_buffer]] +// CHECK-SAME{LITERAL}: [[hlsl::contained_type(element_type)]] +// CHECK-NEXT: HLSLResourceAttr 0x{{[0-9A-Fa-f]+}} <> Implicit TypedBuffer + +// CHECK: CXXMethodDecl 0x{{[0-9A-Fa-f]+}} <> operator[] 'element_type &const (unsigned int) const' +// CHECK-NEXT: ParmVarDecl 0x{{[0-9A-Fa-f]+}} <> Idx 'unsigned int' +// CHECK-NEXT: CompoundStmt 0x{{[0-9A-Fa-f]+}} <> +// CHECK-NEXT: ReturnStmt 0x{{[0-9A-Fa-f]+}} <> +// CHECK-NEXT: MemberExpr 0x{{[0-9A-Fa-f]+}} <> 'element_type' lvalue .e 0x{{[0-9A-Fa-f]+}} +// CHECK-NEXT: CXXThisExpr 0x{{[0-9A-Fa-f]+}} <> 'const StructuredBuffer<element_type>' lvalue implicit this +// CHECK-NEXT: AlwaysInlineAttr 0x{{[0-9A-Fa-f]+}} <> Implicit always_inline + +// CHECK-NEXT: CXXMethodDecl 0x{{[0-9A-Fa-f]+}} <> operator[] 'element_type &(unsigned int)' +// CHECK-NEXT: ParmVarDecl 0x{{[0-9A-Fa-f]+}} <> Idx 'unsigned int' +// CHECK-NEXT: CompoundStmt 0x{{[0-9A-Fa-f]+}} <> +// CHECK-NEXT: ReturnStmt 0x{{[0-9A-Fa-f]+}} <> +// CHECK-NEXT: MemberExpr 0x{{[0-9A-Fa-f]+}} <> 'element_type' lvalue .e 0x{{[0-9A-Fa-f]+}} +// CHECK-NEXT: CXXThisExpr 0x{{[0-9A-Fa-f]+}} <> 'StructuredBuffer<element_type>' lvalue implicit this +// CHECK-NEXT: AlwaysInlineAttr 0x{{[0-9A-Fa-f]+}} <> Implicit always_inline + +// CHECK: ClassTemplateSpecializationDecl 0x{{[0-9A-Fa-f]+}} <> class StructuredBuffer<float> definition + +// CHECK: TemplateArgument type 'float' +// CHECK-NEXT: BuiltinType 0x{{[0-9A-Fa-f]+}} 'float' +// CHECK-NEXT: FinalAttr 0x{{[0-9A-Fa-f]+}} <> Implicit final +// CHECK-NEXT: FieldDecl 0x{{[0-9A-Fa-f]+}} <> implicit h '__hlsl_resource_t +// CHECK-SAME{LITERAL}: [[hlsl::resource_class(UAV)]] +// CHECK-SAME{LITERAL}: [[hlsl::raw_buffer]] +// CHECK-SAME{LITERAL}: [[hlsl::contained_type(float)]] +// CHECK-NEXT: HLSLResourceAttr 0x{{[0-9A-Fa-f]+}} <> Implicit TypedBuffer diff --git a/clang/test/AST/ast-dump-aarch64-sve-types.c b/clang/test/AST/ast-dump-aarch64-sve-types.c index b5a0b00b492803dc22ca8604a2d068dca7fcb68c..386133e05b1d13da967ba840ee0a792109934aaf 100644 --- a/clang/test/AST/ast-dump-aarch64-sve-types.c +++ b/clang/test/AST/ast-dump-aarch64-sve-types.c @@ -45,6 +45,9 @@ // CHECK: TypedefDecl {{.*}} implicit __SVBfloat16_t '__SVBfloat16_t' // CHECK-NEXT: -BuiltinType {{.*}} '__SVBfloat16_t' +// CHECK: TypedefDecl {{.*}} implicit __SVMfloat8_t '__SVMfloat8_t' +// CHECK-NEXT:
-BuiltinType {{.*}} '__SVMfloat8_t' + // CHECK: TypedefDecl {{.*}} implicit __SVBool_t '__SVBool_t' // CHECK-NEXT: -BuiltinType {{.*}} '__SVBool_t' diff --git a/clang/test/Analysis/Checkers/WebKit/uncounted-local-vars.cpp b/clang/test/Analysis/Checkers/WebKit/uncounted-local-vars.cpp index 25776870dd3ae0e855adc49ec63841f75bb6ddb1..b5f6b8535bf41812b36d01d9b788440316041e95 100644 --- a/clang/test/Analysis/Checkers/WebKit/uncounted-local-vars.cpp +++ b/clang/test/Analysis/Checkers/WebKit/uncounted-local-vars.cpp @@ -289,3 +289,42 @@ void foo() { } } // namespace local_assignment_to_global + +namespace local_var_in_recursive_function { + +struct TreeNode { + Ref<TreeNode> create() { return Ref(*new TreeNode); } + + void ref() const { ++refCount; } + void deref() const { + if (!--refCount) + delete this; + } + + int recursiveCost(); + int recursiveWeight(); + int weight(); + + int cost { 0 }; + mutable unsigned refCount { 0 }; + TreeNode* nextSibling { nullptr }; + TreeNode* firstChild { nullptr }; +}; + +int TreeNode::recursiveCost() { + // no warnings + unsigned totalCost = cost; + for (TreeNode* node = firstChild; node; node = node->nextSibling) + totalCost += recursiveCost(); + return totalCost; +} + +int TreeNode::recursiveWeight() { + unsigned totalCost = weight(); + for (TreeNode* node = firstChild; node; node = node->nextSibling) + // expected-warning@-1{{Local variable 'node' is uncounted and unsafe [alpha.webkit.UncountedLocalVarsChecker]}} + totalCost += recursiveWeight(); + return totalCost; +} + +} // namespace local_var_in_recursive_function diff --git a/clang/test/Analysis/Checkers/WebKit/uncounted-obj-arg.cpp b/clang/test/Analysis/Checkers/WebKit/uncounted-obj-arg.cpp index 1a42de90105a55e894b34095ba8d7a5ad3d2ee6c..10da776f81575cea543ec824c412b6dce6a1f11b 100644 --- a/clang/test/Analysis/Checkers/WebKit/uncounted-obj-arg.cpp +++ b/clang/test/Analysis/Checkers/WebKit/uncounted-obj-arg.cpp @@ -259,6 +259,15 @@ public: void mutuallyRecursive8() { mutuallyRecursive9(); someFunction(); } void mutuallyRecursive9() { mutuallyRecursive8(); } + int recursiveCost() { + unsigned totalCost = 0; + for (unsigned i = 0; i < sizeof(children)/sizeof(*children); ++i) { + if (auto* child = children[i]) + totalCost += child->recursiveCost(); + } + return totalCost; + } + int trivial1() { return 123; } float trivial2() { return 0.3; } float trivial3() { return (float)0.4; } @@ -448,6 +457,7 @@ public: Number* number { nullptr }; ComplexNumber complex; Enum enumValue { Enum::Value1 }; + RefCounted* children[4]; }; unsigned RefCounted::s_v = 0; @@ -558,6 +568,8 @@ public: getFieldTrivial().mutuallyRecursive9(); // expected-warning@-1{{Call argument for 'this' parameter is uncounted and unsafe}} + getFieldTrivial().recursiveCost(); // no-warning + getFieldTrivial().someFunction(); // expected-warning@-1{{Call argument for 'this' parameter is uncounted and unsafe}} getFieldTrivial().nonTrivial1(); diff --git a/clang/test/Analysis/string.c b/clang/test/Analysis/string.c index 79b4877eedbd9c91c8aad914788ef33edd7d6cbc..2e0a49d083b0b06ed6681586d552d4f640086034 100644 --- a/clang/test/Analysis/string.c +++ b/clang/test/Analysis/string.c @@ -361,6 +361,10 @@ void strcpy_fn_const(char *x) { strcpy(x, (const char*)&strcpy_fn); // expected-warning{{Argument to string copy function is the address of the function 'strcpy_fn', which is not a null-terminated string}} } +void strcpy_fn_dst(const char *x) { + strcpy((char*)&strcpy_fn, x); // expected-warning{{Argument to string copy function is the address of the function
'strcpy_fn', which is not a null-terminated string}} +} + extern int globalInt; void strcpy_effects(char *x, char *y) { char a = x[0]; @@ -469,8 +473,22 @@ void strcat_null_src(char *x) { strcat(x, NULL); // expected-warning{{Null pointer passed as 2nd argument to string concatenation function}} } -void strcat_fn(char *x) { - strcat(x, (char*)&strcat_fn); // expected-warning{{Argument to string concatenation function is the address of the function 'strcat_fn', which is not a null-terminated string}} +void strcat_fn_dst(const char *x) { + strcat((char*)&strcat_fn_dst, x); // expected-warning{{Argument to string concatenation function is the address of the function 'strcat_fn_dst', which is not a null-terminated string}} +} + +void strcat_fn_src(char *x) { + strcat(x, (char*)&strcat_fn_src); // expected-warning{{Argument to string concatenation function is the address of the function 'strcat_fn_src', which is not a null-terminated string}} +} + +void strcat_label_dst(const char *x) { +label: + strcat((char*)&&label, x); // expected-warning{{Argument to string concatenation function is the address of the label 'label', which is not a null-terminated string}} +} + +void strcat_label_src(char *x) { +label: + strcat(x, (char*)&&label); // expected-warning{{Argument to string concatenation function is the address of the label 'label', which is not a null-terminated string}} } void strcat_effects(char *y) { @@ -568,8 +586,12 @@ void strncpy_null_src(char *x) { strncpy(x, NULL, 5); // expected-warning{{Null pointer passed as 2nd argument to string copy function}} } -void strncpy_fn(char *x) { - strncpy(x, (char*)&strcpy_fn, 5); // expected-warning{{Argument to string copy function is the address of the function 'strcpy_fn', which is not a null-terminated string}} +void strncpy_fn_src(char *x) { + strncpy(x, (char*)&strncpy_fn_src, 5); // expected-warning{{Argument to string copy function is the address of the function 'strncpy_fn_src', which is not a null-terminated string}} +} + +void strncpy_fn_dst(const char *x) { + strncpy((char*)&strncpy_fn_dst, x, 5); // expected-warning{{Argument to string copy function is the address of the function 'strncpy_fn_dst', which is not a null-terminated string}} } void strncpy_effects(char *x, char *y) { @@ -680,8 +702,12 @@ void strncat_null_src(char *x) { strncat(x, NULL, 4); // expected-warning{{Null pointer passed as 2nd argument to string concatenation function}} } -void strncat_fn(char *x) { - strncat(x, (char*)&strncat_fn, 4); // expected-warning{{Argument to string concatenation function is the address of the function 'strncat_fn', which is not a null-terminated string}} +void strncat_fn_src(char *x) { + strncat(x, (char*)&strncat_fn_src, 4); // expected-warning{{Argument to string concatenation function is the address of the function 'strncat_fn_src', which is not a null-terminated string}} +} + +void strncat_fn_dst(const char *x) { + strncat((char*)&strncat_fn_dst, x, 4); // expected-warning{{Argument to string concatenation function is the address of the function 'strncat_fn_dst', which is not a null-terminated string}} } void strncat_effects(char *y) { @@ -921,6 +947,14 @@ int strcmp_null_argument(char *a) { return strcmp(a, b); // expected-warning{{Null pointer passed as 2nd argument to string comparison function}} } +void strcmp_fn_r(char *x) { + strcmp(x, (char*)&strcmp_null_argument); // expected-warning{{Argument to string comparison function is the address of the function 'strcmp_null_argument', which is not a null-terminated string}} +} + +void 
strcmp_fn_l(char *x) { + strcmp((char*)&strcmp_null_argument, x); // expected-warning{{Argument to string comparison function is the address of the function 'strcmp_null_argument', which is not a null-terminated string}} +} + //===----------------------------------------------------------------------=== // strncmp() //===----------------------------------------------------------------------=== diff --git a/clang/test/Analysis/string.cpp b/clang/test/Analysis/string.cpp index 1be6c21466cc0318a67eb435af2a23897074ee7e..c09422d1922369306465919cd96684842c505296 100644 --- a/clang/test/Analysis/string.cpp +++ b/clang/test/Analysis/string.cpp @@ -1,4 +1,4 @@ -// RUN: %clang_analyze_cc1 -analyzer-checker=core,unix,debug.ExprInspection -verify %s +// RUN: %clang_analyze_cc1 -analyzer-checker=core,unix,alpha.unix.cstring,debug.ExprInspection -verify %s // Test functions that are called "memcpy" but aren't the memcpy // we're looking for. Unfortunately, this test cannot be put into @@ -6,6 +6,7 @@ // as a normal C function for the test to make sense. typedef __typeof(sizeof(int)) size_t; void *memcpy(void *, const void *, size_t); +size_t strlen(const char *s); int sprintf(char *str, const char *format, ...); int snprintf(char *str, size_t size, const char *format, ...); @@ -45,3 +46,10 @@ void log(const char* fmt, const Args&... args) { void test_gh_74269_no_crash() { log("%d", 1); } + +struct TestNotNullTerm { + void test1() { + TestNotNullTerm * const &x = this; + strlen((char *)&x); // expected-warning{{Argument to string length function is not a null-terminated string}} + } +}; diff --git a/clang/test/C/C2y/n3262.c b/clang/test/C/C2y/n3262.c index 3ff2062d88dde898cb16374c8ee23b3c4a18571f..864ab351bdbc231b4a22beba6cd618b339535c58 100644 --- a/clang/test/C/C2y/n3262.c +++ b/clang/test/C/C2y/n3262.c @@ -1,20 +1,20 @@ -// RUN: %clang_cc1 -verify -std=c2y -Wall -pedantic %s -// expected-no-diagnostics - -/* WG14 N3262: Yes - * Usability of a byte-wise copy of va_list - * - * NB: Clang explicitly documents this as being undefined behavior. A - * diagnostic is produced for some targets but not for others for assignment or - * initialization, but no diagnostic is possible to produce for use with memcpy - * in the general case, nor with a manual bytewise copy via a for loop. - * - * Therefore, nothing is tested in this file; it serves as a reminder that we - * validated our documentation against the paper. See - * clang/docs/LanguageExtensions.rst for more details. - * - * FIXME: it would be nice to add ubsan support for recognizing when an invalid - * copy is made and diagnosing on copy (or on use of the copied va_list). - */ - -int main() {} +// RUN: %clang_cc1 -verify -std=c2y -Wall -pedantic %s +// expected-no-diagnostics + +/* WG14 N3262: Yes + * Usability of a byte-wise copy of va_list + * + * NB: Clang explicitly documents this as being undefined behavior. A + * diagnostic is produced for some targets but not for others for assignment or + * initialization, but no diagnostic is possible to produce for use with memcpy + * in the general case, nor with a manual bytewise copy via a for loop. + * + * Therefore, nothing is tested in this file; it serves as a reminder that we + * validated our documentation against the paper. See + * clang/docs/LanguageExtensions.rst for more details. + * + * FIXME: it would be nice to add ubsan support for recognizing when an invalid + * copy is made and diagnosing on copy (or on use of the copied va_list). 
+ */ + +int main() {} diff --git a/clang/test/C/C2y/n3274.c b/clang/test/C/C2y/n3274.c index ccdb89f4069ded80ba2ef5bb48a5eda9722b1110..6bf8d72d0f3319cda65ad16a67f625a4c7e066f5 100644 --- a/clang/test/C/C2y/n3274.c +++ b/clang/test/C/C2y/n3274.c @@ -1,18 +1,18 @@ -// RUN: %clang_cc1 -verify -std=c23 -Wall -pedantic %s -// RUN: %clang_cc1 -verify -std=c2y -Wall -pedantic %s - -/* WG14 N3274: Yes - * Remove imaginary types - */ - -// Clang has never supported _Imaginary. -#ifdef __STDC_IEC_559_COMPLEX__ -#error "When did this happen?" -#endif - -_Imaginary float i; // expected-error {{imaginary types are not supported}} - -// _Imaginary is a keyword in older language modes, but doesn't need to be one -// in C2y or later. However, to improve diagnostic behavior, we retain it as a -// keyword in all language modes -- it is not available as an identifier. -static_assert(!__is_identifier(_Imaginary)); +// RUN: %clang_cc1 -verify -std=c23 -Wall -pedantic %s +// RUN: %clang_cc1 -verify -std=c2y -Wall -pedantic %s + +/* WG14 N3274: Yes + * Remove imaginary types + */ + +// Clang has never supported _Imaginary. +#ifdef __STDC_IEC_559_COMPLEX__ +#error "When did this happen?" +#endif + +_Imaginary float i; // expected-error {{imaginary types are not supported}} + +// _Imaginary is a keyword in older language modes, but doesn't need to be one +// in C2y or later. However, to improve diagnostic behavior, we retain it as a +// keyword in all language modes -- it is not available as an identifier. +static_assert(!__is_identifier(_Imaginary)); diff --git a/clang/test/CXX/temp/temp.constr/temp.constr.decl/p4.cpp b/clang/test/CXX/temp/temp.constr/temp.constr.decl/p4.cpp index 70064f867e18e351c27f89e8eff89dd96c1bd635..f144e14cd122f9a8715994baa383a6877efebf3a 100644 --- a/clang/test/CXX/temp/temp.constr/temp.constr.decl/p4.cpp +++ b/clang/test/CXX/temp/temp.constr/temp.constr.decl/p4.cpp @@ -1,175 +1,219 @@ // RUN: %clang_cc1 -std=c++20 -verify %s // expected-no-diagnostics -template -concept D = true; +namespace Primary { + template + concept D = true; -template -struct A { - template - void f() requires V; + template + struct A { + template + void f() requires V; - template<> - void f(); + template<> + void f(); + + template + void g(); + + template requires V + struct B; + + template requires V + struct B; + + template<> + struct B; + + template + struct C; + + template + struct C; + template requires V + static int x; + + template requires V + static int x; + + template<> + int x; + + template + static int y; + + template + static int y; + }; + + template + template + void A::f() requires V { } + + template template - void g(); + void A::g() { } + template template requires V - struct B; + struct A::B { }; + template template requires V - struct B; + struct A::B { }; - template<> - struct B; + template + template requires V + struct A::B { }; + template template - struct C; + struct A::C { }; + template template - struct C; + struct A::C { }; + template template requires V - static int x; + int A::x = 0; + template template requires V - static int x; + int A::x = 0; - template<> - int x; + template + template requires V + int A::x = 0; + template template - static int y; + int A::y = 0; + template template - static int y; -}; - -template -template -void A::f() requires V { } + int A::y = 0; -template -template -void A::g() { } - -template -template requires V -struct A::B { }; + template<> + template + void A::f() requires V; -template -template requires V -struct A::B { }; + template<> + template<> + void 
A::f();
 
-template
-template requires V
-struct A::B { };
+  template<>
+  template<>
+  void A::f();
 
-template
-template requires V
-struct A::B { };
+  template<>
+  template
+  void A::g();
 
-template
-template
-struct A::C { };
+  template<>
+  template requires V
+  struct A::B;
 
-template
-template
-struct A::C { };
+  template<>
+  template<>
+  struct A::B;
 
-template
-template requires V
-int A::x = 0;
+  template<>
+  template<>
+  struct A::B;
 
-template
-template requires V
-int A::x = 0;
+  template<>
+  template requires V
+  struct A::B;
 
-template
-template requires V
-int A::x = 0;
+  template<>
+  template requires V
+  struct A::B;
 
-template
-template
-int A::y = 0;
+  template<>
+  template
+  struct A::C;
 
-template
-template
-int A::y = 0;
+  template<>
+  template
+  struct A::C;
 
-template<>
-template
-void A::f() requires V;
+  template<>
+  template
+  struct A::C;
 
-template<>
-template<>
-void A::f();
+  template<>
+  template requires V
+  int A::x;
 
-template<>
-template<>
-void A::f();
+  template<>
+  template<>
+  int A::x;
 
-template<>
-template
-void A::g();
+  template<>
+  template<>
+  int A::x;
 
-template<>
-template requires V
-struct A::B;
+  template<>
+  template requires V
+  int A::x;
 
-template<>
-template<>
-struct A::B;
+  template<>
+  template requires V
+  int A::x;
 
-template<>
-template<>
-struct A::B;
+  template<>
+  template
+  int A::y;
 
-template<>
-template requires V
-struct A::B;
+  template<>
+  template
+  int A::y;
 
-template<>
-template requires V
-struct A::B;
+  template<>
+  template
+  int A::y;
+} // namespace Primary
 
-template<>
-template
-struct A::C;
+namespace Partial {
+  template
+  struct A;
 
-template<>
-template
-struct A::C;
+  template
+  struct A
+  {
+    template requires U
+    void f();
 
-template<>
-template requires V
-int A::x;
+    template requires U
+    static const int x;
 
-template<>
-template<>
-int A::x;
+    template requires U
+    struct B;
+  };
 
-template<>
-template<>
-int A::x;
+  template
+  template requires U
+  void A::f() { }
 
-template<>
-template requires V
-int A::x;
+  template
+  template requires U
+  constexpr int A::x = 0;
 
-template<>
-template requires V
-int A::x;
+  template
+  template requires U
+  struct A::B { };
 
-template<>
-template
-int A::y;
+  template<>
+  template requires true
+  void A::f() { }
 
-template<>
-template
-int A::y;
+  template<>
+  template requires true
+  constexpr int A::x = 1;
 
-template<>
-template
-int A::y;
+  template<>
+  template requires true
+  struct A::B { };
+} // namespace Partial
diff --git a/clang/test/CodeGen/aarch64-sve.c b/clang/test/CodeGen/aarch64-sve.c
index 5f6a0178aa4425f057acf3ef79425a69bb7cc07d..690b010e967ad7ccd91b2628c3b860da6607dfbc 100644
--- a/clang/test/CodeGen/aarch64-sve.c
+++ b/clang/test/CodeGen/aarch64-sve.c
@@ -13,6 +13,7 @@
 // CHECK: %f16 = alloca <vscale x 8 x half>, align 16
 // CHECK: %f32 = alloca <vscale x 4 x float>, align 16
 // CHECK: %f64 = alloca <vscale x 2 x double>, align 16
+// CHECK: %mf8 = alloca <vscale x 16 x i8>, align 16
 // CHECK: %bf16 = alloca <vscale x 8 x bfloat>, align 16
 // CHECK: %b8 = alloca <vscale x 16 x i1>, align 2
 
@@ -33,6 +34,7 @@ void test_locals(void) {
   __SVFloat32_t f32;
   __SVFloat64_t f64;
+  __SVMfloat8_t mf8;
 
   __SVBfloat16_t bf16;
 
   __SVBool_t b8;
diff --git a/clang/test/CodeGen/aarch64-type-sizes.c b/clang/test/CodeGen/aarch64-type-sizes.c
index a40423c1f8deb25f3ba07f037fe2323093a5c223..f6129b3943d2b9d99b170e5df34774218abfdc8a 100644
--- a/clang/test/CodeGen/aarch64-type-sizes.c
+++ b/clang/test/CodeGen/aarch64-type-sizes.c
@@ -1,7 +1,7 @@
 // RUN: %clang_cc1 -triple aarch64_be-none-linux-gnu -emit-llvm -w -o - %s | FileCheck %s
 
 // char by definition has size 1
-// CHECK: target datalayout = 
"E-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32" +// CHECK: target datalayout = "E-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32" int check_short(void) { return sizeof(short); diff --git a/clang/test/CodeGen/attr-counted-by-pr88931.cpp b/clang/test/CodeGen/attr-counted-by-pr88931.cpp index 2a8cc1d07e50d9acc22ec64cb92b04e455f0e37c..6d0c46bbbe8f9cbdf478fc2105ad520a1acaf334 100644 --- a/clang/test/CodeGen/attr-counted-by-pr88931.cpp +++ b/clang/test/CodeGen/attr-counted-by-pr88931.cpp @@ -13,7 +13,7 @@ void init(void * __attribute__((pass_dynamic_object_size(0)))); // CHECK-LABEL: define dso_local void @_ZN3foo3barC1Ev( // CHECK-SAME: ptr noundef nonnull align 4 dereferenceable(1) [[THIS:%.*]]) unnamed_addr #[[ATTR0:[0-9]+]] align 2 { // CHECK-NEXT: entry: -// CHECK-NEXT: tail call void @_Z4initPvU25pass_dynamic_object_size0(ptr noundef nonnull [[THIS]], i64 noundef -1) #[[ATTR2:[0-9]+]] +// CHECK-NEXT: tail call void @_Z4initPvU25pass_dynamic_object_size0(ptr noundef nonnull align 4 dereferenceable(1) [[THIS]], i64 noundef -1) #[[ATTR2:[0-9]+]] // CHECK-NEXT: ret void // foo::bar::bar() { diff --git a/clang/test/CodeGen/attr-counted-by.c b/clang/test/CodeGen/attr-counted-by.c index 4a130c5e3d401f5c1874f40c72af4572033943a4..f70e552bca26ab04d534bb68d44508c29366c9c5 100644 --- a/clang/test/CodeGen/attr-counted-by.c +++ b/clang/test/CodeGen/attr-counted-by.c @@ -66,7 +66,7 @@ struct anon_struct { // SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = icmp ult i64 [[IDXPROM]], [[TMP0]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP1]], label [[CONT3:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF3:![0-9]+]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR: handler.out_of_bounds: -// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB1:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR10:[0-9]+]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB1:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR9:[0-9]+]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]] // SANITIZE-WITH-ATTR: cont3: // SANITIZE-WITH-ATTR-NEXT: [[ARRAY:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 12 @@ -114,7 +114,7 @@ void test1(struct annotated *p, int index, int val) { // SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = icmp ult i64 [[INDEX]], [[TMP0]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP1]], label [[CONT3:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF3]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR: handler.out_of_bounds: -// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB3:[0-9]+]], i64 [[INDEX]]) #[[ATTR10]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB3:[0-9]+]], i64 [[INDEX]]) #[[ATTR9]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]] // SANITIZE-WITH-ATTR: cont3: // SANITIZE-WITH-ATTR-NEXT: [[ARRAY:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 12 @@ -197,42 +197,26 @@ size_t test2_bdos(struct annotated *p) { // SANITIZE-WITH-ATTR-LABEL: define dso_local void @test3( // SANITIZE-WITH-ATTR-SAME: ptr noundef [[P:%.*]], i64 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { // SANITIZE-WITH-ATTR-NEXT: entry: -// SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_GEP:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 8 -// SANITIZE-WITH-ATTR-NEXT: 
[[DOT_COUNTED_BY_LOAD:%.*]] = load i32, ptr [[DOT_COUNTED_BY_GEP]], align 4 -// SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = zext i32 [[DOT_COUNTED_BY_LOAD]] to i64, !nosanitize [[META2]] +// SANITIZE-WITH-ATTR-NEXT: [[DOTCOUNTED_BY_GEP:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 8 +// SANITIZE-WITH-ATTR-NEXT: [[DOTCOUNTED_BY_LOAD:%.*]] = load i32, ptr [[DOTCOUNTED_BY_GEP]], align 4 +// SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = zext i32 [[DOTCOUNTED_BY_LOAD]] to i64, !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = icmp ult i64 [[INDEX]], [[TMP0]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP1]], label [[CONT3:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF3]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR: handler.out_of_bounds: -// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB4:[0-9]+]], i64 [[INDEX]]) #[[ATTR10]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB4:[0-9]+]], i64 [[INDEX]]) #[[ATTR9]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]] // SANITIZE-WITH-ATTR: cont3: // SANITIZE-WITH-ATTR-NEXT: [[ARRAY:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 12 // SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [0 x i32], ptr [[ARRAY]], i64 0, i64 [[INDEX]] -// SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = sext i32 [[DOT_COUNTED_BY_LOAD]] to i64 -// SANITIZE-WITH-ATTR-NEXT: [[TMP3:%.*]] = shl nsw i64 [[TMP2]], 2 -// SANITIZE-WITH-ATTR-NEXT: [[TMP4:%.*]] = tail call i64 @llvm.smax.i64(i64 [[TMP3]], i64 4) -// SANITIZE-WITH-ATTR-NEXT: [[TMP5:%.*]] = trunc i64 [[TMP4]] to i32 -// SANITIZE-WITH-ATTR-NEXT: [[TMP6:%.*]] = add i32 [[TMP5]], 12 -// SANITIZE-WITH-ATTR-NEXT: [[DOTINV:%.*]] = icmp slt i32 [[DOT_COUNTED_BY_LOAD]], 0 -// SANITIZE-WITH-ATTR-NEXT: [[CONV:%.*]] = select i1 [[DOTINV]], i32 0, i32 [[TMP6]] -// SANITIZE-WITH-ATTR-NEXT: store i32 [[CONV]], ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA4]] +// SANITIZE-WITH-ATTR-NEXT: store i32 -1, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA4]] // SANITIZE-WITH-ATTR-NEXT: ret void // // NO-SANITIZE-WITH-ATTR-LABEL: define dso_local void @test3( -// NO-SANITIZE-WITH-ATTR-SAME: ptr nocapture noundef [[P:%.*]], i64 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR1]] { +// NO-SANITIZE-WITH-ATTR-SAME: ptr noundef [[P:%.*]], i64 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { // NO-SANITIZE-WITH-ATTR-NEXT: entry: -// NO-SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_GEP:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 8 -// NO-SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_LOAD:%.*]] = load i32, ptr [[DOT_COUNTED_BY_GEP]], align 4 -// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = sext i32 [[DOT_COUNTED_BY_LOAD]] to i64 -// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = shl nsw i64 [[TMP0]], 2 -// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = tail call i64 @llvm.smax.i64(i64 [[TMP1]], i64 4) -// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32 -// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP4:%.*]] = add i32 [[TMP3]], 12 -// NO-SANITIZE-WITH-ATTR-NEXT: [[DOTINV:%.*]] = icmp slt i32 [[DOT_COUNTED_BY_LOAD]], 0 -// NO-SANITIZE-WITH-ATTR-NEXT: [[CONV:%.*]] = select i1 [[DOTINV]], i32 0, i32 [[TMP4]] // NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAY:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 12 // NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [0 x i32], ptr [[ARRAY]], i64 0, i64 [[INDEX]] -// NO-SANITIZE-WITH-ATTR-NEXT: 
store i32 [[CONV]], ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2]] +// NO-SANITIZE-WITH-ATTR-NEXT: store i32 -1, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2]] // NO-SANITIZE-WITH-ATTR-NEXT: ret void // // SANITIZE-WITHOUT-ATTR-LABEL: define dso_local void @test3( @@ -254,34 +238,18 @@ size_t test2_bdos(struct annotated *p) { void test3(struct annotated *p, size_t index) { // This test differs from 'test2' by checking bdos on the whole array and not // just the FAM. - p->array[index] = __builtin_dynamic_object_size(p, 1); + p->array[index] = __builtin_dynamic_object_size(p, 0); } -// SANITIZE-WITH-ATTR-LABEL: define dso_local range(i64 0, 8589934601) i64 @test3_bdos( -// SANITIZE-WITH-ATTR-SAME: ptr nocapture noundef readonly [[P:%.*]]) local_unnamed_addr #[[ATTR2]] { +// SANITIZE-WITH-ATTR-LABEL: define dso_local i64 @test3_bdos( +// SANITIZE-WITH-ATTR-SAME: ptr noundef readnone [[P:%.*]]) local_unnamed_addr #[[ATTR3:[0-9]+]] { // SANITIZE-WITH-ATTR-NEXT: entry: -// SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_GEP:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 8 -// SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_LOAD:%.*]] = load i32, ptr [[DOT_COUNTED_BY_GEP]], align 4 -// SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = sext i32 [[DOT_COUNTED_BY_LOAD]] to i64 -// SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = shl nsw i64 [[TMP0]], 2 -// SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = tail call i64 @llvm.smax.i64(i64 [[TMP1]], i64 4) -// SANITIZE-WITH-ATTR-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 12 -// SANITIZE-WITH-ATTR-NEXT: [[TMP4:%.*]] = icmp sgt i32 [[DOT_COUNTED_BY_LOAD]], -1 -// SANITIZE-WITH-ATTR-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], i64 [[TMP3]], i64 0 -// SANITIZE-WITH-ATTR-NEXT: ret i64 [[TMP5]] +// SANITIZE-WITH-ATTR-NEXT: ret i64 -1 // -// NO-SANITIZE-WITH-ATTR-LABEL: define dso_local range(i64 0, 8589934601) i64 @test3_bdos( -// NO-SANITIZE-WITH-ATTR-SAME: ptr nocapture noundef readonly [[P:%.*]]) local_unnamed_addr #[[ATTR2]] { +// NO-SANITIZE-WITH-ATTR-LABEL: define dso_local i64 @test3_bdos( +// NO-SANITIZE-WITH-ATTR-SAME: ptr noundef readnone [[P:%.*]]) local_unnamed_addr #[[ATTR3:[0-9]+]] { // NO-SANITIZE-WITH-ATTR-NEXT: entry: -// NO-SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_GEP:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 8 -// NO-SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_LOAD:%.*]] = load i32, ptr [[DOT_COUNTED_BY_GEP]], align 4 -// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = sext i32 [[DOT_COUNTED_BY_LOAD]] to i64 -// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = shl nsw i64 [[TMP0]], 2 -// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = tail call i64 @llvm.smax.i64(i64 [[TMP1]], i64 4) -// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 12 -// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP4:%.*]] = icmp sgt i32 [[DOT_COUNTED_BY_LOAD]], -1 -// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], i64 [[TMP3]], i64 0 -// NO-SANITIZE-WITH-ATTR-NEXT: ret i64 [[TMP5]] +// NO-SANITIZE-WITH-ATTR-NEXT: ret i64 -1 // // SANITIZE-WITHOUT-ATTR-LABEL: define dso_local i64 @test3_bdos( // SANITIZE-WITHOUT-ATTR-SAME: ptr noundef readnone [[P:%.*]]) local_unnamed_addr #[[ATTR2:[0-9]+]] { @@ -294,7 +262,7 @@ void test3(struct annotated *p, size_t index) { // NO-SANITIZE-WITHOUT-ATTR-NEXT: ret i64 -1 // size_t test3_bdos(struct annotated *p) { - return __builtin_dynamic_object_size(p, 1); + return __builtin_dynamic_object_size(p, 0); } // SANITIZE-WITH-ATTR-LABEL: define dso_local void @test4( @@ -308,7 +276,7 @@ size_t test3_bdos(struct annotated *p) { // SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] 
= icmp ult i64 [[IDXPROM]], [[TMP0]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP1]], label [[CONT4:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF3]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR: handler.out_of_bounds: -// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB5:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR10]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB5:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR9]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]] // SANITIZE-WITH-ATTR: cont4: // SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = icmp sgt i32 [[DOT_COUNTED_BY_LOAD]], 2 @@ -325,7 +293,7 @@ size_t test3_bdos(struct annotated *p) { // SANITIZE-WITH-ATTR-NEXT: [[TMP7:%.*]] = icmp ult i64 [[IDXPROM12]], [[TMP6]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP7]], label [[CONT19:%.*]], label [[HANDLER_OUT_OF_BOUNDS15:%.*]], !prof [[PROF3]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR: handler.out_of_bounds15: -// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB6:[0-9]+]], i64 [[IDXPROM12]]) #[[ATTR10]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB6:[0-9]+]], i64 [[IDXPROM12]]) #[[ATTR9]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]] // SANITIZE-WITH-ATTR: cont19: // SANITIZE-WITH-ATTR-NEXT: [[TMP8:%.*]] = icmp sgt i32 [[DOT_COUNTED_BY_LOAD6]], 3 @@ -342,7 +310,7 @@ size_t test3_bdos(struct annotated *p) { // SANITIZE-WITH-ATTR-NEXT: [[TMP13:%.*]] = icmp ult i64 [[IDXPROM28]], [[TMP12]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP13]], label [[CONT35:%.*]], label [[HANDLER_OUT_OF_BOUNDS31:%.*]], !prof [[PROF3]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR: handler.out_of_bounds31: -// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB7:[0-9]+]], i64 [[IDXPROM28]]) #[[ATTR10]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB7:[0-9]+]], i64 [[IDXPROM28]]) #[[ATTR9]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]] // SANITIZE-WITH-ATTR: cont35: // SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX33:%.*]] = getelementptr inbounds [0 x i32], ptr [[ARRAY]], i64 0, i64 [[IDXPROM28]] @@ -488,39 +456,27 @@ size_t test4_bdos(struct annotated *p, int index) { // SANITIZE-WITH-ATTR-LABEL: define dso_local void @test5( // SANITIZE-WITH-ATTR-SAME: ptr noundef [[P:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { // SANITIZE-WITH-ATTR-NEXT: entry: -// SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_GEP:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 8 -// SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_LOAD:%.*]] = load i64, ptr [[DOT_COUNTED_BY_GEP]], align 4 // SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64 -// SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = icmp ugt i64 [[DOT_COUNTED_BY_LOAD]], [[IDXPROM]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR-NEXT: [[DOTCOUNTED_BY_GEP:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 8 +// SANITIZE-WITH-ATTR-NEXT: [[DOTCOUNTED_BY_LOAD:%.*]] = load i64, ptr [[DOTCOUNTED_BY_GEP]], align 4 +// SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = icmp ugt i64 [[DOTCOUNTED_BY_LOAD]], [[IDXPROM]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP0]], 
label [[CONT3:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF3]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR: handler.out_of_bounds: -// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB8:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR10]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB8:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR9]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]] // SANITIZE-WITH-ATTR: cont3: // SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 16 // SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i32], ptr [[TMP1]], i64 0, i64 [[IDXPROM]] -// SANITIZE-WITH-ATTR-NEXT: [[DOTINV:%.*]] = icmp slt i64 [[DOT_COUNTED_BY_LOAD]], 0 -// SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_LOAD_TR:%.*]] = trunc i64 [[DOT_COUNTED_BY_LOAD]] to i32 -// SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = shl i32 [[DOT_COUNTED_BY_LOAD_TR]], 2 -// SANITIZE-WITH-ATTR-NEXT: [[TMP3:%.*]] = add i32 [[TMP2]], 16 -// SANITIZE-WITH-ATTR-NEXT: [[CONV:%.*]] = select i1 [[DOTINV]], i32 0, i32 [[TMP3]] -// SANITIZE-WITH-ATTR-NEXT: store i32 [[CONV]], ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA4]] +// SANITIZE-WITH-ATTR-NEXT: store i32 -1, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA4]] // SANITIZE-WITH-ATTR-NEXT: ret void // // NO-SANITIZE-WITH-ATTR-LABEL: define dso_local void @test5( -// NO-SANITIZE-WITH-ATTR-SAME: ptr nocapture noundef [[P:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR1]] { +// NO-SANITIZE-WITH-ATTR-SAME: ptr noundef [[P:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { // NO-SANITIZE-WITH-ATTR-NEXT: entry: -// NO-SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_GEP:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 8 -// NO-SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_LOAD:%.*]] = load i64, ptr [[DOT_COUNTED_BY_GEP]], align 4 -// NO-SANITIZE-WITH-ATTR-NEXT: [[DOTINV:%.*]] = icmp slt i64 [[DOT_COUNTED_BY_LOAD]], 0 -// NO-SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_LOAD_TR:%.*]] = trunc i64 [[DOT_COUNTED_BY_LOAD]] to i32 -// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = shl i32 [[DOT_COUNTED_BY_LOAD_TR]], 2 -// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = add i32 [[TMP0]], 16 -// NO-SANITIZE-WITH-ATTR-NEXT: [[CONV:%.*]] = select i1 [[DOTINV]], i32 0, i32 [[TMP1]] -// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 16 +// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 16 // NO-SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64 -// NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i32], ptr [[TMP2]], i64 0, i64 [[IDXPROM]] -// NO-SANITIZE-WITH-ATTR-NEXT: store i32 [[CONV]], ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2]] +// NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i32], ptr [[TMP0]], i64 0, i64 [[IDXPROM]] +// NO-SANITIZE-WITH-ATTR-NEXT: store i32 -1, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2]] // NO-SANITIZE-WITH-ATTR-NEXT: ret void // // SANITIZE-WITHOUT-ATTR-LABEL: define dso_local void @test5( @@ -545,27 +501,15 @@ void test5(struct anon_struct *p, int index) { p->array[index] = __builtin_dynamic_object_size(p, 1); } -// SANITIZE-WITH-ATTR-LABEL: define dso_local range(i64 16, 1) i64 @test5_bdos( -// SANITIZE-WITH-ATTR-SAME: ptr nocapture noundef readonly [[P:%.*]]) local_unnamed_addr #[[ATTR2]] { +// SANITIZE-WITH-ATTR-LABEL: define 
dso_local i64 @test5_bdos( +// SANITIZE-WITH-ATTR-SAME: ptr noundef readnone [[P:%.*]]) local_unnamed_addr #[[ATTR3]] { // SANITIZE-WITH-ATTR-NEXT: entry: -// SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_GEP:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 8 -// SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_LOAD:%.*]] = load i64, ptr [[DOT_COUNTED_BY_GEP]], align 4 -// SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = shl nuw i64 [[DOT_COUNTED_BY_LOAD]], 2 -// SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = add nuw i64 [[TMP0]], 16 -// SANITIZE-WITH-ATTR-NEXT: [[DOTINV:%.*]] = icmp slt i64 [[DOT_COUNTED_BY_LOAD]], 0 -// SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = select i1 [[DOTINV]], i64 0, i64 [[TMP1]] -// SANITIZE-WITH-ATTR-NEXT: ret i64 [[TMP2]] +// SANITIZE-WITH-ATTR-NEXT: ret i64 -1 // -// NO-SANITIZE-WITH-ATTR-LABEL: define dso_local range(i64 16, 1) i64 @test5_bdos( -// NO-SANITIZE-WITH-ATTR-SAME: ptr nocapture noundef readonly [[P:%.*]]) local_unnamed_addr #[[ATTR2]] { +// NO-SANITIZE-WITH-ATTR-LABEL: define dso_local i64 @test5_bdos( +// NO-SANITIZE-WITH-ATTR-SAME: ptr noundef readnone [[P:%.*]]) local_unnamed_addr #[[ATTR3]] { // NO-SANITIZE-WITH-ATTR-NEXT: entry: -// NO-SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_GEP:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 8 -// NO-SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_LOAD:%.*]] = load i64, ptr [[DOT_COUNTED_BY_GEP]], align 4 -// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = shl nuw i64 [[DOT_COUNTED_BY_LOAD]], 2 -// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = add nuw i64 [[TMP0]], 16 -// NO-SANITIZE-WITH-ATTR-NEXT: [[DOTINV:%.*]] = icmp slt i64 [[DOT_COUNTED_BY_LOAD]], 0 -// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = select i1 [[DOTINV]], i64 0, i64 [[TMP1]] -// NO-SANITIZE-WITH-ATTR-NEXT: ret i64 [[TMP2]] +// NO-SANITIZE-WITH-ATTR-NEXT: ret i64 -1 // // SANITIZE-WITHOUT-ATTR-LABEL: define dso_local i64 @test5_bdos( // SANITIZE-WITHOUT-ATTR-SAME: ptr noundef readnone [[P:%.*]]) local_unnamed_addr #[[ATTR2]] { @@ -590,7 +534,7 @@ size_t test5_bdos(struct anon_struct *p) { // SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = icmp ugt i64 [[DOT_COUNTED_BY_LOAD]], [[IDXPROM]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP0]], label [[CONT3:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF3]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR: handler.out_of_bounds: -// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB9:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR10]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB9:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR9]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]] // SANITIZE-WITH-ATTR: cont3: // SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 16 @@ -683,7 +627,7 @@ size_t test6_bdos(struct anon_struct *p) { // SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = icmp ult i64 [[IDXPROM]], [[TMP1]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP2]], label [[CONT7:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF3]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR: handler.out_of_bounds: -// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB11:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR10]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB11:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR9]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: 
unreachable, !nosanitize [[META2]] // SANITIZE-WITH-ATTR: cont7: // SANITIZE-WITH-ATTR-NEXT: [[INTS:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 9 @@ -723,12 +667,12 @@ void test7(struct union_of_fams *p, int index) { } // SANITIZE-WITH-ATTR-LABEL: define dso_local i64 @test7_bdos( -// SANITIZE-WITH-ATTR-SAME: ptr noundef readnone [[P:%.*]]) local_unnamed_addr #[[ATTR4:[0-9]+]] { +// SANITIZE-WITH-ATTR-SAME: ptr noundef readnone [[P:%.*]]) local_unnamed_addr #[[ATTR3]] { // SANITIZE-WITH-ATTR-NEXT: entry: // SANITIZE-WITH-ATTR-NEXT: ret i64 -1 // // NO-SANITIZE-WITH-ATTR-LABEL: define dso_local i64 @test7_bdos( -// NO-SANITIZE-WITH-ATTR-SAME: ptr noundef readnone [[P:%.*]]) local_unnamed_addr #[[ATTR4:[0-9]+]] { +// NO-SANITIZE-WITH-ATTR-SAME: ptr noundef readnone [[P:%.*]]) local_unnamed_addr #[[ATTR3]] { // NO-SANITIZE-WITH-ATTR-NEXT: entry: // NO-SANITIZE-WITH-ATTR-NEXT: ret i64 -1 // @@ -756,7 +700,7 @@ size_t test7_bdos(struct union_of_fams *p) { // SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = icmp ult i64 [[IDXPROM]], [[TMP0]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP1]], label [[CONT7:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF3]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR: handler.out_of_bounds: -// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB12:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR10]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB12:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR9]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]] // SANITIZE-WITH-ATTR: cont7: // SANITIZE-WITH-ATTR-NEXT: [[INTS:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 9 @@ -837,7 +781,7 @@ size_t test8_bdos(struct union_of_fams *p) { // SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = icmp ult i64 [[IDXPROM]], [[TMP1]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP2]], label [[CONT7:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF3]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR: handler.out_of_bounds: -// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB14:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR10]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB14:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR9]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]] // SANITIZE-WITH-ATTR: cont7: // SANITIZE-WITH-ATTR-NEXT: [[BYTES:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 12 @@ -877,12 +821,12 @@ void test9(struct union_of_fams *p, int index) { } // SANITIZE-WITH-ATTR-LABEL: define dso_local i64 @test9_bdos( -// SANITIZE-WITH-ATTR-SAME: ptr noundef readnone [[P:%.*]]) local_unnamed_addr #[[ATTR4]] { +// SANITIZE-WITH-ATTR-SAME: ptr noundef readnone [[P:%.*]]) local_unnamed_addr #[[ATTR3]] { // SANITIZE-WITH-ATTR-NEXT: entry: // SANITIZE-WITH-ATTR-NEXT: ret i64 -1 // // NO-SANITIZE-WITH-ATTR-LABEL: define dso_local i64 @test9_bdos( -// NO-SANITIZE-WITH-ATTR-SAME: ptr noundef readnone [[P:%.*]]) local_unnamed_addr #[[ATTR4]] { +// NO-SANITIZE-WITH-ATTR-SAME: ptr noundef readnone [[P:%.*]]) local_unnamed_addr #[[ATTR3]] { // NO-SANITIZE-WITH-ATTR-NEXT: entry: // NO-SANITIZE-WITH-ATTR-NEXT: ret i64 -1 // @@ -910,7 +854,7 @@ size_t test9_bdos(struct union_of_fams *p) { // SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = icmp ult i64 [[IDXPROM]], [[TMP0]], !nosanitize [[META2]] // 
SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP1]], label [[CONT7:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF3]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR: handler.out_of_bounds: -// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB15:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR10]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB15:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR9]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]] // SANITIZE-WITH-ATTR: cont7: // SANITIZE-WITH-ATTR-NEXT: [[BYTES:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 12 @@ -997,7 +941,7 @@ size_t test10_bdos(struct union_of_fams *p) { // SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = icmp ult i64 [[IDXPROM]], [[TMP0]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP1]], label [[CONT3:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF3]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR: handler.out_of_bounds: -// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB16:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR10]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB16:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR9]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]] // SANITIZE-WITH-ATTR: cont3: // SANITIZE-WITH-ATTR-NEXT: [[ARRAY:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 12 @@ -1037,12 +981,12 @@ void test11(struct annotated *p, int index) { } // SANITIZE-WITH-ATTR-LABEL: define dso_local noundef i64 @test11_bdos( -// SANITIZE-WITH-ATTR-SAME: ptr nocapture noundef readnone [[P:%.*]]) local_unnamed_addr #[[ATTR4]] { +// SANITIZE-WITH-ATTR-SAME: ptr nocapture noundef readnone [[P:%.*]]) local_unnamed_addr #[[ATTR3]] { // SANITIZE-WITH-ATTR-NEXT: entry: // SANITIZE-WITH-ATTR-NEXT: ret i64 4 // // NO-SANITIZE-WITH-ATTR-LABEL: define dso_local noundef i64 @test11_bdos( -// NO-SANITIZE-WITH-ATTR-SAME: ptr nocapture noundef readnone [[P:%.*]]) local_unnamed_addr #[[ATTR4]] { +// NO-SANITIZE-WITH-ATTR-SAME: ptr nocapture noundef readnone [[P:%.*]]) local_unnamed_addr #[[ATTR3]] { // NO-SANITIZE-WITH-ATTR-NEXT: entry: // NO-SANITIZE-WITH-ATTR-NEXT: ret i64 4 // @@ -1076,16 +1020,16 @@ struct hang { int test12_a, test12_b; // SANITIZE-WITH-ATTR-LABEL: define dso_local noundef i32 @test12( -// SANITIZE-WITH-ATTR-SAME: i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR5:[0-9]+]] { +// SANITIZE-WITH-ATTR-SAME: i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR4:[0-9]+]] { // SANITIZE-WITH-ATTR-NEXT: entry: // SANITIZE-WITH-ATTR-NEXT: [[BAZ:%.*]] = alloca [[STRUCT_HANG:%.*]], align 4 -// SANITIZE-WITH-ATTR-NEXT: call void @llvm.lifetime.start.p0(i64 24, ptr nonnull [[BAZ]]) #[[ATTR11:[0-9]+]] +// SANITIZE-WITH-ATTR-NEXT: call void @llvm.lifetime.start.p0(i64 24, ptr nonnull [[BAZ]]) #[[ATTR10:[0-9]+]] // SANITIZE-WITH-ATTR-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 4 dereferenceable(24) [[BAZ]], ptr noundef nonnull align 4 dereferenceable(24) @test12_bar, i64 24, i1 false), !tbaa.struct [[TBAA_STRUCT9:![0-9]+]] // SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = icmp ult i32 [[INDEX]], 6 // SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = zext i32 [[INDEX]] to i64 // SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP0]], label [[CONT:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF3]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR: 
handler.out_of_bounds: -// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB18:[0-9]+]], i64 [[TMP1]]) #[[ATTR10]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB18:[0-9]+]], i64 [[TMP1]]) #[[ATTR9]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]] // SANITIZE-WITH-ATTR: cont: // SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [6 x i32], ptr [[BAZ]], i64 0, i64 [[TMP1]] @@ -1095,17 +1039,17 @@ int test12_a, test12_b; // SANITIZE-WITH-ATTR-NEXT: [[DOTNOT:%.*]] = icmp eq i32 [[DOTCOUNTED_BY_LOAD]], 0 // SANITIZE-WITH-ATTR-NEXT: br i1 [[DOTNOT]], label [[HANDLER_OUT_OF_BOUNDS4:%.*]], label [[HANDLER_TYPE_MISMATCH6:%.*]], !prof [[PROF10:![0-9]+]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR: handler.out_of_bounds4: -// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB20:[0-9]+]], i64 0) #[[ATTR10]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB20:[0-9]+]], i64 0) #[[ATTR9]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]] // SANITIZE-WITH-ATTR: handler.type_mismatch6: -// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_type_mismatch_v1_abort(ptr nonnull @[[GLOB21:[0-9]+]], i64 ptrtoint (ptr getelementptr inbounds nuw (i8, ptr @test12_foo, i64 4) to i64)) #[[ATTR10]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_type_mismatch_v1_abort(ptr nonnull @[[GLOB21:[0-9]+]], i64 ptrtoint (ptr getelementptr inbounds nuw (i8, ptr @test12_foo, i64 4) to i64)) #[[ATTR9]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]] // // NO-SANITIZE-WITH-ATTR-LABEL: define dso_local noundef i32 @test12( -// NO-SANITIZE-WITH-ATTR-SAME: i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR5:[0-9]+]] { +// NO-SANITIZE-WITH-ATTR-SAME: i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR4:[0-9]+]] { // NO-SANITIZE-WITH-ATTR-NEXT: entry: // NO-SANITIZE-WITH-ATTR-NEXT: [[BAZ:%.*]] = alloca [[STRUCT_HANG:%.*]], align 4 -// NO-SANITIZE-WITH-ATTR-NEXT: call void @llvm.lifetime.start.p0(i64 24, ptr nonnull [[BAZ]]) #[[ATTR12:[0-9]+]] +// NO-SANITIZE-WITH-ATTR-NEXT: call void @llvm.lifetime.start.p0(i64 24, ptr nonnull [[BAZ]]) #[[ATTR11:[0-9]+]] // NO-SANITIZE-WITH-ATTR-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 4 dereferenceable(24) [[BAZ]], ptr noundef nonnull align 4 dereferenceable(24) @test12_bar, i64 24, i1 false), !tbaa.struct [[TBAA_STRUCT7:![0-9]+]] // NO-SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64 // NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [6 x i32], ptr [[BAZ]], i64 0, i64 [[IDXPROM]] @@ -1188,7 +1132,7 @@ struct test13_bar { // SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = icmp ult i64 [[INDEX]], [[TMP1]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP2]], label [[CONT5:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF3]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR: handler.out_of_bounds: -// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB24:[0-9]+]], i64 [[INDEX]]) #[[ATTR10]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB24:[0-9]+]], i64 [[INDEX]]) #[[ATTR9]], !nosanitize [[META2]] // 
SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]] // SANITIZE-WITH-ATTR: cont5: // SANITIZE-WITH-ATTR-NEXT: [[REVMAP:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 16 @@ -1197,7 +1141,7 @@ struct test13_bar { // SANITIZE-WITH-ATTR-NEXT: ret i32 0 // // NO-SANITIZE-WITH-ATTR-LABEL: define dso_local noundef i32 @test13( -// NO-SANITIZE-WITH-ATTR-SAME: i64 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR8:[0-9]+]] { +// NO-SANITIZE-WITH-ATTR-SAME: i64 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR7:[0-9]+]] { // NO-SANITIZE-WITH-ATTR-NEXT: entry: // NO-SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = load ptr, ptr @test13_f, align 8, !tbaa [[TBAA8:![0-9]+]] // NO-SANITIZE-WITH-ATTR-NEXT: [[REVMAP:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 16 @@ -1249,14 +1193,14 @@ struct test14_foo { // SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP0]], label [[TRAP:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF3]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR: handler.out_of_bounds: // SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[IDX]] to i64 -// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB25:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR10]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB25:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR9]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]] // SANITIZE-WITH-ATTR: trap: -// SANITIZE-WITH-ATTR-NEXT: tail call void @llvm.trap() #[[ATTR10]] +// SANITIZE-WITH-ATTR-NEXT: tail call void @llvm.trap() #[[ATTR9]] // SANITIZE-WITH-ATTR-NEXT: unreachable // // NO-SANITIZE-WITH-ATTR-LABEL: define dso_local i32 @test14( -// NO-SANITIZE-WITH-ATTR-SAME: i32 noundef [[IDX:%.*]]) local_unnamed_addr #[[ATTR4]] { +// NO-SANITIZE-WITH-ATTR-SAME: i32 noundef [[IDX:%.*]]) local_unnamed_addr #[[ATTR3]] { // NO-SANITIZE-WITH-ATTR-NEXT: entry: // NO-SANITIZE-WITH-ATTR-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca [[STRUCT_TEST14_FOO:%.*]], align 4 // NO-SANITIZE-WITH-ATTR-NEXT: store i32 1, ptr [[DOTCOMPOUNDLITERAL]], align 4, !tbaa [[TBAA2]] @@ -1305,14 +1249,14 @@ int test14(int idx) { // SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP0]], label [[TRAP:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF3]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR: handler.out_of_bounds: // SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[IDX]] to i64 -// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB27:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR10]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB27:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR9]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]] // SANITIZE-WITH-ATTR: trap: -// SANITIZE-WITH-ATTR-NEXT: tail call void @llvm.trap() #[[ATTR10]] +// SANITIZE-WITH-ATTR-NEXT: tail call void @llvm.trap() #[[ATTR9]] // SANITIZE-WITH-ATTR-NEXT: unreachable // // NO-SANITIZE-WITH-ATTR-LABEL: define dso_local i32 @test15( -// NO-SANITIZE-WITH-ATTR-SAME: i32 noundef [[IDX:%.*]]) local_unnamed_addr #[[ATTR4]] { +// NO-SANITIZE-WITH-ATTR-SAME: i32 noundef [[IDX:%.*]]) local_unnamed_addr #[[ATTR3]] { // NO-SANITIZE-WITH-ATTR-NEXT: entry: // NO-SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[IDX]] to i64 // NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i32], ptr getelementptr inbounds nuw (i8, ptr @__const.test15.foo, i64 
8), i64 0, i64 [[IDXPROM]] @@ -1350,12 +1294,12 @@ int test15(int idx) { } // SANITIZE-WITH-ATTR-LABEL: define dso_local noundef i64 @test19( -// SANITIZE-WITH-ATTR-SAME: ptr nocapture noundef readnone [[P:%.*]]) local_unnamed_addr #[[ATTR4]] { +// SANITIZE-WITH-ATTR-SAME: ptr nocapture noundef readnone [[P:%.*]]) local_unnamed_addr #[[ATTR3]] { // SANITIZE-WITH-ATTR-NEXT: entry: // SANITIZE-WITH-ATTR-NEXT: ret i64 -1 // // NO-SANITIZE-WITH-ATTR-LABEL: define dso_local noundef i64 @test19( -// NO-SANITIZE-WITH-ATTR-SAME: ptr nocapture noundef readnone [[P:%.*]]) local_unnamed_addr #[[ATTR4]] { +// NO-SANITIZE-WITH-ATTR-SAME: ptr nocapture noundef readnone [[P:%.*]]) local_unnamed_addr #[[ATTR3]] { // NO-SANITIZE-WITH-ATTR-NEXT: entry: // NO-SANITIZE-WITH-ATTR-NEXT: ret i64 -1 // @@ -1375,12 +1319,12 @@ size_t test19(struct annotated *p) { } // SANITIZE-WITH-ATTR-LABEL: define dso_local noundef i64 @test20( -// SANITIZE-WITH-ATTR-SAME: ptr nocapture noundef readnone [[P:%.*]]) local_unnamed_addr #[[ATTR4]] { +// SANITIZE-WITH-ATTR-SAME: ptr nocapture noundef readnone [[P:%.*]]) local_unnamed_addr #[[ATTR3]] { // SANITIZE-WITH-ATTR-NEXT: entry: // SANITIZE-WITH-ATTR-NEXT: ret i64 -1 // // NO-SANITIZE-WITH-ATTR-LABEL: define dso_local noundef i64 @test20( -// NO-SANITIZE-WITH-ATTR-SAME: ptr nocapture noundef readnone [[P:%.*]]) local_unnamed_addr #[[ATTR4]] { +// NO-SANITIZE-WITH-ATTR-SAME: ptr nocapture noundef readnone [[P:%.*]]) local_unnamed_addr #[[ATTR3]] { // NO-SANITIZE-WITH-ATTR-NEXT: entry: // NO-SANITIZE-WITH-ATTR-NEXT: ret i64 -1 // @@ -1400,12 +1344,12 @@ size_t test20(struct annotated *p) { } // SANITIZE-WITH-ATTR-LABEL: define dso_local noundef i64 @test21( -// SANITIZE-WITH-ATTR-SAME: ptr nocapture noundef readnone [[P:%.*]]) local_unnamed_addr #[[ATTR4]] { +// SANITIZE-WITH-ATTR-SAME: ptr nocapture noundef readnone [[P:%.*]]) local_unnamed_addr #[[ATTR3]] { // SANITIZE-WITH-ATTR-NEXT: entry: // SANITIZE-WITH-ATTR-NEXT: ret i64 -1 // // NO-SANITIZE-WITH-ATTR-LABEL: define dso_local noundef i64 @test21( -// NO-SANITIZE-WITH-ATTR-SAME: ptr nocapture noundef readnone [[P:%.*]]) local_unnamed_addr #[[ATTR4]] { +// NO-SANITIZE-WITH-ATTR-SAME: ptr nocapture noundef readnone [[P:%.*]]) local_unnamed_addr #[[ATTR3]] { // NO-SANITIZE-WITH-ATTR-NEXT: entry: // NO-SANITIZE-WITH-ATTR-NEXT: ret i64 -1 // @@ -1425,12 +1369,12 @@ size_t test21(struct annotated *p) { } // SANITIZE-WITH-ATTR-LABEL: define dso_local noundef i64 @test22( -// SANITIZE-WITH-ATTR-SAME: ptr nocapture noundef readnone [[P:%.*]]) local_unnamed_addr #[[ATTR4]] { +// SANITIZE-WITH-ATTR-SAME: ptr nocapture noundef readnone [[P:%.*]]) local_unnamed_addr #[[ATTR3]] { // SANITIZE-WITH-ATTR-NEXT: entry: // SANITIZE-WITH-ATTR-NEXT: ret i64 -1 // // NO-SANITIZE-WITH-ATTR-LABEL: define dso_local noundef i64 @test22( -// NO-SANITIZE-WITH-ATTR-SAME: ptr nocapture noundef readnone [[P:%.*]]) local_unnamed_addr #[[ATTR4]] { +// NO-SANITIZE-WITH-ATTR-SAME: ptr nocapture noundef readnone [[P:%.*]]) local_unnamed_addr #[[ATTR3]] { // NO-SANITIZE-WITH-ATTR-NEXT: entry: // NO-SANITIZE-WITH-ATTR-NEXT: ret i64 -1 // @@ -1450,12 +1394,12 @@ size_t test22(struct annotated *p) { } // SANITIZE-WITH-ATTR-LABEL: define dso_local noundef i64 @test23( -// SANITIZE-WITH-ATTR-SAME: ptr nocapture noundef readnone [[P:%.*]]) local_unnamed_addr #[[ATTR4]] { +// SANITIZE-WITH-ATTR-SAME: ptr nocapture noundef readnone [[P:%.*]]) local_unnamed_addr #[[ATTR3]] { // SANITIZE-WITH-ATTR-NEXT: entry: // SANITIZE-WITH-ATTR-NEXT: ret i64 -1 // // 
NO-SANITIZE-WITH-ATTR-LABEL: define dso_local noundef i64 @test23( -// NO-SANITIZE-WITH-ATTR-SAME: ptr nocapture noundef readnone [[P:%.*]]) local_unnamed_addr #[[ATTR4]] { +// NO-SANITIZE-WITH-ATTR-SAME: ptr nocapture noundef readnone [[P:%.*]]) local_unnamed_addr #[[ATTR3]] { // NO-SANITIZE-WITH-ATTR-NEXT: entry: // NO-SANITIZE-WITH-ATTR-NEXT: ret i64 -1 // @@ -1487,7 +1431,7 @@ struct tests_foo { // SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = icmp ugt i32 [[DOTCOUNTED_BY_LOAD]], 10 // SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP0]], label [[CONT4:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF3]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR: handler.out_of_bounds: -// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB28:[0-9]+]], i64 10) #[[ATTR10]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB28:[0-9]+]], i64 10) #[[ATTR9]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]] // SANITIZE-WITH-ATTR: cont4: // SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[VAR]], i64 84 @@ -1528,7 +1472,7 @@ int test24(int c, struct tests_foo *var) { // SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = icmp ugt i32 [[DOTCOUNTED_BY_LOAD]], 10 // SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP1]], label [[CONT5:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF3]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR: handler.out_of_bounds: -// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB29:[0-9]+]], i64 10) #[[ATTR10]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB29:[0-9]+]], i64 10) #[[ATTR9]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]] // SANITIZE-WITH-ATTR: cont5: // SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 44 @@ -1536,7 +1480,7 @@ int test24(int c, struct tests_foo *var) { // SANITIZE-WITH-ATTR-NEXT: ret i32 [[TMP2]] // // NO-SANITIZE-WITH-ATTR-LABEL: define dso_local i32 @test25( -// NO-SANITIZE-WITH-ATTR-SAME: i32 noundef [[C:%.*]], ptr nocapture noundef readonly [[VAR:%.*]]) local_unnamed_addr #[[ATTR9:[0-9]+]] { +// NO-SANITIZE-WITH-ATTR-SAME: i32 noundef [[C:%.*]], ptr nocapture noundef readonly [[VAR:%.*]]) local_unnamed_addr #[[ATTR8:[0-9]+]] { // NO-SANITIZE-WITH-ATTR-NEXT: entry: // NO-SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 8, !tbaa [[TBAA11]] // NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 44 @@ -1580,7 +1524,7 @@ struct test26_foo { // SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = icmp ult i64 [[IDXPROM]], [[TMP0]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP1]], label [[CONT5:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF3]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR: handler.out_of_bounds: -// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB30:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR10]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB30:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR9]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]] // SANITIZE-WITH-ATTR: cont5: // SANITIZE-WITH-ATTR-NEXT: [[ARR:%.*]] = getelementptr inbounds nuw i8, ptr [[FOO]], i64 8 @@ -1651,7 +1595,7 @@ struct 
test27_foo { // SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = icmp ult i64 [[IDXPROM]], [[TMP0]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP1]], label [[CONT3:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF3]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR: handler.out_of_bounds: -// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB32:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR10]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB32:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR9]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]] // SANITIZE-WITH-ATTR: cont3: // SANITIZE-WITH-ATTR-NEXT: [[ENTRIES:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 24 @@ -1717,7 +1661,7 @@ struct test28_foo { // SANITIZE-WITH-ATTR-NEXT: [[TMP4:%.*]] = icmp ult i64 [[IDXPROM]], [[TMP3]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP4]], label [[CONT17:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF3]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR: handler.out_of_bounds: -// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB34:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR10]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB34:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR9]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]] // SANITIZE-WITH-ATTR: cont17: // SANITIZE-WITH-ATTR-NEXT: [[ARR:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP2]], i64 12 @@ -1726,7 +1670,7 @@ struct test28_foo { // SANITIZE-WITH-ATTR-NEXT: ret i32 [[TMP5]] // // NO-SANITIZE-WITH-ATTR-LABEL: define dso_local i32 @test28( -// NO-SANITIZE-WITH-ATTR-SAME: ptr nocapture noundef readonly [[P:%.*]], i32 noundef [[I:%.*]]) local_unnamed_addr #[[ATTR9]] { +// NO-SANITIZE-WITH-ATTR-SAME: ptr nocapture noundef readonly [[P:%.*]], i32 noundef [[I:%.*]]) local_unnamed_addr #[[ATTR8]] { // NO-SANITIZE-WITH-ATTR-NEXT: entry: // NO-SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = load ptr, ptr [[P]], align 8, !tbaa [[TBAA11]] // NO-SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP0]], align 8, !tbaa [[TBAA11]] @@ -1779,7 +1723,7 @@ struct annotated_struct_array { // SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = zext i32 [[IDX1]] to i64 // SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP0]], label [[CONT3:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF3]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR: handler.out_of_bounds: -// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB36:[0-9]+]], i64 [[TMP1]]) #[[ATTR10]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB36:[0-9]+]], i64 [[TMP1]]) #[[ATTR9]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]] // SANITIZE-WITH-ATTR: cont3: // SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x ptr], ptr [[ANN]], i64 0, i64 [[TMP1]] @@ -1791,7 +1735,7 @@ struct annotated_struct_array { // SANITIZE-WITH-ATTR-NEXT: [[TMP4:%.*]] = icmp ult i64 [[IDXPROM15]], [[TMP3]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP4]], label [[CONT20:%.*]], label [[HANDLER_OUT_OF_BOUNDS16:%.*]], !prof [[PROF3]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR: handler.out_of_bounds16: -// SANITIZE-WITH-ATTR-NEXT: tail call void 
@__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB37:[0-9]+]], i64 [[IDXPROM15]]) #[[ATTR10]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB37:[0-9]+]], i64 [[IDXPROM15]]) #[[ATTR9]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]] // SANITIZE-WITH-ATTR: cont20: // SANITIZE-WITH-ATTR-NEXT: [[ARRAY:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP2]], i64 12 @@ -1803,7 +1747,7 @@ struct annotated_struct_array { // SANITIZE-WITH-ATTR-NEXT: ret void // // NO-SANITIZE-WITH-ATTR-LABEL: define dso_local void @test29( -// NO-SANITIZE-WITH-ATTR-SAME: ptr nocapture noundef readonly [[ANN:%.*]], i32 noundef [[IDX1:%.*]], i32 noundef [[IDX2:%.*]]) local_unnamed_addr #[[ATTR10:[0-9]+]] { +// NO-SANITIZE-WITH-ATTR-SAME: ptr nocapture noundef readonly [[ANN:%.*]], i32 noundef [[IDX1:%.*]], i32 noundef [[IDX2:%.*]]) local_unnamed_addr #[[ATTR9:[0-9]+]] { // NO-SANITIZE-WITH-ATTR-NEXT: entry: // NO-SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[IDX1]] to i64 // NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x ptr], ptr [[ANN]], i64 0, i64 [[IDXPROM]] @@ -1865,26 +1809,19 @@ struct test30_struct { }; // SANITIZE-WITH-ATTR-LABEL: define dso_local void @test30( -// SANITIZE-WITH-ATTR-SAME: ptr noundef [[PTR:%.*]], i32 noundef [[IDX:%.*]]) local_unnamed_addr #[[ATTR5]] { +// SANITIZE-WITH-ATTR-SAME: ptr noundef [[PTR:%.*]], i32 noundef [[IDX:%.*]]) local_unnamed_addr #[[ATTR4]] { // SANITIZE-WITH-ATTR-NEXT: entry: // SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = zext i32 [[IDX]] to i64, !nosanitize [[META2]] -// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB39:[0-9]+]], i64 [[TMP0]]) #[[ATTR10]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB39:[0-9]+]], i64 [[TMP0]]) #[[ATTR9]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]] // // NO-SANITIZE-WITH-ATTR-LABEL: define dso_local void @test30( -// NO-SANITIZE-WITH-ATTR-SAME: ptr nocapture noundef [[PTR:%.*]], i32 noundef [[IDX:%.*]]) local_unnamed_addr #[[ATTR1]] { +// NO-SANITIZE-WITH-ATTR-SAME: ptr noundef [[PTR:%.*]], i32 noundef [[IDX:%.*]]) local_unnamed_addr #[[ATTR0]] { // NO-SANITIZE-WITH-ATTR-NEXT: entry: -// NO-SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_GEP:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i64 8 -// NO-SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_LOAD:%.*]] = load i32, ptr [[DOT_COUNTED_BY_GEP]], align 4 -// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.smax.i32(i32 [[DOT_COUNTED_BY_LOAD]], i32 4) -// NO-SANITIZE-WITH-ATTR-NEXT: [[DOTINV:%.*]] = icmp slt i32 [[DOT_COUNTED_BY_LOAD]], 0 -// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = trunc i32 [[TMP0]] to i8 -// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = add i8 [[TMP1]], 12 -// NO-SANITIZE-WITH-ATTR-NEXT: [[CONV:%.*]] = select i1 [[DOTINV]], i8 0, i8 [[TMP2]] // NO-SANITIZE-WITH-ATTR-NEXT: [[PCPU_REFCNT:%.*]] = getelementptr inbounds nuw i8, ptr [[PTR]], i64 12 // NO-SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[IDX]] to i64 // NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i8], ptr [[PCPU_REFCNT]], i64 0, i64 [[IDXPROM]] -// NO-SANITIZE-WITH-ATTR-NEXT: store i8 [[CONV]], ptr [[ARRAYIDX]], align 1, !tbaa [[TBAA6]] +// NO-SANITIZE-WITH-ATTR-NEXT: store i8 -1, ptr [[ARRAYIDX]], align 1, !tbaa [[TBAA6]] // NO-SANITIZE-WITH-ATTR-NEXT: ret void 
// // SANITIZE-WITHOUT-ATTR-LABEL: define dso_local void @test30( @@ -1916,30 +1853,14 @@ struct test31_struct { }; // SANITIZE-WITH-ATTR-LABEL: define dso_local i32 @test31( -// SANITIZE-WITH-ATTR-SAME: ptr nocapture noundef readonly [[PTR:%.*]], i32 noundef [[IDX:%.*]]) local_unnamed_addr #[[ATTR2]] { +// SANITIZE-WITH-ATTR-SAME: ptr noundef [[PTR:%.*]], i32 noundef [[IDX:%.*]]) local_unnamed_addr #[[ATTR3]] { // SANITIZE-WITH-ATTR-NEXT: entry: -// SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_LOAD:%.*]] = load i32, ptr [[PTR]], align 4 -// SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = sext i32 [[DOT_COUNTED_BY_LOAD]] to i64 -// SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = shl nsw i64 [[TMP0]], 2 -// SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = tail call i64 @llvm.smax.i64(i64 [[TMP1]], i64 0) -// SANITIZE-WITH-ATTR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32 -// SANITIZE-WITH-ATTR-NEXT: [[TMP4:%.*]] = add i32 [[TMP3]], 4 -// SANITIZE-WITH-ATTR-NEXT: [[DOTINV:%.*]] = icmp slt i32 [[DOT_COUNTED_BY_LOAD]], 0 -// SANITIZE-WITH-ATTR-NEXT: [[CONV:%.*]] = select i1 [[DOTINV]], i32 0, i32 [[TMP4]] -// SANITIZE-WITH-ATTR-NEXT: ret i32 [[CONV]] +// SANITIZE-WITH-ATTR-NEXT: ret i32 -1 // // NO-SANITIZE-WITH-ATTR-LABEL: define dso_local i32 @test31( -// NO-SANITIZE-WITH-ATTR-SAME: ptr nocapture noundef readonly [[PTR:%.*]], i32 noundef [[IDX:%.*]]) local_unnamed_addr #[[ATTR2]] { +// NO-SANITIZE-WITH-ATTR-SAME: ptr noundef [[PTR:%.*]], i32 noundef [[IDX:%.*]]) local_unnamed_addr #[[ATTR3]] { // NO-SANITIZE-WITH-ATTR-NEXT: entry: -// NO-SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_LOAD:%.*]] = load i32, ptr [[PTR]], align 4 -// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = sext i32 [[DOT_COUNTED_BY_LOAD]] to i64 -// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = shl nsw i64 [[TMP0]], 2 -// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = tail call i64 @llvm.smax.i64(i64 [[TMP1]], i64 0) -// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32 -// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP4:%.*]] = add i32 [[TMP3]], 4 -// NO-SANITIZE-WITH-ATTR-NEXT: [[DOTINV:%.*]] = icmp slt i32 [[DOT_COUNTED_BY_LOAD]], 0 -// NO-SANITIZE-WITH-ATTR-NEXT: [[CONV:%.*]] = select i1 [[DOTINV]], i32 0, i32 [[TMP4]] -// NO-SANITIZE-WITH-ATTR-NEXT: ret i32 [[CONV]] +// NO-SANITIZE-WITH-ATTR-NEXT: ret i32 -1 // // SANITIZE-WITHOUT-ATTR-LABEL: define dso_local i32 @test31( // SANITIZE-WITHOUT-ATTR-SAME: ptr noundef [[PTR:%.*]], i32 noundef [[IDX:%.*]]) local_unnamed_addr #[[ATTR2]] { diff --git a/clang/test/CodeGen/code-coverage.c b/clang/test/CodeGen/code-coverage.c index d7994bab35d81a0d69ca0b9833a5b4ccefd2d909..4e3364df21785401b08a5a7ce2f84f1a261b3703 100644 --- a/clang/test/CodeGen/code-coverage.c +++ b/clang/test/CodeGen/code-coverage.c @@ -3,12 +3,18 @@ /// 4.7 enables cfg_checksum. /// 4.8 (default, compatible with gcov 7) emits the exit block the second. 
// RUN: rm -rf %t && mkdir %t && cd %t -// RUN: %clang_cc1 -emit-llvm -disable-red-zone -coverage-data-file=/dev/null -coverage-version='304*' %s -o - | \ -// RUN: FileCheck --check-prefixes=CHECK,304 %s -// RUN: %clang_cc1 -emit-llvm -disable-red-zone -coverage-data-file=/dev/null -coverage-version='407*' %s -o - | \ -// RUN: FileCheck --check-prefixes=CHECK,407 %s -// RUN: %clang_cc1 -emit-llvm -disable-red-zone -coverage-data-file=/dev/null %s -o - | \ -// RUN: FileCheck --check-prefixes=CHECK,408 %s +// RUN: %clang_cc1 -triple x86_64-unknown-unknown -emit-llvm -disable-red-zone -coverage-data-file=/dev/null -coverage-version='304*' %s -o - | \ +// RUN: FileCheck --check-prefixes=CHECK,CHECK-CTOR-INIT,304 %s +// RUN: %clang_cc1 -triple x86_64-unknown-unknown -emit-llvm -disable-red-zone -coverage-data-file=/dev/null -coverage-version='407*' %s -o - | \ +// RUN: FileCheck --check-prefixes=CHECK,CHECK-CTOR-INIT,407 %s +// RUN: %clang_cc1 -triple x86_64-unknown-unknown -emit-llvm -disable-red-zone -coverage-data-file=/dev/null %s -o - | \ +// RUN: FileCheck --check-prefixes=CHECK,CHECK-CTOR-INIT,408 %s +// RUN: %clang_cc1 -triple powerpc64-ibm-aix -emit-llvm -disable-red-zone -coverage-data-file=/dev/null -coverage-version='304*' %s -o - | \ +// RUN: FileCheck --check-prefixes=CHECK,CHECK-RT-INIT,304 %s +// RUN: %clang_cc1 -triple powerpc64-ibm-aix -emit-llvm -disable-red-zone -coverage-data-file=/dev/null -coverage-version='407*' %s -o - | \ +// RUN: FileCheck --check-prefixes=CHECK,CHECK-RT-INIT,407 %s +// RUN: %clang_cc1 -triple powerpc64-ibm-aix -emit-llvm -disable-red-zone -coverage-data-file=/dev/null %s -o - | \ +// RUN: FileCheck --check-prefixes=CHECK,CHECK-RT-INIT,408 %s // RUN: %clang_cc1 -emit-llvm -disable-red-zone -coverage-notes-file=aaa.gcno -coverage-data-file=bbb.gcda -debug-info-kind=limited -dwarf-version=4 %s -o - | FileCheck %s --check-prefix GCOV_FILE_INFO @@ -49,10 +55,13 @@ int test2(int b) { /// 0x3430382a '4' '0' '8' '*' // 408-SAME: i32 875575338 +// Check for gcov initialization function pointers. +// CHECK-RT-INIT: @__llvm_covinit_functions = private constant { ptr, ptr } { ptr @__llvm_gcov_writeout, ptr @__llvm_gcov_reset }, section "__llvm_covinit" + // Check that the noredzone flag is set on the generated functions. 
// CHECK: void @__llvm_gcov_writeout() unnamed_addr [[NRZ:#[0-9]+]] -// CHECK: void @__llvm_gcov_init() unnamed_addr [[NRZ]] +// CHECK-CTOR-INIT: void @__llvm_gcov_init() unnamed_addr [[NRZ]] // CHECK: attributes [[NRZ]] = { {{.*}}noredzone{{.*}} } diff --git a/clang/test/CodeGen/coff-aarch64-type-sizes.c b/clang/test/CodeGen/coff-aarch64-type-sizes.c index 9cb0ddbaef3f65e6e4877d2be1e371340abfe5a1..2e5b94c14e6acf9608a3e6831a2ee71c421f6b64 100644 --- a/clang/test/CodeGen/coff-aarch64-type-sizes.c +++ b/clang/test/CodeGen/coff-aarch64-type-sizes.c @@ -1,6 +1,6 @@ // RUN: %clang_cc1 -triple aarch64-windows -emit-llvm -w -o - %s | FileCheck %s -// CHECK: target datalayout = "e-m:w-p:64:64-i32:32-i64:64-i128:128-n32:64-S128-Fn32" +// CHECK: target datalayout = "e-m:w-p270:32:32-p271:32:32-p272:64:64-p:64:64-i32:32-i64:64-i128:128-n32:64-S128-Fn32" // CHECK: target triple = "aarch64-unknown-windows-msvc" int check_short(void) { diff --git a/clang/test/CodeGen/ms-mixed-ptr-sizes.c b/clang/test/CodeGen/ms-mixed-ptr-sizes.c index 0bc1925b13dbc66c4b60a3d90b22be83480932b5..f99c6196557e1892544afc13c86f0ef90776f143 100644 --- a/clang/test/CodeGen/ms-mixed-ptr-sizes.c +++ b/clang/test/CodeGen/ms-mixed-ptr-sizes.c @@ -1,5 +1,6 @@ // RUN: %clang_cc1 -triple x86_64-windows-msvc -fms-extensions -emit-llvm -O2 < %s | FileCheck %s --check-prefixes=X64,ALL // RUN: %clang_cc1 -triple i386-pc-win32 -fms-extensions -emit-llvm -O2 < %s | FileCheck %s --check-prefixes=X86,ALL +// RUN: %clang_cc1 -triple aarch64-windows-msvc -fms-extensions -emit-llvm -O2 < %s | FileCheck %s --check-prefixes=AARCH64,ALL struct Foo { int * __ptr32 p32; @@ -9,32 +10,40 @@ void use_foo(struct Foo *f); void test_sign_ext(struct Foo *f, int * __ptr32 __sptr i) { // X64-LABEL: define dso_local void @test_sign_ext({{.*}}ptr addrspace(270) noundef %i) // X86-LABEL: define dso_local void @test_sign_ext(ptr noundef %f, ptr noundef %i) +// AARCH64-LABEL: define dso_local void @test_sign_ext({{.*}}ptr addrspace(270) noundef %i) local_unnamed_addr #0 // X64: %{{.+}} = addrspacecast ptr addrspace(270) %i to ptr // X86: %{{.+}} = addrspacecast ptr %i to ptr addrspace(272) +// AARCH64: %{{.+}} = addrspacecast ptr addrspace(270) %i to ptr f->p64 = i; use_foo(f); } void test_zero_ext(struct Foo *f, int * __ptr32 __uptr i) { // X64-LABEL: define dso_local void @test_zero_ext({{.*}}ptr addrspace(271) noundef %i) // X86-LABEL: define dso_local void @test_zero_ext({{.*}}ptr addrspace(271) noundef %i) +// AARCH64-LABEL: define dso_local void @test_zero_ext({{.*}}ptr addrspace(271) noundef %i) local_unnamed_addr #0 // X64: %{{.+}} = addrspacecast ptr addrspace(271) %i to ptr // X86: %{{.+}} = addrspacecast ptr addrspace(271) %i to ptr addrspace(272) +// AARCH64: %{{.+}} = addrspacecast ptr addrspace(271) %i to ptr f->p64 = i; use_foo(f); } void test_trunc(struct Foo *f, int * __ptr64 i) { // X64-LABEL: define dso_local void @test_trunc(ptr noundef %f, ptr noundef %i) // X86-LABEL: define dso_local void @test_trunc({{.*}}ptr addrspace(272) noundef %i) +// AARCH64-LABEL: define dso_local void @test_trunc(ptr noundef %f, ptr noundef %i) local_unnamed_addr #0 // X64: %{{.+}} = addrspacecast ptr %i to ptr addrspace(270) // X86: %{{.+}} = addrspacecast ptr addrspace(272) %i to ptr +// AARCH64: %{{.+}} = addrspacecast ptr %i to ptr addrspace(270) f->p32 = i; use_foo(f); } void test_noop(struct Foo *f, int * __ptr32 i) { // X64-LABEL: define dso_local void @test_noop({{.*}}ptr addrspace(270) noundef %i) // X86-LABEL: define dso_local void @test_noop({{.*}}ptr 
noundef %i) +// AARCH64-LABEL: define dso_local void @test_noop({{.*}}ptr addrspace(270) noundef %i) local_unnamed_addr #0 // X64-NOT: addrspacecast // X86-NOT: addrspacecast +// AARCH64-NOT: addrspacecast f->p32 = i; use_foo(f); } @@ -42,8 +51,10 @@ void test_noop(struct Foo *f, int * __ptr32 i) { void test_other(struct Foo *f, __attribute__((address_space(10))) int *i) { // X64-LABEL: define dso_local void @test_other({{.*}}ptr addrspace(10) noundef %i) // X86-LABEL: define dso_local void @test_other({{.*}}ptr addrspace(10) noundef %i) +// AARCH64-LABEL: define dso_local void @test_other({{.*}}ptr addrspace(10) noundef %i) local_unnamed_addr #0 // X64: %{{.+}} = addrspacecast ptr addrspace(10) %i to ptr addrspace(270) // X86: %{{.+}} = addrspacecast ptr addrspace(10) %i to ptr +// AARCH64: %{{.+}} = addrspacecast ptr addrspace(10) %i to ptr addrspace(270) f->p32 = (int * __ptr32)i; use_foo(f); } @@ -54,6 +65,8 @@ int test_compare1(int *__ptr32 __uptr i, int *__ptr64 j) { // X64: %cmp = icmp eq ptr addrspace(271) %i, %{{.+}} // X86: %{{.+}} = addrspacecast ptr addrspace(272) %j to ptr addrspace(271) // X86: %cmp = icmp eq ptr addrspace(271) %i, %{{.+}} + // AARCH64: %{{.+}} = addrspacecast ptr %j to ptr addrspace(271) + // AARCH64: %cmp = icmp eq ptr addrspace(271) %i, %{{.+}} return (i == j); } @@ -63,6 +76,8 @@ int test_compare2(int *__ptr32 __sptr i, int *__ptr64 j) { // X64: %cmp = icmp eq ptr addrspace(270) %i, %{{.+}} // X86: %{{.+}} = addrspacecast ptr addrspace(272) %j to ptr // X86: %cmp = icmp eq ptr %i, %{{.+}} + // AARCH64: %{{.+}} = addrspacecast ptr %j to ptr addrspace(270) + // AARCH64: %cmp = icmp eq ptr addrspace(270) %i, %{{.+}} return (i == j); } @@ -72,6 +87,8 @@ int test_compare3(int *__ptr32 __uptr i, int *__ptr64 j) { // X64: %cmp = icmp eq ptr %j, %{{.+}} // X86: %{{.+}} = addrspacecast ptr addrspace(271) %i to ptr addrspace(272) // X86: %cmp = icmp eq ptr addrspace(272) %j, %{{.+}} + // AARCH64: %{{.+}} = addrspacecast ptr addrspace(271) %i to ptr + // AARCH64: %cmp = icmp eq ptr %j, %{{.+}} return (j == i); } @@ -81,5 +98,7 @@ int test_compare4(int *__ptr32 __sptr i, int *__ptr64 j) { // X64: %cmp = icmp eq ptr %j, %{{.+}} // X86: %{{.+}} = addrspacecast ptr %i to ptr addrspace(272) // X86: %cmp = icmp eq ptr addrspace(272) %j, %{{.+}} + // AARCH64: %{{.+}} = addrspacecast ptr addrspace(270) %i to ptr + // AARCH64: %cmp = icmp eq ptr %j, %{{.+}} return (j == i); } diff --git a/clang/test/CodeGen/sanitize-coverage-gated-callbacks.c b/clang/test/CodeGen/sanitize-coverage-gated-callbacks.c new file mode 100644 index 0000000000000000000000000000000000000000..9a00d91d5ad0864e51bf9665a43a1019125706d4 --- /dev/null +++ b/clang/test/CodeGen/sanitize-coverage-gated-callbacks.c @@ -0,0 +1,42 @@ +// RUN: %clang %s -target arm64-apple-darwin -emit-llvm -S -fsanitize-coverage=trace-pc-guard -mllvm -sanitizer-coverage-gated-trace-callbacks=1 -o - | FileCheck %s --check-prefixes=CHECK,GATED +// RUN: %clang %s -target arm64-apple-darwin -emit-llvm -S -fsanitize-coverage=trace-pc-guard -mllvm -sanitizer-coverage-gated-trace-callbacks=0 -o - | FileCheck %s --check-prefixes=CHECK,PLAIN +// RUN: not %clang %s -target arm64-apple-darwin -emit-llvm -S -fsanitize-coverage=trace-pc -mllvm -sanitizer-coverage-gated-trace-callbacks=1 -o /dev/null 2>&1 | FileCheck %s --check-prefixes=INCOMPATIBLE +// RUN: not %clang %s -target arm64-apple-darwin -emit-llvm -S -fsanitize-coverage=inline-8bit-counters -mllvm -sanitizer-coverage-gated-trace-callbacks=1 -o /dev/null 2>&1 | FileCheck %s 
--check-prefixes=INCOMPATIBLE +// RUN: not %clang %s -target arm64-apple-darwin -emit-llvm -S -fsanitize-coverage=inline-bool-flag -mllvm -sanitizer-coverage-gated-trace-callbacks=1 -o /dev/null 2>&1 | FileCheck %s --check-prefixes=INCOMPATIBLE + +// Verify that the __sancov_gate section is emitted for the gated callbacks, but not for "plain" trace-pc-guard. +// GATED: section "__DATA,__sancov_gate" +// PLAIN-NOT: section "__DATA,__sancov_gate" + +// Produce an error for all incompatible sanitizer coverage modes. +// INCOMPATIBLE: error: 'sanitizer-coverage-gated-trace-callbacks' is only supported with trace-pc-guard + +int x[10]; + +// CHECK: define{{.*}} void @foo +void foo(int n, int m) { + // COM: Verify that we're emitting the call to __sanitizer_cov_trace_pc_guard upon + // COM: checking the value of __sancov_should_track. + // GATED: [[VAL:%.*]] = load i64, {{.*}}@__sancov_should_track + // GATED-NOT: [[VAL:%.*]] = load i64, i64* @__sancov_should_track + // GATED-NEXT: [[CMP:%.*]] = icmp ne i64 [[VAL]], 0 + // GATED-NEXT: br i1 [[CMP]], label %[[L_TRUE:.*]], label %[[L_FALSE:.*]], !prof [[WEIGHTS:!.+]] + // GATED: [[L_TRUE]]: + // GATED-NEXT: call void @__sanitizer_cov_trace_pc_guard + // GATED: br i1 [[CMP]], label %[[L_TRUE_2:.*]], label %[[L_FALSE_2:.*]] + // GATED: [[L_TRUE_2]]: + // GATED-NEXT: call void @__sanitizer_cov_trace_pc_guard + // GATED: [[WEIGHTS]] = !{!"branch_weights", i32 1, i32 100000} + + // COM: With the non-gated instrumentation, we should not emit the + // COM: __sancov_should_track global. + // PLAIN-NOT: __sancov_should_track + // COM: But we should still emit the calls to the callback. + // PLAIN: call void @__sanitizer_cov_trace_pc_guard + if (n) { + x[n] = 42; + if (m) { + x[m] = 41; + } + } +}
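To make the gating concrete: each instrumentation point loads the module-wide gate and calls the callback only when it is non-zero, which is exactly the load/icmp/br/call sequence the GATED lines match, with the branch weighted as unlikely. A hand-written sketch of one such point (illustrative only; the real instrumentation is emitted by SanitizerCoverage, and guard_var/gated_trace_point are made-up names):

#include <stdint.h>

extern int64_t __sancov_should_track;                 /* the gate */
void __sanitizer_cov_trace_pc_guard(uint32_t *guard); /* the callback */

static uint32_t guard_var; /* stand-in for one edge's guard slot */

static inline void gated_trace_point(void) {
  if (__sancov_should_track) /* weighted 1:100000, i.e. usually skipped */
    __sanitizer_cov_trace_pc_guard(&guard_var);
}

diff --git a/clang/test/CodeGen/stack-protector-guard.c b/clang/test/CodeGen/stack-protector-guard.c index 4777367c94e733527525052d4d8a30b2cff41116..82616ae800c4263f9f6d4d83da8825438bb28dfb 100644 --- a/clang/test/CodeGen/stack-protector-guard.c +++ b/clang/test/CodeGen/stack-protector-guard.c @@ -12,6 +12,12 @@ // RUN: %clang_cc1 -mstack-protector-guard=tls -triple riscv64-unknown-elf \ // RUN: -mstack-protector-guard-offset=44 -mstack-protector-guard-reg=tp \ // RUN: -emit-llvm %s -o - | FileCheck %s --check-prefix=RISCV +// RUN: %clang_cc1 -mstack-protector-guard=tls -triple powerpc64-unknown-elf \ +// RUN: -mstack-protector-guard-offset=52 -mstack-protector-guard-reg=r13 \ +// RUN: -emit-llvm %s -o - | FileCheck %s --check-prefix=POWERPC64 +// RUN: %clang_cc1 -mstack-protector-guard=tls -triple ppc32-unknown-elf \ +// RUN: -mstack-protector-guard-offset=16 -mstack-protector-guard-reg=r2 \ +// RUN: -emit-llvm %s -o - | FileCheck %s --check-prefix=POWERPC32 void foo(int*); void bar(int x) { int baz[x]; @@ -31,3 +37,13 @@ void bar(int x) { // RISCV: [[ATTR1]] = !{i32 1, !"stack-protector-guard", !"tls"} // RISCV: [[ATTR2]] = !{i32 1, !"stack-protector-guard-reg", !"tp"} // RISCV: [[ATTR3]] = !{i32 1, !"stack-protector-guard-offset", i32 44} + +// POWERPC64: !llvm.module.flags = !{{{.*}}[[ATTR1:![0-9]+]], [[ATTR2:![0-9]+]], [[ATTR3:![0-9]+]], [[ATTR4:![0-9]+]]} +// POWERPC64: [[ATTR2]] = !{i32 1, !"stack-protector-guard", !"tls"} +// POWERPC64: [[ATTR3]] = !{i32 1, !"stack-protector-guard-reg", !"r13"} +// POWERPC64: [[ATTR4]] = !{i32 1, !"stack-protector-guard-offset", i32 52} + +// POWERPC32: !llvm.module.flags = !{{{.*}}[[ATTR1:![0-9]+]], [[ATTR2:![0-9]+]], [[ATTR3:![0-9]+]], [[ATTR4:![0-9]+]]} +// POWERPC32: [[ATTR2]] = !{i32 1, !"stack-protector-guard", !"tls"} +// POWERPC32: [[ATTR3]]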
= !{i32 1, !"stack-protector-guard-reg", !"r2"} +// POWERPC32: [[ATTR4]] = !{i32 1, !"stack-protector-guard-offset", i32 16} diff --git a/clang/test/CodeGen/target-data.c b/clang/test/CodeGen/target-data.c index 8548aa00cfe8773c5490b4a89a628463fb0b8e59..26a1bf2a1a574076835b3206af37e665b0671433 100644 --- a/clang/test/CodeGen/target-data.c +++ b/clang/test/CodeGen/target-data.c @@ -185,15 +185,15 @@ // RUN: %clang_cc1 -triple arm64-unknown -o - -emit-llvm %s | \ // RUN: FileCheck %s -check-prefix=AARCH64 -// AARCH64: target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32" +// AARCH64: target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32" // RUN: %clang_cc1 -triple arm64_32-apple-ios7.0 -o - -emit-llvm %s | \ // RUN: FileCheck %s -check-prefix=AARCH64-ILP32 -// AARCH64-ILP32: target datalayout = "e-m:o-p:32:32-i64:64-i128:128-n32:64-S128-Fn32" +// AARCH64-ILP32: target datalayout = "e-m:o-p:32:32-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-n32:64-S128-Fn32" // RUN: %clang_cc1 -triple arm64-pc-win32-macho -o - -emit-llvm %s | \ // RUN: FileCheck %s -check-prefix=AARCH64-WIN32-MACHO -// AARCH64-WIN32-MACHO: target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128-Fn32" +// AARCH64-WIN32-MACHO: target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-n32:64-S128-Fn32" // RUN: %clang_cc1 -triple thumb-unknown-gnueabi -o - -emit-llvm %s | \ // RUN: FileCheck %s -check-prefix=THUMB diff --git a/clang/test/CodeGenCUDA/amdgpu-atomic-ops.cu b/clang/test/CodeGenCUDA/amdgpu-atomic-ops.cu index 8bf8241e343e70243266afe09d37f8c16dc28a91..efe75be8488b3c2a2cd0b762e6c258bcd770fb09 100644 --- a/clang/test/CodeGenCUDA/amdgpu-atomic-ops.cu +++ b/clang/test/CodeGenCUDA/amdgpu-atomic-ops.cu @@ -26,15 +26,19 @@ __global__ void ffp1(float *p) { // SAFEIR: atomicrmw fsub ptr {{.*}} monotonic, align 4{{$}} // SAFEIR: atomicrmw fmax ptr {{.*}} monotonic, align 4{{$}} // SAFEIR: atomicrmw fmin ptr {{.*}} monotonic, align 4{{$}} - // SAFEIR: atomicrmw fmax ptr {{.*}} syncscope("agent-one-as") monotonic, align 4{{$}} - // SAFEIR: atomicrmw fmin ptr {{.*}} syncscope("workgroup-one-as") monotonic, align 4{{$}} + // SAFEIR: atomicrmw fadd ptr {{.*}} syncscope("agent-one-as") monotonic, align 4, !noalias.addrspace ![[$NO_PRIVATE:[0-9]+]]{{$}} + // SAFEIR: atomicrmw fsub ptr {{.*}} syncscope("workgroup-one-as") monotonic, align 4, !noalias.addrspace ![[$NO_PRIVATE]]{{$}} + // SAFEIR: atomicrmw fmax ptr {{.*}} syncscope("agent-one-as") monotonic, align 4, !noalias.addrspace ![[$NO_PRIVATE]]{{$}} + // SAFEIR: atomicrmw fmin ptr {{.*}} syncscope("workgroup-one-as") monotonic, align 4, !noalias.addrspace ![[$NO_PRIVATE]]{{$}} // UNSAFEIR: atomicrmw fadd ptr {{.*}} monotonic, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+}}, !amdgpu.ignore.denormal.mode !{{[0-9]+$}} // UNSAFEIR: atomicrmw fsub ptr {{.*}} monotonic, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+$}} // UNSAFEIR: atomicrmw fmax ptr {{.*}} monotonic, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+$}} // UNSAFEIR: atomicrmw fmin ptr {{.*}} monotonic, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+$}} - // UNSAFEIR: atomicrmw fmax ptr {{.*}} syncscope("agent-one-as") monotonic, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+$}} - // UNSAFEIR: atomicrmw fmin ptr {{.*}} syncscope("workgroup-one-as") monotonic, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+$}} + // UNSAFEIR: atomicrmw fadd ptr {{.*}} monotonic, align 4, !noalias.addrspace 
![[$NO_PRIVATE:[0-9]+]], !amdgpu.no.fine.grained.memory !{{[0-9]+}}, !amdgpu.ignore.denormal.mode !{{[0-9]+$}} + // UNSAFEIR: atomicrmw fsub ptr {{.*}} monotonic, align 4, !noalias.addrspace ![[$NO_PRIVATE]], !amdgpu.no.fine.grained.memory !{{[0-9]+$}} + // UNSAFEIR: atomicrmw fmax ptr {{.*}} syncscope("agent-one-as") monotonic, align 4, !noalias.addrspace ![[$NO_PRIVATE]], !amdgpu.no.fine.grained.memory !{{[0-9]+$}} + // UNSAFEIR: atomicrmw fmin ptr {{.*}} syncscope("workgroup-one-as") monotonic, align 4, !noalias.addrspace ![[$NO_PRIVATE]], !amdgpu.no.fine.grained.memory !{{[0-9]+$}} // SAFE: _Z4ffp1Pf // SAFE: global_atomic_cmpswap @@ -56,6 +60,9 @@ __global__ void ffp1(float *p) { __atomic_fetch_sub(p, 1.0f, memory_order_relaxed); __atomic_fetch_max(p, 1.0f, memory_order_relaxed); __atomic_fetch_min(p, 1.0f, memory_order_relaxed); + + __hip_atomic_fetch_add(p, 1.0f, memory_order_relaxed, __HIP_MEMORY_SCOPE_AGENT); + __hip_atomic_fetch_sub(p, 1.0f, memory_order_relaxed, __HIP_MEMORY_SCOPE_WORKGROUP); __hip_atomic_fetch_max(p, 1.0f, memory_order_relaxed, __HIP_MEMORY_SCOPE_AGENT); __hip_atomic_fetch_min(p, 1.0f, memory_order_relaxed, __HIP_MEMORY_SCOPE_WORKGROUP); } @@ -66,15 +73,19 @@ __global__ void ffp2(double *p) { // SAFEIR: atomicrmw fsub ptr {{.*}} monotonic, align 8{{$}} // SAFEIR: atomicrmw fmax ptr {{.*}} monotonic, align 8{{$}} // SAFEIR: atomicrmw fmin ptr {{.*}} monotonic, align 8{{$}} - // SAFEIR: atomicrmw fmax ptr {{.*}} syncscope("agent-one-as") monotonic, align 8{{$}} - // SAFEIR: atomicrmw fmin ptr {{.*}} syncscope("workgroup-one-as") monotonic, align 8{{$}} + // SAFEIR: atomicrmw fadd ptr {{.*}} syncscope("agent-one-as") monotonic, align 8, !noalias.addrspace ![[$NO_PRIVATE]]{{$}} + // SAFEIR: atomicrmw fsub ptr {{.*}} syncscope("workgroup-one-as") monotonic, align 8, !noalias.addrspace ![[$NO_PRIVATE]]{{$}} + // SAFEIR: atomicrmw fmax ptr {{.*}} syncscope("agent-one-as") monotonic, align 8, !noalias.addrspace ![[$NO_PRIVATE]]{{$}} + // SAFEIR: atomicrmw fmin ptr {{.*}} syncscope("workgroup-one-as") monotonic, align 8, !noalias.addrspace ![[$NO_PRIVATE]]{{$}} // UNSAFEIR: atomicrmw fadd ptr {{.*}} monotonic, align 8, !amdgpu.no.fine.grained.memory !{{[0-9]+$}} // UNSAFEIR: atomicrmw fsub ptr {{.*}} monotonic, align 8, !amdgpu.no.fine.grained.memory !{{[0-9]+$}} // UNSAFEIR: atomicrmw fmax ptr {{.*}} monotonic, align 8, !amdgpu.no.fine.grained.memory !{{[0-9]+$}} // UNSAFEIR: atomicrmw fmin ptr {{.*}} monotonic, align 8, !amdgpu.no.fine.grained.memory !{{[0-9]+$}} - // UNSAFEIR: atomicrmw fmax ptr {{.*}} syncscope("agent-one-as") monotonic, align 8, !amdgpu.no.fine.grained.memory !{{[0-9]+$}} - // UNSAFEIR: atomicrmw fmin ptr {{.*}} syncscope("workgroup-one-as") monotonic, align 8, !amdgpu.no.fine.grained.memory !{{[0-9]+$}} + // UNSAFEIR: atomicrmw fadd ptr {{.*}} syncscope("agent-one-as") monotonic, align 8, !noalias.addrspace ![[$NO_PRIVATE]], !amdgpu.no.fine.grained.memory !{{[0-9]+$}} + // UNSAFEIR: atomicrmw fsub ptr {{.*}} syncscope("workgroup-one-as") monotonic, align 8, !noalias.addrspace ![[$NO_PRIVATE]], !amdgpu.no.fine.grained.memory !{{[0-9]+$}} + // UNSAFEIR: atomicrmw fmax ptr {{.*}} syncscope("agent-one-as") monotonic, align 8, !noalias.addrspace ![[$NO_PRIVATE]], !amdgpu.no.fine.grained.memory !{{[0-9]+$}} + // UNSAFEIR: atomicrmw fmin ptr {{.*}} syncscope("workgroup-one-as") monotonic, align 8, !noalias.addrspace ![[$NO_PRIVATE]], !amdgpu.no.fine.grained.memory !{{[0-9]+$}} // SAFE-LABEL: @_Z4ffp2Pd // SAFE: global_atomic_cmpswap_b64 @@ -95,8 +106,10 
@@ __global__ void ffp2(double *p) { __atomic_fetch_sub(p, 1.0, memory_order_relaxed); __atomic_fetch_max(p, 1.0, memory_order_relaxed); __atomic_fetch_min(p, 1.0, memory_order_relaxed); - __hip_atomic_fetch_max(p, 1.0f, memory_order_relaxed, __HIP_MEMORY_SCOPE_AGENT); - __hip_atomic_fetch_min(p, 1.0f, memory_order_relaxed, __HIP_MEMORY_SCOPE_WORKGROUP); + __hip_atomic_fetch_add(p, 1.0, memory_order_relaxed, __HIP_MEMORY_SCOPE_AGENT); + __hip_atomic_fetch_sub(p, 1.0, memory_order_relaxed, __HIP_MEMORY_SCOPE_WORKGROUP); + __hip_atomic_fetch_max(p, 1.0, memory_order_relaxed, __HIP_MEMORY_SCOPE_AGENT); + __hip_atomic_fetch_min(p, 1.0, memory_order_relaxed, __HIP_MEMORY_SCOPE_WORKGROUP); } // long double is the same as double for amdgcn. @@ -106,15 +119,19 @@ __global__ void ffp3(long double *p) { // SAFEIR: atomicrmw fsub ptr {{.*}} monotonic, align 8{{$}} // SAFEIR: atomicrmw fmax ptr {{.*}} monotonic, align 8{{$}} // SAFEIR: atomicrmw fmin ptr {{.*}} monotonic, align 8{{$}} - // SAFEIR: atomicrmw fmax ptr {{.*}} syncscope("agent-one-as") monotonic, align 8{{$}} - // SAFEIR: atomicrmw fmin ptr {{.*}} syncscope("workgroup-one-as") monotonic, align 8{{$}} + // SAFEIR: atomicrmw fadd ptr {{.*}} syncscope("agent-one-as") monotonic, align 8, !noalias.addrspace ![[$NO_PRIVATE]]{{$}} + // SAFEIR: atomicrmw fsub ptr {{.*}} syncscope("workgroup-one-as") monotonic, align 8, !noalias.addrspace ![[$NO_PRIVATE]]{{$}} + // SAFEIR: atomicrmw fmax ptr {{.*}} syncscope("agent-one-as") monotonic, align 8, !noalias.addrspace ![[$NO_PRIVATE]]{{$}} + // SAFEIR: atomicrmw fmin ptr {{.*}} syncscope("workgroup-one-as") monotonic, align 8, !noalias.addrspace ![[$NO_PRIVATE]]{{$}} // UNSAFEIR: atomicrmw fadd ptr {{.*}} monotonic, align 8, !amdgpu.no.fine.grained.memory !{{[0-9]+$}} // UNSAFEIR: atomicrmw fsub ptr {{.*}} monotonic, align 8, !amdgpu.no.fine.grained.memory !{{[0-9]+$}} // UNSAFEIR: atomicrmw fmax ptr {{.*}} monotonic, align 8, !amdgpu.no.fine.grained.memory !{{[0-9]+$}} // UNSAFEIR: atomicrmw fmin ptr {{.*}} monotonic, align 8, !amdgpu.no.fine.grained.memory !{{[0-9]+$}} - // UNSAFEIR: atomicrmw fmax ptr {{.*}} syncscope("agent-one-as") monotonic, align 8, !amdgpu.no.fine.grained.memory !{{[0-9]+$}} - // UNSAFEIR: atomicrmw fmin ptr {{.*}} syncscope("workgroup-one-as") monotonic, align 8, !amdgpu.no.fine.grained.memory !{{[0-9]+$}} + // UNSAFEIR: atomicrmw fadd ptr {{.*}} syncscope("agent-one-as") monotonic, align 8, !noalias.addrspace ![[$NO_PRIVATE]], !amdgpu.no.fine.grained.memory !{{[0-9]+$}} + // UNSAFEIR: atomicrmw fsub ptr {{.*}} syncscope("workgroup-one-as") monotonic, align 8, !noalias.addrspace ![[$NO_PRIVATE]], !amdgpu.no.fine.grained.memory !{{[0-9]+$}} + // UNSAFEIR: atomicrmw fmax ptr {{.*}} syncscope("agent-one-as") monotonic, align 8, !noalias.addrspace ![[$NO_PRIVATE]], !amdgpu.no.fine.grained.memory !{{[0-9]+$}} + // UNSAFEIR: atomicrmw fmin ptr {{.*}} syncscope("workgroup-one-as") monotonic, align 8, !noalias.addrspace ![[$NO_PRIVATE]], !amdgpu.no.fine.grained.memory !{{[0-9]+$}} // SAFE-LABEL: @_Z4ffp3Pe // SAFE: global_atomic_cmpswap_b64 @@ -132,8 +149,10 @@ __global__ void ffp3(long double *p) { __atomic_fetch_sub(p, 1.0L, memory_order_relaxed); __atomic_fetch_max(p, 1.0L, memory_order_relaxed); __atomic_fetch_min(p, 1.0L, memory_order_relaxed); - __hip_atomic_fetch_max(p, 1.0f, memory_order_relaxed, __HIP_MEMORY_SCOPE_AGENT); - __hip_atomic_fetch_min(p, 1.0f, memory_order_relaxed, __HIP_MEMORY_SCOPE_WORKGROUP); + __hip_atomic_fetch_add(p, 1.0L, memory_order_relaxed, 
__HIP_MEMORY_SCOPE_AGENT); + __hip_atomic_fetch_sub(p, 1.0L, memory_order_relaxed, __HIP_MEMORY_SCOPE_WORKGROUP); + __hip_atomic_fetch_max(p, 1.0L, memory_order_relaxed, __HIP_MEMORY_SCOPE_AGENT); + __hip_atomic_fetch_min(p, 1.0L, memory_order_relaxed, __HIP_MEMORY_SCOPE_WORKGROUP); } __device__ double ffp4(double *p, float f) { @@ -141,7 +160,11 @@ __device__ double ffp4(double *p, float f) { // CHECK: fpext float {{.*}} to double // SAFEIR: atomicrmw fsub ptr {{.*}} monotonic, align 8{{$}} // UNSAFEIR: atomicrmw fsub ptr {{.*}} monotonic, align 8, !amdgpu.no.fine.grained.memory !{{[0-9]+$}} - return __atomic_fetch_sub(p, f, memory_order_relaxed); + + // SAFEIR: atomicrmw fsub ptr {{.*}} monotonic, align 8, !noalias.addrspace ![[$NO_PRIVATE]]{{$}} + // UNSAFEIR: atomicrmw fsub ptr {{.*}} monotonic, align 8, !noalias.addrspace ![[$NO_PRIVATE]], !amdgpu.no.fine.grained.memory !{{[0-9]+$}} + __atomic_fetch_sub(p, f, memory_order_relaxed); + return __hip_atomic_fetch_sub(p, f, memory_order_relaxed, __HIP_MEMORY_SCOPE_AGENT); } __device__ double ffp5(double *p, int i) { @@ -149,7 +172,11 @@ __device__ double ffp5(double *p, int i) { // CHECK: sitofp i32 {{.*}} to double // SAFEIR: atomicrmw fsub ptr {{.*}} monotonic, align 8{{$}} // UNSAFEIR: atomicrmw fsub ptr {{.*}} monotonic, align 8, !amdgpu.no.fine.grained.memory !{{[0-9]+$}} - return __atomic_fetch_sub(p, i, memory_order_relaxed); + __atomic_fetch_sub(p, i, memory_order_relaxed); + + // SAFEIR: atomicrmw fsub ptr {{.*}} monotonic, align 8, !noalias.addrspace ![[$NO_PRIVATE]]{{$}} + // UNSAFEIR: atomicrmw fsub ptr {{.*}} monotonic, align 8, !noalias.addrspace ![[$NO_PRIVATE]], !amdgpu.no.fine.grained.memory !{{[0-9]+$}} + return __hip_atomic_fetch_sub(p, i, memory_order_relaxed, __HIP_MEMORY_SCOPE_AGENT); } __global__ void ffp6(_Float16 *p) { @@ -158,15 +185,19 @@ __global__ void ffp6(_Float16 *p) { // SAFEIR: atomicrmw fsub ptr {{.*}} monotonic, align 2{{$}} // SAFEIR: atomicrmw fmax ptr {{.*}} monotonic, align 2{{$}} // SAFEIR: atomicrmw fmin ptr {{.*}} monotonic, align 2{{$}} - // SAFEIR: atomicrmw fmax ptr {{.*}} syncscope("agent-one-as") monotonic, align 2{{$}} - // SAFEIR: atomicrmw fmin ptr {{.*}} syncscope("workgroup-one-as") monotonic, align 2{{$}} + // SAFEIR: atomicrmw fadd ptr {{.*}} syncscope("agent-one-as") monotonic, align 2, !noalias.addrspace ![[$NO_PRIVATE]]{{$}} + // SAFEIR: atomicrmw fsub ptr {{.*}} syncscope("workgroup-one-as") monotonic, align 2, !noalias.addrspace ![[$NO_PRIVATE]]{{$}} + // SAFEIR: atomicrmw fmax ptr {{.*}} syncscope("agent-one-as") monotonic, align 2, !noalias.addrspace ![[$NO_PRIVATE]]{{$}} + // SAFEIR: atomicrmw fmin ptr {{.*}} syncscope("workgroup-one-as") monotonic, align 2, !noalias.addrspace ![[$NO_PRIVATE]]{{$}} // UNSAFEIR: atomicrmw fadd ptr {{.*}} monotonic, align 2, !amdgpu.no.fine.grained.memory !{{[0-9]+$}} // UNSAFEIR: atomicrmw fsub ptr {{.*}} monotonic, align 2, !amdgpu.no.fine.grained.memory !{{[0-9]+$}} // UNSAFEIR: atomicrmw fmax ptr {{.*}} monotonic, align 2, !amdgpu.no.fine.grained.memory !{{[0-9]+$}} // UNSAFEIR: atomicrmw fmin ptr {{.*}} monotonic, align 2, !amdgpu.no.fine.grained.memory !{{[0-9]+$}} - // UNSAFEIR: atomicrmw fmax ptr {{.*}} syncscope("agent-one-as") monotonic, align 2, !amdgpu.no.fine.grained.memory !{{[0-9]+$}} - // UNSAFEIR: atomicrmw fmin ptr {{.*}} syncscope("workgroup-one-as") monotonic, align 2, !amdgpu.no.fine.grained.memory !{{[0-9]+$}} + // UNSAFEIR: atomicrmw fadd ptr {{.*}} monotonic, align 2, !noalias.addrspace ![[$NO_PRIVATE]], 
!amdgpu.no.fine.grained.memory !{{[0-9]+$}} + // UNSAFEIR: atomicrmw fsub ptr {{.*}} monotonic, align 2, !noalias.addrspace ![[$NO_PRIVATE]], !amdgpu.no.fine.grained.memory !{{[0-9]+$}} + // UNSAFEIR: atomicrmw fmax ptr {{.*}} syncscope("agent-one-as") monotonic, align 2, !noalias.addrspace ![[$NO_PRIVATE]], !amdgpu.no.fine.grained.memory !{{[0-9]+$}} + // UNSAFEIR: atomicrmw fmin ptr {{.*}} syncscope("workgroup-one-as") monotonic, align 2, !noalias.addrspace ![[$NO_PRIVATE]], !amdgpu.no.fine.grained.memory !{{[0-9]+$}} // SAFE: _Z4ffp6PDF16 // SAFE: global_atomic_cmpswap @@ -187,6 +218,25 @@ __global__ void ffp6(_Float16 *p) { __atomic_fetch_sub(p, 1.0, memory_order_relaxed); __atomic_fetch_max(p, 1.0, memory_order_relaxed); __atomic_fetch_min(p, 1.0, memory_order_relaxed); + + __hip_atomic_fetch_add(p, 1.0f, memory_order_relaxed, __HIP_MEMORY_SCOPE_AGENT); + __hip_atomic_fetch_sub(p, 1.0f, memory_order_relaxed, __HIP_MEMORY_SCOPE_WORKGROUP); __hip_atomic_fetch_max(p, 1.0f, memory_order_relaxed, __HIP_MEMORY_SCOPE_AGENT); __hip_atomic_fetch_min(p, 1.0f, memory_order_relaxed, __HIP_MEMORY_SCOPE_WORKGROUP); } + +// CHECK-LABEL: @_Z12test_cmpxchgPiii +// CHECK: cmpxchg ptr %{{.+}}, i32 %{{.+}}, i32 %{{.+}} acquire acquire, align 4{{$}} +// CHECK: cmpxchg weak ptr %{{.+}}, i32 %{{.+}}, i32 %{{.+}} acquire acquire, align 4{{$}} +// CHECK: cmpxchg ptr %{{.+}}, i32 %{{.+}}, i32 %{{.+}} syncscope("workgroup-one-as") monotonic monotonic, align 4, !noalias.addrspace ![[$NO_PRIVATE]]{{$}} +// CHECK: cmpxchg weak ptr %{{.+}}, i32 %{{.+}}, i32 %{{.+}} syncscope("workgroup-one-as") monotonic monotonic, align 4, !noalias.addrspace ![[$NO_PRIVATE]]{{$}} +__device__ int test_cmpxchg(int *ptr, int cmp, int desired) { + bool flag = __atomic_compare_exchange(ptr, &cmp, &desired, 0, memory_order_acquire, memory_order_acquire); + flag = __atomic_compare_exchange_n(ptr, &cmp, desired, 1, memory_order_acquire, memory_order_acquire); + flag = __hip_atomic_compare_exchange_strong(ptr, &cmp, desired, __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_WORKGROUP); + flag = __hip_atomic_compare_exchange_weak(ptr, &cmp, desired, __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_WORKGROUP); + return flag; +} + +// SAFEIR: ![[$NO_PRIVATE]] = !{i32 5, i32 6} +// UNSAFEIR: ![[$NO_PRIVATE]] = !{i32 5, i32 6}
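A note on the !{i32 5, i32 6} payload just defined: !noalias.addrspace lists half-open [lo, hi) ranges of address spaces the annotated operation is known not to access, so [5, 6) excludes exactly address space 5, AMDGPU's private (stack) memory; that is also why the equivalent binding in the next test is named NOALIAS_ADDRSPACE_STACK. A minimal sketch of a source construct these checks pair with the annotation (builtins as used in the tests; the IR in the comment shows the expected shape, not exact output):

// Scoped HIP atomic expected to carry the no-private annotation, roughly:
//   atomicrmw add ptr %p, i32 %v syncscope("agent-one-as") monotonic,
//       align 4, !noalias.addrspace !0
//   !0 = !{i32 5, i32 6}  ; address spaces in [5, 6) are never touched
__device__ int agent_scoped_add(int *p, int v) {
  return __hip_atomic_fetch_add(p, v, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
}

diff --git a/clang/test/CodeGenCUDA/atomic-ops.cu b/clang/test/CodeGenCUDA/atomic-ops.cu index fbc042caa809f981f3b9bbab6e46d3e0e85c0867..1accd1712becaf82eddb2a282bc85ce17f034283 100644 --- a/clang/test/CodeGenCUDA/atomic-ops.cu +++ b/clang/test/CodeGenCUDA/atomic-ops.cu @@ -1,19 +1,19 @@ -// RUN: %clang_cc1 -x hip -std=c++11 -triple amdgcn -fcuda-is-device -emit-llvm %s -o - | FileCheck %s +// RUN: %clang_cc1 -x hip -std=c++11 -triple amdgcn -fcuda-is-device -emit-llvm %s -o - | FileCheck -enable-var-scope %s #include "Inputs/cuda.h" // CHECK-LABEL: @_Z24atomic32_op_singlethreadPiii -// CHECK: cmpxchg ptr {{%[0-9]+}}, i32 {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("singlethread-one-as") -// CHECK: cmpxchg weak ptr {{%[0-9]+}}, i32 {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("singlethread-one-as") monotonic monotonic, align 4 -// CHECK: atomicrmw xchg ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("singlethread-one-as") -// CHECK: atomicrmw add ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("singlethread-one-as") -// CHECK: atomicrmw sub ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("singlethread-one-as") -// CHECK: atomicrmw and ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("singlethread-one-as") -// CHECK: atomicrmw or ptr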
{{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("singlethread-one-as") -// CHECK: atomicrmw xor ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("singlethread-one-as") -// CHECK: atomicrmw min ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("singlethread-one-as") -// CHECK: atomicrmw max ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("singlethread-one-as") -// CHECK: load atomic i32, ptr {{%[0-9]+}} syncscope("singlethread-one-as") monotonic, align 4 -// CHECK: store atomic i32 %{{.*}}, ptr %{{.*}} syncscope("singlethread-one-as") monotonic, align 4 +// CHECK: cmpxchg ptr {{%[0-9]+}}, i32 {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("singlethread-one-as") monotonic monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK:[0-9]+]]{{$}} +// CHECK: cmpxchg weak ptr {{%[0-9]+}}, i32 {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("singlethread-one-as") monotonic monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw xchg ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("singlethread-one-as") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw add ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("singlethread-one-as") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw sub ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("singlethread-one-as") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw and ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("singlethread-one-as") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw or ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("singlethread-one-as") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw xor ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("singlethread-one-as") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw min ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("singlethread-one-as") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw max ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("singlethread-one-as") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: load atomic i32, ptr {{%[0-9]+}} syncscope("singlethread-one-as") monotonic, align 4{{$}} +// CHECK: store atomic i32 %{{.*}}, ptr %{{.*}} syncscope("singlethread-one-as") monotonic, align 4{{$}} __device__ int atomic32_op_singlethread(int *ptr, int val, int desired) { bool flag = __hip_atomic_compare_exchange_strong(ptr, &val, desired, __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SINGLETHREAD); flag = __hip_atomic_compare_exchange_weak(ptr, &val, desired, __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SINGLETHREAD); @@ -31,8 +31,8 @@ __device__ int atomic32_op_singlethread(int *ptr, int val, int desired) { } // CHECK-LABEL: @_Z25atomicu32_op_singlethreadPjjj -// CHECK: atomicrmw umin ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("singlethread-one-as") -// CHECK: atomicrmw umax ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("singlethread-one-as") +// CHECK: atomicrmw umin ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("singlethread-one-as") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw umax ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("singlethread-one-as") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} __device__ unsigned int atomicu32_op_singlethread(unsigned int *ptr, unsigned int val, unsigned int 
desired) { val = __hip_atomic_fetch_min(ptr, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SINGLETHREAD); val = __hip_atomic_fetch_max(ptr, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SINGLETHREAD); @@ -40,18 +40,18 @@ __device__ unsigned int atomicu32_op_singlethread(unsigned int *ptr, unsigned in } // CHECK-LABEL: @_Z21atomic32_op_wavefrontPiii -// CHECK: cmpxchg ptr {{%[0-9]+}}, i32 {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("wavefront-one-as") -// CHECK: cmpxchg weak ptr {{%[0-9]+}}, i32 {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("wavefront-one-as") monotonic monotonic, align 4 -// CHECK: atomicrmw xchg ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("wavefront-one-as") -// CHECK: atomicrmw add ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("wavefront-one-as") -// CHECK: atomicrmw sub ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("wavefront-one-as") -// CHECK: atomicrmw and ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("wavefront-one-as") -// CHECK: atomicrmw or ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("wavefront-one-as") -// CHECK: atomicrmw xor ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("wavefront-one-as") -// CHECK: atomicrmw min ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("wavefront-one-as") -// CHECK: atomicrmw max ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("wavefront-one-as") -// CHECK: load atomic i32, ptr {{%[0-9]+}} syncscope("wavefront-one-as") monotonic, align 4 -// CHECK: store atomic i32 %{{.*}}, ptr %{{.*}} syncscope("wavefront-one-as") monotonic, align 4 +// CHECK: cmpxchg ptr {{%[0-9]+}}, i32 {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("wavefront-one-as") monotonic monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: cmpxchg weak ptr {{%[0-9]+}}, i32 {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("wavefront-one-as") monotonic monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw xchg ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("wavefront-one-as") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw add ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("wavefront-one-as") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw sub ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("wavefront-one-as") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw and ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("wavefront-one-as") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw or ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("wavefront-one-as") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw xor ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("wavefront-one-as") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw min ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("wavefront-one-as") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw max ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("wavefront-one-as") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: load atomic i32, ptr {{%[0-9]+}} syncscope("wavefront-one-as") monotonic, align 4{{$}} +// CHECK: store atomic i32 %{{.*}}, ptr %{{.*}} syncscope("wavefront-one-as") monotonic, align 4{{$}} __device__ int atomic32_op_wavefront(int *ptr, int val, int desired) { bool flag = __hip_atomic_compare_exchange_strong(ptr, &val, desired, __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_WAVEFRONT); flag = 
__hip_atomic_compare_exchange_weak(ptr, &val, desired, __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_WAVEFRONT); @@ -69,8 +69,8 @@ __device__ int atomic32_op_wavefront(int *ptr, int val, int desired) { } // CHECK-LABEL: @_Z22atomicu32_op_wavefrontPjjj -// CHECK: atomicrmw umin ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("wavefront-one-as") -// CHECK: atomicrmw umax ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("wavefront-one-as") +// CHECK: atomicrmw umin ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("wavefront-one-as") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw umax ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("wavefront-one-as") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} __device__ unsigned int atomicu32_op_wavefront(unsigned int *ptr, unsigned int val, unsigned int desired) { val = __hip_atomic_fetch_min(ptr, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_WAVEFRONT); val = __hip_atomic_fetch_max(ptr, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_WAVEFRONT); @@ -78,17 +78,17 @@ __device__ unsigned int atomicu32_op_wavefront(unsigned int *ptr, unsigned int v } // CHECK-LABEL: @_Z21atomic32_op_workgroupPiii -// CHECK: cmpxchg ptr {{%[0-9]+}}, i32 {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("workgroup-one-as") -// CHECK: cmpxchg weak ptr {{%[0-9]+}}, i32 {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("workgroup-one-as") monotonic monotonic, align 4 -// CHECK: atomicrmw xchg ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("workgroup-one-as") -// CHECK: atomicrmw add ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("workgroup-one-as") -// CHECK: atomicrmw sub ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("workgroup-one-as") -// CHECK: atomicrmw and ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("workgroup-one-as") -// CHECK: atomicrmw or ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("workgroup-one-as") -// CHECK: atomicrmw xor ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("workgroup-one-as") -// CHECK: atomicrmw min ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("workgroup-one-as") -// CHECK: atomicrmw max ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("workgroup-one-as") -// CHECK: store atomic i32 %{{.*}}, ptr %{{.*}} syncscope("workgroup-one-as") monotonic, align 4 +// CHECK: cmpxchg ptr {{%[0-9]+}}, i32 {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("workgroup-one-as") monotonic monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: cmpxchg weak ptr {{%[0-9]+}}, i32 {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("workgroup-one-as") monotonic monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw xchg ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("workgroup-one-as") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw add ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("workgroup-one-as") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw sub ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("workgroup-one-as") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw and ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("workgroup-one-as") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw or ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("workgroup-one-as") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw xor ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("workgroup-one-as") monotonic, align 4, !noalias.addrspace 
![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw min ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("workgroup-one-as") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw max ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("workgroup-one-as") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: store atomic i32 %{{.*}}, ptr %{{.*}} syncscope("workgroup-one-as") monotonic, align 4{{$}} __device__ int atomic32_op_workgroup(int *ptr, int val, int desired) { bool flag = __hip_atomic_compare_exchange_strong(ptr, &val, desired, __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_WORKGROUP); flag = __hip_atomic_compare_exchange_weak(ptr, &val, desired, __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_WORKGROUP); @@ -105,8 +105,8 @@ __device__ int atomic32_op_workgroup(int *ptr, int val, int desired) { } // CHECK-LABEL: @_Z22atomicu32_op_workgroupPjjj -// CHECK: atomicrmw umin ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("workgroup-one-as") -// CHECK: atomicrmw umax ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("workgroup-one-as") +// CHECK: atomicrmw umin ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("workgroup-one-as") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw umax ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("workgroup-one-as") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} __device__ unsigned int atomicu32_op_workgroup(unsigned int *ptr, unsigned int val, unsigned int desired) { val = __hip_atomic_fetch_min(ptr, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_WORKGROUP); val = __hip_atomic_fetch_max(ptr, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_WORKGROUP); @@ -114,17 +114,17 @@ __device__ unsigned int atomicu32_op_workgroup(unsigned int *ptr, unsigned int v } // CHECK-LABEL: @_Z17atomic32_op_agentPiii -// CHECK: cmpxchg ptr {{%[0-9]+}}, i32 {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("agent-one-as") -// CHECK: cmpxchg weak ptr {{%[0-9]+}}, i32 {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("agent-one-as") monotonic monotonic, align 4 -// CHECK: atomicrmw xchg ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("agent-one-as") -// CHECK: atomicrmw add ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("agent-one-as") -// CHECK: atomicrmw sub ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("agent-one-as") -// CHECK: atomicrmw and ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("agent-one-as") -// CHECK: atomicrmw or ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("agent-one-as") -// CHECK: atomicrmw xor ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("agent-one-as") -// CHECK: atomicrmw min ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("agent-one-as") -// CHECK: atomicrmw max ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("agent-one-as") -// CHECK: store atomic i32 %{{.*}}, ptr %{{.*}} syncscope("agent-one-as") monotonic, align 4 +// CHECK: cmpxchg ptr {{%[0-9]+}}, i32 {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("agent-one-as") monotonic monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: cmpxchg weak ptr {{%[0-9]+}}, i32 {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("agent-one-as") monotonic monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw xchg ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("agent-one-as") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw add ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("agent-one-as") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// 
CHECK: atomicrmw sub ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("agent-one-as") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw and ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("agent-one-as") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw or ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("agent-one-as") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw xor ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("agent-one-as") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw min ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("agent-one-as") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw max ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("agent-one-as") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: store atomic i32 %{{.*}}, ptr %{{.*}} syncscope("agent-one-as") monotonic, align 4{{$}} __device__ int atomic32_op_agent(int *ptr, int val, int desired) { bool flag = __hip_atomic_compare_exchange_strong(ptr, &val, desired, __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); flag = __hip_atomic_compare_exchange_weak(ptr, &val, desired, __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); @@ -141,8 +141,8 @@ __device__ int atomic32_op_agent(int *ptr, int val, int desired) { } // CHECK-LABEL: @_Z18atomicu32_op_agentPjjj -// CHECK: atomicrmw umin ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("agent-one-as") -// CHECK: atomicrmw umax ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("agent-one-as") +// CHECK: atomicrmw umin ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("agent-one-as") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw umax ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("agent-one-as") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} __device__ unsigned int atomicu32_op_agent(unsigned int *ptr, unsigned int val, unsigned int desired) { val = __hip_atomic_fetch_min(ptr, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); val = __hip_atomic_fetch_max(ptr, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); @@ -150,18 +150,18 @@ __device__ unsigned int atomicu32_op_agent(unsigned int *ptr, unsigned int val, } // CHECK-LABEL: @_Z18atomic32_op_systemPiii -// CHECK: cmpxchg ptr {{%[0-9]+}}, i32 {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("one-as") -// CHECK: cmpxchg weak ptr {{%[0-9]+}}, i32 {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("one-as") monotonic monotonic, align 4 -// CHECK: atomicrmw xchg ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("one-as") -// CHECK: atomicrmw add ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("one-as") -// CHECK: atomicrmw sub ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("one-as") -// CHECK: atomicrmw and ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("one-as") -// CHECK: atomicrmw or ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("one-as") -// CHECK: atomicrmw xor ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("one-as") -// CHECK: atomicrmw min ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("one-as") -// CHECK: atomicrmw max ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("one-as") -// CHECK: load i32, ptr %{{.*}}, align 4 -// CHECK: store atomic i32 %{{.*}}, ptr %{{.*}} syncscope("one-as") monotonic, align 4 +// CHECK: cmpxchg ptr {{%[0-9]+}}, i32 {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("one-as") monotonic monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// 
CHECK: cmpxchg weak ptr {{%[0-9]+}}, i32 {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("one-as") monotonic monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw xchg ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("one-as") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw add ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("one-as") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw sub ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("one-as") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw and ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("one-as") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw or ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("one-as") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw xor ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("one-as") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw min ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("one-as") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw max ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("one-as") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: load i32, ptr %{{.*}}, align 4{{$}} +// CHECK: store atomic i32 %{{.*}}, ptr %{{.*}} syncscope("one-as") monotonic, align 4{{$}} __device__ int atomic32_op_system(int *ptr, int val, int desired) { bool flag = __hip_atomic_compare_exchange_strong(ptr, &val, desired, __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM); flag = __hip_atomic_compare_exchange_weak(ptr, &val, desired, __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM); @@ -179,8 +179,8 @@ __device__ int atomic32_op_system(int *ptr, int val, int desired) { } // CHECK-LABEL: @_Z19atomicu32_op_systemPjjj -// CHECK: atomicrmw umin ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("one-as") -// CHECK: atomicrmw umax ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("one-as") +// CHECK: atomicrmw umin ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("one-as") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw umax ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("one-as") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} __device__ unsigned int atomicu32_op_system(unsigned int *ptr, unsigned int val, unsigned int desired) { val = __hip_atomic_fetch_min(ptr, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM); val = __hip_atomic_fetch_max(ptr, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM); @@ -188,17 +188,17 @@ __device__ unsigned int atomicu32_op_system(unsigned int *ptr, unsigned int val, } // CHECK-LABEL: @_Z24atomic64_op_singlethreadPxS_xx -// CHECK: cmpxchg ptr {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("singlethread-one-as") -// CHECK: cmpxchg weak ptr {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("singlethread-one-as") monotonic monotonic, align 8 -// CHECK: atomicrmw xchg ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("singlethread-one-as") -// CHECK: atomicrmw add ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("singlethread-one-as") -// CHECK: atomicrmw sub ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("singlethread-one-as") -// CHECK: atomicrmw and ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("singlethread-one-as") -// CHECK: atomicrmw or ptr {{%[0-9]+}}, i64 {{%[0-9]+}} 
syncscope("singlethread-one-as") -// CHECK: atomicrmw xor ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("singlethread-one-as") -// CHECK: atomicrmw min ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("singlethread-one-as") -// CHECK: atomicrmw max ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("singlethread-one-as") -// CHECK: store atomic i64 %{{.*}}, ptr %{{.*}} syncscope("singlethread-one-as") monotonic, align 8 +// CHECK: cmpxchg ptr {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("singlethread-one-as") monotonic monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: cmpxchg weak ptr {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("singlethread-one-as") monotonic monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw xchg ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("singlethread-one-as") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw add ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("singlethread-one-as") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw sub ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("singlethread-one-as") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw and ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("singlethread-one-as") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw or ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("singlethread-one-as") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw xor ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("singlethread-one-as") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw min ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("singlethread-one-as") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw max ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("singlethread-one-as") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: store atomic i64 %{{.*}}, ptr %{{.*}} syncscope("singlethread-one-as") monotonic, align 8{{$}} __device__ long long atomic64_op_singlethread(long long *ptr, long long *ptr2, long long val, long long desired) { bool flag = __hip_atomic_compare_exchange_strong(ptr, &val, desired, __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SINGLETHREAD); flag = __hip_atomic_compare_exchange_weak(ptr, &val, desired, __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SINGLETHREAD); @@ -215,10 +215,10 @@ __device__ long long atomic64_op_singlethread(long long *ptr, long long *ptr2, l } // CHECK-LABEL: @_Z25atomicu64_op_singlethreadPyS_yy -// CHECK: atomicrmw umin ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("singlethread-one-as") -// CHECK: atomicrmw umax ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("singlethread-one-as") -// CHECK: load atomic i64, ptr %{{.*}} syncscope("singlethread-one-as") monotonic, align 8 -// CHECK: store atomic i64 %{{.*}}, ptr %{{.*}} syncscope("singlethread-one-as") monotonic, align 8 +// CHECK: atomicrmw umin ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("singlethread-one-as") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw umax ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("singlethread-one-as") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: load atomic i64, ptr %{{.*}} syncscope("singlethread-one-as") monotonic, align 
8{{$}} +// CHECK: store atomic i64 %{{.*}}, ptr %{{.*}} syncscope("singlethread-one-as") monotonic, align 8{{$}} __device__ unsigned long long atomicu64_op_singlethread(unsigned long long *ptr, unsigned long long *ptr2, unsigned long long val, unsigned long long desired) { val = __hip_atomic_fetch_min(ptr, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SINGLETHREAD); val = __hip_atomic_fetch_max(ptr, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SINGLETHREAD); @@ -228,18 +228,18 @@ __device__ unsigned long long atomicu64_op_singlethread(unsigned long long *ptr, } // CHECK-LABEL: @_Z21atomic64_op_wavefrontPxS_xx -// CHECK: cmpxchg ptr {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("wavefront-one-as") -// CHECK: cmpxchg weak ptr {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("wavefront-one-as") monotonic monotonic, align 8 -// CHECK: atomicrmw xchg ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("wavefront-one-as") -// CHECK: atomicrmw add ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("wavefront-one-as") -// CHECK: atomicrmw sub ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("wavefront-one-as") -// CHECK: atomicrmw and ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("wavefront-one-as") -// CHECK: atomicrmw or ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("wavefront-one-as") -// CHECK: atomicrmw xor ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("wavefront-one-as") -// CHECK: atomicrmw min ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("wavefront-one-as") -// CHECK: atomicrmw max ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("wavefront-one-as") -// CHECK: load atomic i64, ptr {{%[0-9]+}} syncscope("wavefront-one-as") monotonic, align 8 -// CHECK: store atomic i64 %{{.*}}, ptr %{{.*}} syncscope("wavefront-one-as") monotonic, align 8 +// CHECK: cmpxchg ptr {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("wavefront-one-as") monotonic monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: cmpxchg weak ptr {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("wavefront-one-as") monotonic monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw xchg ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("wavefront-one-as") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw add ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("wavefront-one-as") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw sub ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("wavefront-one-as") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw and ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("wavefront-one-as") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw or ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("wavefront-one-as") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw xor ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("wavefront-one-as") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw min ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("wavefront-one-as") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw max ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("wavefront-one-as") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: load atomic i64, ptr {{%[0-9]+}} syncscope("wavefront-one-as") monotonic, align 8{{$}} +// CHECK: store atomic i64 %{{.*}}, ptr %{{.*}} 
syncscope("wavefront-one-as") monotonic, align 8{{$}} __device__ long long atomic64_op_wavefront(long long *ptr, long long *ptr2, long long val, long long desired) { bool flag = __hip_atomic_compare_exchange_strong(ptr, &val, desired, __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_WAVEFRONT); flag = __hip_atomic_compare_exchange_weak(ptr, &val, desired, __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_WAVEFRONT); @@ -257,10 +257,10 @@ __device__ long long atomic64_op_wavefront(long long *ptr, long long *ptr2, long } // CHECK-LABEL: @_Z22atomicu64_op_wavefrontPyS_yy -// CHECK: atomicrmw umin ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("wavefront-one-as") -// CHECK: atomicrmw umax ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("wavefront-one-as") -// CHECK: load atomic i64, ptr {{%[0-9]+}} syncscope("wavefront-one-as") monotonic, align 8 -// CHECK: store atomic i64 %{{.*}}, ptr %{{.*}} syncscope("wavefront-one-as") monotonic, align 8 +// CHECK: atomicrmw umin ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("wavefront-one-as") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw umax ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("wavefront-one-as") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: load atomic i64, ptr {{%[0-9]+}} syncscope("wavefront-one-as") monotonic, align 8{{$}} +// CHECK: store atomic i64 %{{.*}}, ptr %{{.*}} syncscope("wavefront-one-as") monotonic, align 8{{$}} __device__ unsigned long long atomicu64_op_wavefront(unsigned long long *ptr, unsigned long long *ptr2, unsigned long long val, unsigned long long desired) { val = __hip_atomic_fetch_min(ptr, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_WAVEFRONT); val = __hip_atomic_fetch_max(ptr, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_WAVEFRONT); @@ -270,17 +270,17 @@ __device__ unsigned long long atomicu64_op_wavefront(unsigned long long *ptr, un } // CHECK-LABEL: @_Z21atomic64_op_workgroupPxS_xx -// CHECK: cmpxchg ptr {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("workgroup-one-as") -// CHECK: cmpxchg weak ptr {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("workgroup-one-as") monotonic monotonic, align 8 -// CHECK: atomicrmw xchg ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("workgroup-one-as") -// CHECK: atomicrmw add ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("workgroup-one-as") -// CHECK: atomicrmw sub ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("workgroup-one-as") -// CHECK: atomicrmw and ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("workgroup-one-as") -// CHECK: atomicrmw or ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("workgroup-one-as") -// CHECK: atomicrmw xor ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("workgroup-one-as") -// CHECK: atomicrmw min ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("workgroup-one-as") -// CHECK: atomicrmw max ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("workgroup-one-as") -// CHECK: store atomic i64 %{{.*}}, ptr %{{.*}} syncscope("workgroup-one-as") monotonic, align 8 +// CHECK: cmpxchg ptr {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("workgroup-one-as") monotonic monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: cmpxchg weak ptr {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("workgroup-one-as") monotonic monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw xchg ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("workgroup-one-as") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: 
atomicrmw add ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("workgroup-one-as") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw sub ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("workgroup-one-as") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw and ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("workgroup-one-as") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw or ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("workgroup-one-as") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw xor ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("workgroup-one-as") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw min ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("workgroup-one-as") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw max ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("workgroup-one-as") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: store atomic i64 %{{.*}}, ptr %{{.*}} syncscope("workgroup-one-as") monotonic, align 8{{$}} __device__ long long atomic64_op_workgroup(long long *ptr, long long *ptr2, long long val, long long desired) { bool flag = __hip_atomic_compare_exchange_strong(ptr, &val, desired, __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_WORKGROUP); flag = __hip_atomic_compare_exchange_weak(ptr, &val, desired, __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_WORKGROUP); @@ -297,9 +297,9 @@ __device__ long long atomic64_op_workgroup(long long *ptr, long long *ptr2, long } // CHECK-LABEL: @_Z22atomicu64_op_workgroupPyS_yy -// CHECK: atomicrmw umin ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("workgroup-one-as") -// CHECK: atomicrmw umax ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("workgroup-one-as") -// CHECK: store atomic i64 %{{.*}}, ptr %{{.*}} syncscope("workgroup-one-as") monotonic, align 8 +// CHECK: atomicrmw umin ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("workgroup-one-as") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw umax ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("workgroup-one-as") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: store atomic i64 %{{.*}}, ptr %{{.*}} syncscope("workgroup-one-as") monotonic, align 8{{$}} __device__ unsigned long long atomicu64_op_workgroup(unsigned long long *ptr, unsigned long long *ptr2, unsigned long long val, unsigned long long desired) { val = __hip_atomic_fetch_min(ptr, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_WORKGROUP); val = __hip_atomic_fetch_max(ptr, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_WORKGROUP); @@ -308,17 +308,17 @@ __device__ unsigned long long atomicu64_op_workgroup(unsigned long long *ptr, un } // CHECK-LABEL: @_Z17atomic64_op_agentPxS_xx -// CHECK: cmpxchg ptr {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("agent-one-as") -// CHECK: cmpxchg weak ptr {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("agent-one-as") monotonic monotonic, align 8 -// CHECK: atomicrmw xchg ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("agent-one-as") -// CHECK: atomicrmw add ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("agent-one-as") -// CHECK: atomicrmw sub ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("agent-one-as") -// CHECK: atomicrmw and ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("agent-one-as") -// CHECK: atomicrmw or ptr {{%[0-9]+}}, i64 
{{%[0-9]+}} syncscope("agent-one-as") -// CHECK: atomicrmw xor ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("agent-one-as") -// CHECK: atomicrmw min ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("agent-one-as") -// CHECK: atomicrmw max ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("agent-one-as") -// CHECK: store atomic i64 %{{.*}}, ptr %{{.*}} syncscope("agent-one-as") monotonic, align 8 +// CHECK: cmpxchg ptr {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("agent-one-as") monotonic monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: cmpxchg weak ptr {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("agent-one-as") monotonic monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw xchg ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("agent-one-as") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw add ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("agent-one-as") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw sub ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("agent-one-as") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw and ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("agent-one-as") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw or ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("agent-one-as") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw xor ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("agent-one-as") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw min ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("agent-one-as") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw max ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("agent-one-as") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: store atomic i64 %{{.*}}, ptr %{{.*}} syncscope("agent-one-as") monotonic, align 8{{$}} __device__ long long atomic64_op_agent(long long *ptr, long long *ptr2, long long val, long long desired) { bool flag = __hip_atomic_compare_exchange_strong(ptr, &val, desired, __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); flag = __hip_atomic_compare_exchange_weak(ptr, &val, desired, __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); @@ -335,9 +335,9 @@ __device__ long long atomic64_op_agent(long long *ptr, long long *ptr2, long lon } // CHECK-LABEL: @_Z18atomicu64_op_agentPyS_yy -// CHECK: atomicrmw umin ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("agent-one-as") -// CHECK: atomicrmw umax ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("agent-one-as") -// CHECK: store atomic i64 %{{.*}}, ptr %{{.*}} syncscope("agent-one-as") monotonic, align 8 +// CHECK: atomicrmw umin ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("agent-one-as") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw umax ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("agent-one-as") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: store atomic i64 %{{.*}}, ptr %{{.*}} syncscope("agent-one-as") monotonic, align 8{{$}} __device__ unsigned long long atomicu64_op_agent(unsigned long long *ptr, unsigned long long *ptr2, unsigned long long val, unsigned long long desired) { val = __hip_atomic_fetch_min(ptr, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); val = 
__hip_atomic_fetch_max(ptr, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); @@ -346,18 +346,18 @@ __device__ unsigned long long atomicu64_op_agent(unsigned long long *ptr, unsign } // CHECK-LABEL: @_Z18atomic64_op_systemPxS_xx -// CHECK: cmpxchg ptr {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("one-as") -// CHECK: cmpxchg weak ptr {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("one-as") monotonic monotonic, align 8 -// CHECK: atomicrmw xchg ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("one-as") -// CHECK: atomicrmw add ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("one-as") -// CHECK: atomicrmw sub ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("one-as") -// CHECK: atomicrmw and ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("one-as") -// CHECK: atomicrmw or ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("one-as") -// CHECK: atomicrmw xor ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("one-as") -// CHECK: atomicrmw min ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("one-as") -// CHECK: atomicrmw max ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("one-as") +// CHECK: cmpxchg ptr {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("one-as") monotonic monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: cmpxchg weak ptr {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("one-as") monotonic monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw xchg ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("one-as") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw add ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("one-as") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw sub ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("one-as") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw and ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("one-as") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw or ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("one-as") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw xor ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("one-as") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw min ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("one-as") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw max ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("one-as") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} // CHECK: load i64, ptr %{{.*}}, align 8 -// CHECK: store atomic i64 %{{.*}}, ptr %{{.*}} syncscope("one-as") monotonic, align 8 +// CHECK: store atomic i64 %{{.*}}, ptr %{{.*}} syncscope("one-as") monotonic, align 8{{$}} __device__ long long atomic64_op_system(long long *ptr, long long *ptr2, long long val, long long desired) { bool flag = __hip_atomic_compare_exchange_strong(ptr, &val, desired, __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM); flag = __hip_atomic_compare_exchange_weak(ptr, &val, desired, __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM); @@ -375,10 +375,10 @@ __device__ long long atomic64_op_system(long long *ptr, long long *ptr2, long lo } // CHECK-LABEL: @_Z19atomicu64_op_systemPyS_yy -// CHECK: atomicrmw umin ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("one-as") -// CHECK: atomicrmw umax ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("one-as") +// CHECK: atomicrmw 
umin ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("one-as") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw umax ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("one-as") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} // CHECK: load i64, ptr %{{.*}}, align 8 -// CHECK: store atomic i64 %{{.*}}, ptr %{{.*}} syncscope("one-as") monotonic, align 8 +// CHECK: store atomic i64 %{{.*}}, ptr %{{.*}} syncscope("one-as") monotonic, align 8{{$}} __device__ unsigned long long atomicu64_op_system(unsigned long long *ptr, unsigned long long *ptr2, unsigned long long val, unsigned long long desired) { val = __hip_atomic_fetch_min(ptr, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM); val = __hip_atomic_fetch_max(ptr, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM); @@ -386,3 +386,5 @@ __device__ unsigned long long atomicu64_op_system(unsigned long long *ptr, unsig __hip_atomic_store(ptr, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM); return val; } + +// [[$NOALIAS_ADDRSPACE_STACK]] = !{i32 5, i32 6} diff --git a/clang/test/CodeGenCXX/RelativeVTablesABI/child-inheritted-from-parent-in-comdat.cpp b/clang/test/CodeGenCXX/RelativeVTablesABI/child-inheritted-from-parent-in-comdat.cpp index bb86d459b02eafeba631fabce4769e95c31500eb..e6a945618badc428cda2f60cdc84e879453f5589 100644 --- a/clang/test/CodeGenCXX/RelativeVTablesABI/child-inheritted-from-parent-in-comdat.cpp +++ b/clang/test/CodeGenCXX/RelativeVTablesABI/child-inheritted-from-parent-in-comdat.cpp @@ -4,8 +4,8 @@ // RUN: %clang_cc1 %s -triple=aarch64-unknown-fuchsia -O1 -o - -emit-llvm -fhalf-no-semantic-interposition | FileCheck %s // The inline function is emitted in each module with the same comdat -// CHECK: $_ZTS1A = comdat any // CHECK: $_ZTI1A = comdat any +// CHECK: $_ZTS1A = comdat any // CHECK: $_ZTI1B.rtti_proxy = comdat any // The VTable is emitted everywhere used diff --git a/clang/test/CodeGenCXX/RelativeVTablesABI/inlined-key-function.cpp b/clang/test/CodeGenCXX/RelativeVTablesABI/inlined-key-function.cpp index d5d9a85d4e22f4221f03ee52236a5dd719e52752..70f8289e9df37afa698d3a07953306c2bfcdf196 100644 --- a/clang/test/CodeGenCXX/RelativeVTablesABI/inlined-key-function.cpp +++ b/clang/test/CodeGenCXX/RelativeVTablesABI/inlined-key-function.cpp @@ -4,8 +4,8 @@ // RUN: %clang_cc1 %s -triple=aarch64-unknown-fuchsia -O1 -o - -emit-llvm | FileCheck %s // CHECK: $_ZTV1A = comdat any -// CHECK: $_ZTS1A = comdat any // CHECK: $_ZTI1A = comdat any +// CHECK: $_ZTS1A = comdat any // CHECK: $_ZTI1A.rtti_proxy = comdat any // The VTable is linkonce_odr and in a comdat here because its key function is inline defined.
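For the HIP 64-bit atomics test above, the updated CHECK lines now match each instruction in full: the syncscope derived from the __HIP_MEMORY_SCOPE_* argument, the monotonic (relaxed) ordering, the alignment, and the new !noalias.addrspace metadata, whose payload !{i32 5, i32 6} is a half-open range recording that the pointer is not in address space 5 (the AMDGPU private/stack space). A minimal sketch of the pattern being checked, condensed from the functions in the diff (the function name here is made up; the builtins and scope macros are the real HIP ones):

__device__ unsigned long long min_max_agent(unsigned long long *ptr,
                                            unsigned long long val) {
  // atomicrmw umin ... syncscope("agent-one-as") monotonic, align 8, !noalias.addrspace !N
  val = __hip_atomic_fetch_min(ptr, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  // atomicrmw umax ... syncscope("agent-one-as") monotonic, align 8, !noalias.addrspace !N
  val = __hip_atomic_fetch_max(ptr, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  return val;
}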
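The RelativeVTablesABI updates here and below, like most of the remaining CodeGenCXX churn in this patch, are mechanical: Clang now emits the type_info object (_ZTI*) before the type name string (_ZTS*), so tests that pin the relative order of those globals swap the two CHECK lines; the long FUND-DEF/FUND-HID/FUND-EXP runs further down are the same swap applied to every fundamental type. A minimal sketch (not taken from the patch) of source that emits all three globals these tests order against:

// A polymorphic class with an out-of-line key function anchors its vtable
// (@_ZTV1A), type_info (@_ZTI1A), and type-name string (@_ZTS1A) in this TU;
// after this change @_ZTI1A precedes @_ZTS1A in the emitted module.
struct A {
  virtual void foo();
};
void A::foo() {}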
diff --git a/clang/test/CodeGenCXX/RelativeVTablesABI/parent-and-child-in-comdats.cpp b/clang/test/CodeGenCXX/RelativeVTablesABI/parent-and-child-in-comdats.cpp index a033ac41868f566d32a2f90f40938b10a528a87b..c1b9a9398219a82e76603242f9201be3c1331933 100644 --- a/clang/test/CodeGenCXX/RelativeVTablesABI/parent-and-child-in-comdats.cpp +++ b/clang/test/CodeGenCXX/RelativeVTablesABI/parent-and-child-in-comdats.cpp @@ -8,12 +8,12 @@ // CHECK: $_ZN1A3fooEv = comdat any // CHECK: $_ZN1B3fooEv = comdat any // CHECK: $_ZTV1A = comdat any -// CHECK: $_ZTS1A = comdat any // CHECK: $_ZTI1A = comdat any +// CHECK: $_ZTS1A = comdat any // CHECK: $_ZTI1A.rtti_proxy = comdat any // CHECK: $_ZTV1B = comdat any -// CHECK: $_ZTS1B = comdat any // CHECK: $_ZTI1B = comdat any +// CHECK: $_ZTS1B = comdat any // CHECK: $_ZTI1B.rtti_proxy = comdat any // Both the vtables for A and B are emitted, each in its own comdat. diff --git a/clang/test/CodeGenCXX/RelativeVTablesABI/parent-vtable-in-comdat.cpp b/clang/test/CodeGenCXX/RelativeVTablesABI/parent-vtable-in-comdat.cpp index 341c53146d476d1a7a35e41a5e6b97592cf403ef..d6eda793cc5b4b5647eebee8d25cbdf86839f0f7 100644 --- a/clang/test/CodeGenCXX/RelativeVTablesABI/parent-vtable-in-comdat.cpp +++ b/clang/test/CodeGenCXX/RelativeVTablesABI/parent-vtable-in-comdat.cpp @@ -7,17 +7,17 @@ // A::foo() has a comdat since it is an inline function // CHECK: $_ZN1A3fooEv = comdat any // CHECK: $_ZTV1A = comdat any +// CHECK: $_ZTI1A = comdat any // CHECK: $_ZTS1A = comdat any // The VTable for A has its own comdat section because it has no key function -// CHECK: $_ZTI1A = comdat any // CHECK: $_ZTI1A.rtti_proxy = comdat any // The VTable for A is emitted here and in a comdat section since it has no key function, and is used in this module when creating an instance of A. // CHECK: @_ZTV1A.local = linkonce_odr hidden unnamed_addr constant { [3 x i32] } { [3 x i32] [i32 0, i32 trunc (i64 sub (i64 ptrtoint (ptr @_ZTI1A.rtti_proxy to i64), i64 ptrtoint (ptr getelementptr inbounds ({ [3 x i32] }, ptr @_ZTV1A.local, i32 0, i32 0, i32 2) to i64)) to i32), i32 trunc (i64 sub (i64 ptrtoint (ptr dso_local_equivalent @_ZN1A3fooEv to i64), i64 ptrtoint (ptr getelementptr inbounds ({ [3 x i32] }, ptr @_ZTV1A.local, i32 0, i32 0, i32 2) to i64)) to i32)] }, comdat($_ZTV1A), align 4 +// CHECK: @_ZTI1A = linkonce_odr constant { ptr, ptr } { ptr getelementptr inbounds (i8, ptr @_ZTVN10__cxxabiv117__class_type_infoE, i32 8), ptr @_ZTS1A }, comdat, align 8 // CHECK: @_ZTVN10__cxxabiv117__class_type_infoE = external global [0 x ptr] // CHECK: @_ZTS1A = linkonce_odr constant [3 x i8] c"1A\00", comdat, align 1 -// CHECK: @_ZTI1A = linkonce_odr constant { ptr, ptr } { ptr getelementptr inbounds (i8, ptr @_ZTVN10__cxxabiv117__class_type_infoE, i32 8), ptr @_ZTS1A }, comdat, align 8 // CHECK: @_ZTI1A.rtti_proxy = linkonce_odr hidden unnamed_addr constant ptr @_ZTI1A, comdat // CHECK: @_ZTV1A = linkonce_odr unnamed_addr alias { [3 x i32] }, ptr @_ZTV1A.local diff --git a/clang/test/CodeGenCXX/RelativeVTablesABI/simple-vtable-definition.cpp b/clang/test/CodeGenCXX/RelativeVTablesABI/simple-vtable-definition.cpp index ad8018ee176712d61692d4fbbdc902af62652803..9dcb1c30e56275634c503f3fe3706945515bcdf0 100644 --- a/clang/test/CodeGenCXX/RelativeVTablesABI/simple-vtable-definition.cpp +++ b/clang/test/CodeGenCXX/RelativeVTablesABI/simple-vtable-definition.cpp @@ -9,9 +9,9 @@ // The vtable definition itself is private so we can take relative references to // it.
The vtable symbol will be exposed through a public alias. // CHECK: @_ZTV1A.local = internal unnamed_addr constant { [3 x i32] } { [3 x i32] [i32 0, i32 trunc (i64 sub (i64 ptrtoint (ptr @_ZTI1A.rtti_proxy to i64), i64 ptrtoint (ptr getelementptr inbounds ({ [3 x i32] }, ptr @_ZTV1A.local, i32 0, i32 0, i32 2) to i64)) to i32), i32 trunc (i64 sub (i64 ptrtoint (ptr dso_local_equivalent @_ZN1A3fooEv to i64), i64 ptrtoint (ptr getelementptr inbounds ({ [3 x i32] }, ptr @_ZTV1A.local, i32 0, i32 0, i32 2) to i64)) to i32)] }, align 4 +// CHECK: @_ZTI1A ={{.*}} constant { ptr, ptr } { ptr getelementptr inbounds (i8, ptr @_ZTVN10__cxxabiv117__class_type_infoE, i32 8), ptr @_ZTS1A }, align 8 // CHECK: @_ZTVN10__cxxabiv117__class_type_infoE = external global [0 x ptr] // CHECK: @_ZTS1A ={{.*}} constant [3 x i8] c"1A\00", align 1 -// CHECK: @_ZTI1A ={{.*}} constant { ptr, ptr } { ptr getelementptr inbounds (i8, ptr @_ZTVN10__cxxabiv117__class_type_infoE, i32 8), ptr @_ZTS1A }, align 8 // The rtti should be in a comdat // CHECK: @_ZTI1A.rtti_proxy = {{.*}}comdat diff --git a/clang/test/CodeGenCXX/RelativeVTablesABI/type-info.cpp b/clang/test/CodeGenCXX/RelativeVTablesABI/type-info.cpp index fc5ee5096433ed82b0dc30de05a3f05be93f90e2..c471e5dbd7b33ce82cb27e391af92b6168a67a92 100644 --- a/clang/test/CodeGenCXX/RelativeVTablesABI/type-info.cpp +++ b/clang/test/CodeGenCXX/RelativeVTablesABI/type-info.cpp @@ -5,12 +5,12 @@ // CHECK: $_ZTI1A.rtti_proxy = comdat any // CHECK: $_ZTI1B.rtti_proxy = comdat any +// CHECK: @_ZTI1A ={{.*}} constant { ptr, ptr } { ptr getelementptr inbounds (i8, ptr @_ZTVN10__cxxabiv117__class_type_infoE, i32 8), ptr @_ZTS1A }, align 8 // CHECK: @_ZTVN10__cxxabiv117__class_type_infoE = external global [0 x ptr] // CHECK: @_ZTS1A ={{.*}} constant [3 x i8] c"1A\00", align 1 -// CHECK: @_ZTI1A ={{.*}} constant { ptr, ptr } { ptr getelementptr inbounds (i8, ptr @_ZTVN10__cxxabiv117__class_type_infoE, i32 8), ptr @_ZTS1A }, align 8 +// CHECK: @_ZTI1B ={{.*}} constant { ptr, ptr, ptr } { ptr getelementptr inbounds (i8, ptr @_ZTVN10__cxxabiv120__si_class_type_infoE, i32 8), ptr @_ZTS1B, ptr @_ZTI1A }, align 8 // CHECK: @_ZTVN10__cxxabiv120__si_class_type_infoE = external global [0 x ptr] // CHECK: @_ZTS1B ={{.*}} constant [3 x i8] c"1B\00", align 1 -// CHECK: @_ZTI1B ={{.*}} constant { ptr, ptr, ptr } { ptr getelementptr inbounds (i8, ptr @_ZTVN10__cxxabiv120__si_class_type_infoE, i32 8), ptr @_ZTS1B, ptr @_ZTI1A }, align 8 // CHECK: @_ZTI1A.rtti_proxy = linkonce_odr hidden unnamed_addr constant ptr @_ZTI1A, comdat // CHECK: @_ZTI1B.rtti_proxy = linkonce_odr hidden unnamed_addr constant ptr @_ZTI1B, comdat diff --git a/clang/test/CodeGenCXX/aarch64-mangle-sve-vectors.cpp b/clang/test/CodeGenCXX/aarch64-mangle-sve-vectors.cpp index dfe31ff2ce25fbca4a9dbfa6d6835c19167dce7f..3f2b0622d55162a60c0379c7dda4cdb6a0d7726c 100644 --- a/clang/test/CodeGenCXX/aarch64-mangle-sve-vectors.cpp +++ b/clang/test/CodeGenCXX/aarch64-mangle-sve-vectors.cpp @@ -17,6 +17,7 @@ void f(__SVFloat16_t, __SVFloat16_t); void f(__SVFloat32_t, __SVFloat32_t); void f(__SVFloat64_t, __SVFloat64_t); void f(__SVBfloat16_t, __SVBfloat16_t); +void f(__SVMfloat8_t, __SVMfloat8_t); void f(__SVBool_t, __SVBool_t); void f(__SVCount_t, __SVCount_t); @@ -150,6 +151,7 @@ void f(__clang_svboolx4_t, __clang_svboolx4_t); // CHECK-NEXT: call void @_Z1fu13__SVFloat16_tS_( zeroinitializer, zeroinitializer) // CHECK-NEXT: call void @_Z1fu13__SVFloat32_tS_( zeroinitializer, zeroinitializer) // CHECK-NEXT: call void @_Z1fu13__SVFloat64_tS_( 
zeroinitializer, zeroinitializer) +// CHECK-NEXT: call void @_Z1fu13__SVMfloat8_tS_( zeroinitializer, zeroinitializer) // CHECK-NEXT: call void @_Z1fu14__SVBfloat16_tS_( zeroinitializer, zeroinitializer) // CHECK-NEXT: call void @_Z1fu10__SVBool_tS_( zeroinitializer, zeroinitializer) // CHECK-NEXT: call void @_Z1fu11__SVCount_tS_(target("aarch64.svcount") zeroinitializer, target("aarch64.svcount") zeroinitializer) @@ -664,6 +666,7 @@ void f(__clang_svboolx4_t, __clang_svboolx4_t); // COMPAT_17-NEXT: call void @_Z1fu13__SVFloat16_tu13__SVFloat16_t( zeroinitializer, zeroinitializer) // COMPAT_17-NEXT: call void @_Z1fu13__SVFloat32_tu13__SVFloat32_t( zeroinitializer, zeroinitializer) // COMPAT_17-NEXT: call void @_Z1fu13__SVFloat64_tu13__SVFloat64_t( zeroinitializer, zeroinitializer) +// COMPAT_17-NEXT: call void @_Z1fu13__SVMfloat8_tu13__SVMfloat8_t( zeroinitializer, zeroinitializer) // COMPAT_17-NEXT: call void @_Z1fu14__SVBFloat16_tu14__SVBFloat16_t( zeroinitializer, zeroinitializer) // COMPAT_17-NEXT: call void @_Z1fu10__SVBool_tu10__SVBool_t( zeroinitializer, zeroinitializer) // COMPAT_17-NEXT: call void @_Z1fu11__SVCount_tu11__SVCount_t(target("aarch64.svcount") zeroinitializer, target("aarch64.svcount") zeroinitializer) @@ -1100,6 +1103,7 @@ void foo() { f(__SVFloat16_t(), __SVFloat16_t()); f(__SVFloat32_t(), __SVFloat32_t()); f(__SVFloat64_t(), __SVFloat64_t()); + f(__SVMfloat8_t(), __SVMfloat8_t()); f(__SVBfloat16_t(), __SVBfloat16_t()); f(__SVBool_t(), __SVBool_t()); f(__SVCount_t(), __SVCount_t()); diff --git a/clang/test/CodeGenCXX/aarch64-sve-typeinfo.cpp b/clang/test/CodeGenCXX/aarch64-sve-typeinfo.cpp index 7f6b2a9caae6b8e901cacb5ea1c6b6eefa8a9030..beab9f9078a7736fc4baf0e77499b1c45167f049 100644 --- a/clang/test/CodeGenCXX/aarch64-sve-typeinfo.cpp +++ b/clang/test/CodeGenCXX/aarch64-sve-typeinfo.cpp @@ -21,6 +21,8 @@ auto &f64 = typeid(__SVFloat64_t); auto &bf16 = typeid(__SVBfloat16_t); +auto &mf8 = typeid(__SVMfloat8_t); + auto &b8 = typeid(__SVBool_t); auto &c8 = typeid(__SVCount_t); @@ -60,6 +62,9 @@ auto &c8 = typeid(__SVCount_t); // CHECK-DAG: @_ZTSu14__SVBfloat16_t = {{.*}} c"u14__SVBfloat16_t\00" // CHECK-DAG: @_ZTIu14__SVBfloat16_t = {{.*}} @_ZTVN10__cxxabiv123__fundamental_type_infoE, {{.*}} @_ZTSu14__SVBfloat16_t +// CHECK-DAG: @_ZTSu13__SVMfloat8_t = {{.*}} c"u13__SVMfloat8_t\00" +// CHECK-DAG: @_ZTIu13__SVMfloat8_t = {{.*}} @_ZTVN10__cxxabiv123__fundamental_type_infoE, {{.*}} @_ZTSu13__SVMfloat8_t + // CHECK-DAG: @_ZTSu10__SVBool_t = {{.*}} c"u10__SVBool_t\00" // CHECK-DAG: @_ZTIu10__SVBool_t = {{.*}} @_ZTVN10__cxxabiv123__fundamental_type_infoE, {{.*}} @_ZTSu10__SVBool_t diff --git a/clang/test/CodeGenCXX/aarch64-sve-vector-init.cpp b/clang/test/CodeGenCXX/aarch64-sve-vector-init.cpp index 503d77a1822a4bd13979127430f9889a8980f3bc..45cf8081eb3a4d85f7fb0d6ab744ffdfd872d75c 100644 --- a/clang/test/CodeGenCXX/aarch64-sve-vector-init.cpp +++ b/clang/test/CodeGenCXX/aarch64-sve-vector-init.cpp @@ -12,6 +12,7 @@ // CHECK-NEXT: [[U16:%.*]] = alloca , align 16 // CHECK-NEXT: [[U32:%.*]] = alloca , align 16 // CHECK-NEXT: [[U64:%.*]] = alloca , align 16 +// CHECK-NEXT: [[MF8:%.*]] = alloca , align 16 // CHECK-NEXT: [[F16:%.*]] = alloca , align 16 // CHECK-NEXT: [[F32:%.*]] = alloca , align 16 // CHECK-NEXT: [[F64:%.*]] = alloca , align 16 @@ -64,6 +65,7 @@ // CHECK-NEXT: store zeroinitializer, ptr [[U16]], align 16 // CHECK-NEXT: store zeroinitializer, ptr [[U32]], align 16 // CHECK-NEXT: store zeroinitializer, ptr [[U64]], align 16 +// CHECK-NEXT: store zeroinitializer, 
ptr [[MF8]], align 16 // CHECK-NEXT: store zeroinitializer, ptr [[F16]], align 16 // CHECK-NEXT: store zeroinitializer, ptr [[F32]], align 16 // CHECK-NEXT: store zeroinitializer, ptr [[F64]], align 16 @@ -119,6 +121,7 @@ void test_locals(void) { __SVUint16_t u16{}; __SVUint32_t u32{}; __SVUint64_t u64{}; + __SVMfloat8_t mf8{}; __SVFloat16_t f16{}; __SVFloat32_t f32{}; __SVFloat64_t f64{}; @@ -282,6 +285,20 @@ void test_copy_u64(__SVUint64_t a) { __SVUint64_t b{a}; } +// CHECK-LABEL: define dso_local void @_Z13test_copy_mf8u13__SVMfloat8_t +// CHECK-SAME: ( [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca , align 16 +// CHECK-NEXT: [[B:%.*]] = alloca , align 16 +// CHECK-NEXT: store [[A]], ptr [[A_ADDR]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = load , ptr [[A_ADDR]], align 16 +// CHECK-NEXT: store [[TMP0]], ptr [[B]], align 16 +// CHECK-NEXT: ret void +// +void test_copy_mf8(__SVMfloat8_t a) { + __SVMfloat8_t b{a}; +} + // CHECK-LABEL: define dso_local void @_Z13test_copy_f16u13__SVFloat16_t // CHECK-SAME: ( [[A:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: diff --git a/clang/test/CodeGenCXX/armv7k.cpp b/clang/test/CodeGenCXX/armv7k.cpp index a4a243c162ea3f3bd6967b34ad43994e09fc7dd0..7aa9fd7944cfdff3157bedc7ce24254fced6af6d 100644 --- a/clang/test/CodeGenCXX/armv7k.cpp +++ b/clang/test/CodeGenCXX/armv7k.cpp @@ -50,17 +50,17 @@ namespace test2 { struct __attribute__((visibility("hidden"))) B {}; const std::type_info &b0 = typeid(B); - // CHECK-GLOBALS: @_ZTSN5test21BE = linkonce_odr hidden constant // CHECK-GLOBALS: @_ZTIN5test21BE = linkonce_odr hidden constant { {{.*}}, ptr @_ZTSN5test21BE } + // CHECK-GLOBALS: @_ZTSN5test21BE = linkonce_odr hidden constant const std::type_info &b1 = typeid(B*); - // CHECK-GLOBALS: @_ZTSPN5test21BE = linkonce_odr hidden constant // CHECK-GLOBALS: @_ZTIPN5test21BE = linkonce_odr hidden constant { {{.*}}, ptr @_ZTSPN5test21BE, i32 0, ptr @_ZTIN5test21BE + // CHECK-GLOBALS: @_ZTSPN5test21BE = linkonce_odr hidden constant struct C {}; const std::type_info &c0 = typeid(C); - // CHECK-GLOBALS: @_ZTSN5test21CE = linkonce_odr constant [11 x i8] c"N5test21CE\00" // CHECK-GLOBALS: @_ZTIN5test21CE = linkonce_odr constant { {{.*}}, ptr @_ZTSN5test21CE } + // CHECK-GLOBALS: @_ZTSN5test21CE = linkonce_odr constant [11 x i8] c"N5test21CE\00" } // va_list should be based on "char *" rather than "ptr". 
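Alongside the reordering, the AArch64 SVE tests above add coverage for the new __SVMfloat8_t builtin type: its Itanium mangling u13__SVMfloat8_t, its typeid globals (_ZTSu13__SVMfloat8_t / _ZTIu13__SVMfloat8_t), and zero- and copy-initialization. Condensed from the diff into one sketch (assumes an AArch64 target with SVE available):

void f(__SVMfloat8_t, __SVMfloat8_t);   // mangles as _Z1fu13__SVMfloat8_tS_

void test_locals(void) {
  __SVMfloat8_t mf8{};                  // stores a scalable-vector zeroinitializer
  f(mf8, mf8);
}

void test_copy_mf8(__SVMfloat8_t a) {
  __SVMfloat8_t b{a};                   // simple load/store copy, no conversion
}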
diff --git a/clang/test/CodeGenCXX/dynamic-cast-address-space.cpp b/clang/test/CodeGenCXX/dynamic-cast-address-space.cpp index d0c87d9dfda5f779aa7e079488f09a8a3b7c33f0..271d9ede79d0c4fab00824c4c001088ee51fef3d 100644 --- a/clang/test/CodeGenCXX/dynamic-cast-address-space.cpp +++ b/clang/test/CodeGenCXX/dynamic-cast-address-space.cpp @@ -10,17 +10,17 @@ B fail; // CHECK: @_ZTV1B = linkonce_odr unnamed_addr addrspace(1) constant { [3 x ptr addrspace(1)] } { [3 x ptr addrspace(1)] [ptr addrspace(1) null, ptr addrspace(1) @_ZTI1B, ptr addrspace(1) addrspacecast (ptr @_ZN1A1fEv to ptr addrspace(1))] }, comdat, align 8 // CHECK: @fail = addrspace(1) global { ptr addrspace(1) } { ptr addrspace(1) getelementptr inbounds inrange(-16, 8) ({ [3 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTV1B, i32 0, i32 0, i32 2) }, align 8 // CHECK: @_ZTI1A = external addrspace(1) constant ptr addrspace(1) +// CHECK: @_ZTI1B = linkonce_odr addrspace(1) constant { ptr addrspace(1), ptr addrspace(1), ptr addrspace(1) } { ptr addrspace(1) getelementptr inbounds (ptr addrspace(1), ptr addrspace(1) @_ZTVN10__cxxabiv120__si_class_type_infoE, i64 2), ptr addrspace(1) @_ZTS1B, ptr addrspace(1) @_ZTI1A }, comdat, align 8 // CHECK: @_ZTVN10__cxxabiv120__si_class_type_infoE = external addrspace(1) global [0 x ptr addrspace(1)] // CHECK: @_ZTS1B = linkonce_odr addrspace(1) constant [3 x i8] c"1B\00", comdat, align 1 -// CHECK: @_ZTI1B = linkonce_odr addrspace(1) constant { ptr addrspace(1), ptr addrspace(1), ptr addrspace(1) } { ptr addrspace(1) getelementptr inbounds (ptr addrspace(1), ptr addrspace(1) @_ZTVN10__cxxabiv120__si_class_type_infoE, i64 2), ptr addrspace(1) @_ZTS1B, ptr addrspace(1) @_ZTI1A }, comdat, align 8 // CHECK: @__oclc_ABI_version = weak_odr hidden local_unnamed_addr addrspace(4) constant i32 500 //. // WITH-NONZERO-DEFAULT-AS: @_ZTV1B = linkonce_odr unnamed_addr addrspace(1) constant { [3 x ptr addrspace(1)] } { [3 x ptr addrspace(1)] [ptr addrspace(1) null, ptr addrspace(1) @_ZTI1B, ptr addrspace(1) addrspacecast (ptr addrspace(4) @_ZN1A1fEv to ptr addrspace(1))] }, comdat, align 8 // WITH-NONZERO-DEFAULT-AS: @fail = addrspace(1) global { ptr addrspace(1) } { ptr addrspace(1) getelementptr inbounds inrange(-16, 8) ({ [3 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTV1B, i32 0, i32 0, i32 2) }, align 8 // WITH-NONZERO-DEFAULT-AS: @_ZTI1A = external addrspace(1) constant ptr addrspace(1) +// WITH-NONZERO-DEFAULT-AS: @_ZTI1B = linkonce_odr addrspace(1) constant { ptr addrspace(1), ptr addrspace(1), ptr addrspace(1) } { ptr addrspace(1) getelementptr inbounds (ptr addrspace(1), ptr addrspace(1) @_ZTVN10__cxxabiv120__si_class_type_infoE, i64 2), ptr addrspace(1) @_ZTS1B, ptr addrspace(1) @_ZTI1A }, comdat, align 8 // WITH-NONZERO-DEFAULT-AS: @_ZTVN10__cxxabiv120__si_class_type_infoE = external addrspace(1) global [0 x ptr addrspace(1)] // WITH-NONZERO-DEFAULT-AS: @_ZTS1B = linkonce_odr addrspace(1) constant [3 x i8] c"1B\00", comdat, align 1 -// WITH-NONZERO-DEFAULT-AS: @_ZTI1B = linkonce_odr addrspace(1) constant { ptr addrspace(1), ptr addrspace(1), ptr addrspace(1) } { ptr addrspace(1) getelementptr inbounds (ptr addrspace(1), ptr addrspace(1) @_ZTVN10__cxxabiv120__si_class_type_infoE, i64 2), ptr addrspace(1) @_ZTS1B, ptr addrspace(1) @_ZTI1A }, comdat, align 8 //. 
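The dynamic-cast test whose globals are checked above can be read back from its mangled names; the following is a reconstruction, not the verbatim test source (class and member names follow the _ZN1A1fEv and _ZTI1B symbols in the checks). @_Z1fP1A is f(A*) performing a reference-style dynamic_cast, which is why the definition carries the __gxx_personality_v0 personality seen in the CHECK-LABEL that follows, and why the RTTI globals must live in the target's global address space, addrspace(1):

struct A {
  virtual void f();       // @_ZN1A1fEv, referenced from B's vtable
};
struct B : A {};
B fail;                   // forces emission of @_ZTV1B, @_ZTI1B, @_ZTS1B

B &f(A *a) {
  // dynamic_cast to a reference throws std::bad_cast on failure,
  // so this function needs an exception-handling personality.
  return dynamic_cast<B &>(*a);
}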
// CHECK-LABEL: define dso_local noundef nonnull align 8 dereferenceable(8) ptr @_Z1fP1A( // CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0:[0-9]+]] personality ptr @__gxx_personality_v0 { diff --git a/clang/test/CodeGenCXX/exceptions-no-rtti.cpp b/clang/test/CodeGenCXX/exceptions-no-rtti.cpp index 7c73285b948f16132df14867a800072d6ddc60ba..a3d969665bdc71fe9a2fa43e0610fd8597b91418 100644 --- a/clang/test/CodeGenCXX/exceptions-no-rtti.cpp +++ b/clang/test/CodeGenCXX/exceptions-no-rtti.cpp @@ -3,8 +3,8 @@ // CHECK: @_ZTIN5test11AE = linkonce_odr constant // CHECK: @_ZTIN5test11BE = linkonce_odr constant // CHECK: @_ZTIN5test11CE = linkonce_odr constant -// CHECK: @_ZTIN5test11DE = linkonce_odr constant // CHECK: @_ZTIPN5test11DE = linkonce_odr constant {{.*}} @_ZTIN5test11DE +// CHECK: @_ZTIN5test11DE = linkonce_odr constant // PR6974: this shouldn't crash namespace test0 { diff --git a/clang/test/CodeGenCXX/implicit-record-visibility.cpp b/clang/test/CodeGenCXX/implicit-record-visibility.cpp index ef388c7b8316a088c7d2cb99b542aae05ec6d494..84ad822702d39bee5c1b41ae55434fb2cb9d7b4f 100644 --- a/clang/test/CodeGenCXX/implicit-record-visibility.cpp +++ b/clang/test/CodeGenCXX/implicit-record-visibility.cpp @@ -7,6 +7,6 @@ // under -fvisibility=hidden the type of function f, due to its va_list (aka // __builtin_va_list, aka __va_list_tag (*)[1]) parameter would be hidden: -// CHECK: @_ZTSFvP13__va_list_tagE = linkonce_odr constant // CHECK: @_ZTIFvP13__va_list_tagE = linkonce_odr constant +// CHECK: @_ZTSFvP13__va_list_tagE = linkonce_odr constant void f(va_list) { (void)typeid(f); } diff --git a/clang/test/CodeGenCXX/mdefault-visibility-export-mapping-rtti.cpp b/clang/test/CodeGenCXX/mdefault-visibility-export-mapping-rtti.cpp index 1af105e915e6363b87c62a86fe9c8482667f8819..2fc0a6a4ee608e19f0d933ae1b433adc6d52704c 100644 --- a/clang/test/CodeGenCXX/mdefault-visibility-export-mapping-rtti.cpp +++ b/clang/test/CodeGenCXX/mdefault-visibility-export-mapping-rtti.cpp @@ -16,20 +16,20 @@ // C is an incomplete class type, so any direct or indirect pointer types should have // internal linkage, as should the type info for C itself. 
struct C; +// CHECK: @_ZTIP1C = internal constant // CHECK: @_ZTSP1C = internal constant -// CHECK: @_ZTS1C = internal constant // CHECK: @_ZTI1C = internal constant -// CHECK: @_ZTIP1C = internal constant -// CHECK: @_ZTSPP1C = internal constant +// CHECK: @_ZTS1C = internal constant // CHECK: @_ZTIPP1C = internal constant +// CHECK: @_ZTSPP1C = internal constant struct __attribute__((type_visibility("default"))) D; +// CHECK: @_ZTIP1D = internal constant // CHECK: @_ZTSP1D = internal constant -// CHECK: @_ZTS1D = internal constant // CHECK: @_ZTI1D = internal constant -// CHECK: @_ZTIP1D = internal constant -// CHECK: @_ZTSPP1D = internal constant +// CHECK: @_ZTS1D = internal constant // CHECK: @_ZTIPP1D = internal constant +// CHECK: @_ZTSPP1D = internal constant void __attribute__((visibility("default"))) tfunc() { (void)typeid(C *); @@ -46,12 +46,12 @@ void s::foo() {} // UNSPECIFIED-DEF: @_ZTV1s = unnamed_addr constant // UNSPECIFIED-HID: @_ZTV1s = hidden unnamed_addr constant // UNSPECIFIED-EXP: @_ZTV1s = dllexport unnamed_addr constant -// UNSPECIFIED-DEF: @_ZTS1s = constant -// UNSPECIFIED-HID: @_ZTS1s = hidden constant -// UNSPECIFIED-EXP: @_ZTS1s = dllexport constant // UNSPECIFIED-DEF: @_ZTI1s = constant // UNSPECIFIED-HID: @_ZTI1s = hidden constant // UNSPECIFIED-EXP: @_ZTI1s = dllexport constant +// UNSPECIFIED-DEF: @_ZTS1s = constant +// UNSPECIFIED-HID: @_ZTS1s = hidden constant +// UNSPECIFIED-EXP: @_ZTS1s = dllexport constant // explicit default visibility RTTI & vtable struct __attribute__((type_visibility("default"))) t { @@ -61,12 +61,12 @@ void t::foo() {} // EXPLICIT-DEF: @_ZTV1t = unnamed_addr constant // EXPLICIT-HID: @_ZTV1t = hidden unnamed_addr constant // EXPLICIT-EXP: @_ZTV1t = dllexport unnamed_addr constant -// EXPLICIT-DEF: @_ZTS1t = constant -// EXPLICIT-HID: @_ZTS1t = hidden constant -// EXPLICIT-EXP: @_ZTS1t = dllexport constant // EXPLICIT-DEF: @_ZTI1t = constant // EXPLICIT-HID: @_ZTI1t = hidden constant // EXPLICIT-EXP: @_ZTI1t = dllexport constant +// EXPLICIT-DEF: @_ZTS1t = constant +// EXPLICIT-HID: @_ZTS1t = hidden constant +// EXPLICIT-EXP: @_ZTS1t = dllexport constant #ifdef FUNDAMENTAL_IS_EXPLICIT #define TYPE_VIS __attribute__((type_visibility("default"))) @@ -86,511 +86,511 @@ __fundamental_type_info::~__fundamental_type_info() {} // __cxxabiv1::__fundamental_type_info // FUND-DEF: @_ZTVN10__cxxabiv123__fundamental_type_infoE = unnamed_addr constant -// FUND-DEF: @_ZTSN10__cxxabiv123__fundamental_type_infoE = constant // FUND-DEF: @_ZTIN10__cxxabiv123__fundamental_type_infoE = constant +// FUND-DEF: @_ZTSN10__cxxabiv123__fundamental_type_infoE = constant // FUND-HID: @_ZTVN10__cxxabiv123__fundamental_type_infoE = hidden unnamed_addr constant -// FUND-HID: @_ZTSN10__cxxabiv123__fundamental_type_infoE = hidden constant // FUND-HID: @_ZTIN10__cxxabiv123__fundamental_type_infoE = hidden constant +// FUND-HID: @_ZTSN10__cxxabiv123__fundamental_type_infoE = hidden constant // FUND-EXP: @_ZTVN10__cxxabiv123__fundamental_type_infoE = dllexport unnamed_addr constant -// FUND-EXP: @_ZTSN10__cxxabiv123__fundamental_type_infoE = dllexport constant // FUND-EXP: @_ZTIN10__cxxabiv123__fundamental_type_infoE = dllexport constant +// FUND-EXP: @_ZTSN10__cxxabiv123__fundamental_type_infoE = dllexport constant // void -// FUND-DEF: @_ZTSv = constant // FUND-DEF: @_ZTIv = constant -// FUND-DEF: @_ZTSPv = constant +// FUND-DEF: @_ZTSv = constant // FUND-DEF: @_ZTIPv = constant -// FUND-DEF: @_ZTSPKv = constant +// FUND-DEF: @_ZTSPv = constant // FUND-DEF: 
@_ZTIPKv = constant -// FUND-HID: @_ZTSv = hidden constant +// FUND-DEF: @_ZTSPKv = constant // FUND-HID: @_ZTIv = hidden constant -// FUND-HID: @_ZTSPv = hidden constant +// FUND-HID: @_ZTSv = hidden constant // FUND-HID: @_ZTIPv = hidden constant -// FUND-HID: @_ZTSPKv = hidden constant +// FUND-HID: @_ZTSPv = hidden constant // FUND-HID: @_ZTIPKv = hidden constant -// FUND-EXP: @_ZTSv = dllexport constant +// FUND-HID: @_ZTSPKv = hidden constant // FUND-EXP: @_ZTIv = dllexport constant -// FUND-EXP: @_ZTSPv = dllexport constant +// FUND-EXP: @_ZTSv = dllexport constant // FUND-EXP: @_ZTIPv = dllexport constant -// FUND-EXP: @_ZTSPKv = dllexport constant +// FUND-EXP: @_ZTSPv = dllexport constant // FUND-EXP: @_ZTIPKv = dllexport constant +// FUND-EXP: @_ZTSPKv = dllexport constant // std::nullptr_t -// FUND-DEF: @_ZTSDn = constant // FUND-DEF: @_ZTIDn = constant -// FUND-DEF: @_ZTSPDn = constant +// FUND-DEF: @_ZTSDn = constant // FUND-DEF: @_ZTIPDn = constant -// FUND-DEF: @_ZTSPKDn = constant +// FUND-DEF: @_ZTSPDn = constant // FUND-DEF: @_ZTIPKDn = constant -// FUND-HID: @_ZTSDn = hidden constant +// FUND-DEF: @_ZTSPKDn = constant // FUND-HID: @_ZTIDn = hidden constant -// FUND-HID: @_ZTSPDn = hidden constant +// FUND-HID: @_ZTSDn = hidden constant // FUND-HID: @_ZTIPDn = hidden constant -// FUND-HID: @_ZTSPKDn = hidden constant +// FUND-HID: @_ZTSPDn = hidden constant // FUND-HID: @_ZTIPKDn = hidden constant -// FUND-EXP: @_ZTSDn = dllexport constant +// FUND-HID: @_ZTSPKDn = hidden constant // FUND-EXP: @_ZTIDn = dllexport constant -// FUND-EXP: @_ZTSPDn = dllexport constant +// FUND-EXP: @_ZTSDn = dllexport constant // FUND-EXP: @_ZTIPDn = dllexport constant -// FUND-EXP: @_ZTSPKDn = dllexport constant +// FUND-EXP: @_ZTSPDn = dllexport constant // FUND-EXP: @_ZTIPKDn = dllexport constant +// FUND-EXP: @_ZTSPKDn = dllexport constant // bool -// FUND-DEF: @_ZTSb = constant // FUND-DEF: @_ZTIb = constant -// FUND-DEF: @_ZTSPb = constant +// FUND-DEF: @_ZTSb = constant // FUND-DEF: @_ZTIPb = constant -// FUND-DEF: @_ZTSPKb = constant +// FUND-DEF: @_ZTSPb = constant // FUND-DEF: @_ZTIPKb = constant -// FUND-HID: @_ZTSb = hidden constant +// FUND-DEF: @_ZTSPKb = constant // FUND-HID: @_ZTIb = hidden constant -// FUND-HID: @_ZTSPb = hidden constant +// FUND-HID: @_ZTSb = hidden constant // FUND-HID: @_ZTIPb = hidden constant -// FUND-HID: @_ZTSPKb = hidden constant +// FUND-HID: @_ZTSPb = hidden constant // FUND-HID: @_ZTIPKb = hidden constant -// FUND-EXP: @_ZTSb = dllexport constant +// FUND-HID: @_ZTSPKb = hidden constant // FUND-EXP: @_ZTIb = dllexport constant -// FUND-EXP: @_ZTSPb = dllexport constant +// FUND-EXP: @_ZTSb = dllexport constant // FUND-EXP: @_ZTIPb = dllexport constant -// FUND-EXP: @_ZTSPKb = dllexport constant +// FUND-EXP: @_ZTSPb = dllexport constant // FUND-EXP: @_ZTIPKb = dllexport constant +// FUND-EXP: @_ZTSPKb = dllexport constant // wchar_t -// FUND-DEF: @_ZTSw = constant // FUND-DEF: @_ZTIw = constant -// FUND-DEF: @_ZTSPw = constant +// FUND-DEF: @_ZTSw = constant // FUND-DEF: @_ZTIPw = constant -// FUND-DEF: @_ZTSPKw = constant +// FUND-DEF: @_ZTSPw = constant // FUND-DEF: @_ZTIPKw = constant -// FUND-HID: @_ZTSw = hidden constant +// FUND-DEF: @_ZTSPKw = constant // FUND-HID: @_ZTIw = hidden constant -// FUND-HID: @_ZTSPw = hidden constant +// FUND-HID: @_ZTSw = hidden constant // FUND-HID: @_ZTIPw = hidden constant -// FUND-HID: @_ZTSPKw = hidden constant +// FUND-HID: @_ZTSPw = hidden constant // FUND-HID: @_ZTIPKw = hidden constant -// FUND-EXP: 
@_ZTSw = dllexport constant +// FUND-HID: @_ZTSPKw = hidden constant // FUND-EXP: @_ZTIw = dllexport constant -// FUND-EXP: @_ZTSPw = dllexport constant +// FUND-EXP: @_ZTSw = dllexport constant // FUND-EXP: @_ZTIPw = dllexport constant -// FUND-EXP: @_ZTSPKw = dllexport constant +// FUND-EXP: @_ZTSPw = dllexport constant // FUND-EXP: @_ZTIPKw = dllexport constant +// FUND-EXP: @_ZTSPKw = dllexport constant // char -// FUND-DEF: @_ZTSc = constant // FUND-DEF: @_ZTIc = constant -// FUND-DEF: @_ZTSPc = constant +// FUND-DEF: @_ZTSc = constant // FUND-DEF: @_ZTIPc = constant -// FUND-DEF: @_ZTSPKc = constant +// FUND-DEF: @_ZTSPc = constant // FUND-DEF: @_ZTIPKc = constant -// FUND-HID: @_ZTSc = hidden constant +// FUND-DEF: @_ZTSPKc = constant // FUND-HID: @_ZTIc = hidden constant -// FUND-HID: @_ZTSPc = hidden constant +// FUND-HID: @_ZTSc = hidden constant // FUND-HID: @_ZTIPc = hidden constant -// FUND-HID: @_ZTSPKc = hidden constant +// FUND-HID: @_ZTSPc = hidden constant // FUND-HID: @_ZTIPKc = hidden constant -// FUND-EXP: @_ZTSc = dllexport constant +// FUND-HID: @_ZTSPKc = hidden constant // FUND-EXP: @_ZTIc = dllexport constant -// FUND-EXP: @_ZTSPc = dllexport constant +// FUND-EXP: @_ZTSc = dllexport constant // FUND-EXP: @_ZTIPc = dllexport constant -// FUND-EXP: @_ZTSPKc = dllexport constant +// FUND-EXP: @_ZTSPc = dllexport constant // FUND-EXP: @_ZTIPKc = dllexport constant +// FUND-EXP: @_ZTSPKc = dllexport constant // unsigned char -// FUND-DEF: @_ZTSh = constant // FUND-DEF: @_ZTIh = constant -// FUND-DEF: @_ZTSPh = constant +// FUND-DEF: @_ZTSh = constant // FUND-DEF: @_ZTIPh = constant -// FUND-DEF: @_ZTSPKh = constant +// FUND-DEF: @_ZTSPh = constant // FUND-DEF: @_ZTIPKh = constant -// FUND-HID: @_ZTSh = hidden constant +// FUND-DEF: @_ZTSPKh = constant // FUND-HID: @_ZTIh = hidden constant -// FUND-HID: @_ZTSPh = hidden constant +// FUND-HID: @_ZTSh = hidden constant // FUND-HID: @_ZTIPh = hidden constant -// FUND-HID: @_ZTSPKh = hidden constant +// FUND-HID: @_ZTSPh = hidden constant // FUND-HID: @_ZTIPKh = hidden constant -// FUND-EXP: @_ZTSh = dllexport constant +// FUND-HID: @_ZTSPKh = hidden constant // FUND-EXP: @_ZTIh = dllexport constant -// FUND-EXP: @_ZTSPh = dllexport constant +// FUND-EXP: @_ZTSh = dllexport constant // FUND-EXP: @_ZTIPh = dllexport constant -// FUND-EXP: @_ZTSPKh = dllexport constant +// FUND-EXP: @_ZTSPh = dllexport constant // FUND-EXP: @_ZTIPKh = dllexport constant +// FUND-EXP: @_ZTSPKh = dllexport constant // signed char -// FUND-DEF: @_ZTSa = constant // FUND-DEF: @_ZTIa = constant -// FUND-DEF: @_ZTSPa = constant +// FUND-DEF: @_ZTSa = constant // FUND-DEF: @_ZTIPa = constant -// FUND-DEF: @_ZTSPKa = constant +// FUND-DEF: @_ZTSPa = constant // FUND-DEF: @_ZTIPKa = constant -// FUND-HID: @_ZTSa = hidden constant +// FUND-DEF: @_ZTSPKa = constant // FUND-HID: @_ZTIa = hidden constant -// FUND-HID: @_ZTSPa = hidden constant +// FUND-HID: @_ZTSa = hidden constant // FUND-HID: @_ZTIPa = hidden constant -// FUND-HID: @_ZTSPKa = hidden constant +// FUND-HID: @_ZTSPa = hidden constant // FUND-HID: @_ZTIPKa = hidden constant -// FUND-EXP: @_ZTSa = dllexport constant +// FUND-HID: @_ZTSPKa = hidden constant // FUND-EXP: @_ZTIa = dllexport constant -// FUND-EXP: @_ZTSPa = dllexport constant +// FUND-EXP: @_ZTSa = dllexport constant // FUND-EXP: @_ZTIPa = dllexport constant -// FUND-EXP: @_ZTSPKa = dllexport constant +// FUND-EXP: @_ZTSPa = dllexport constant // FUND-EXP: @_ZTIPKa = dllexport constant +// FUND-EXP: @_ZTSPKa = dllexport 
constant // short -// FUND-DEF: @_ZTSs = constant // FUND-DEF: @_ZTIs = constant -// FUND-DEF: @_ZTSPs = constant +// FUND-DEF: @_ZTSs = constant // FUND-DEF: @_ZTIPs = constant -// FUND-DEF: @_ZTSPKs = constant +// FUND-DEF: @_ZTSPs = constant // FUND-DEF: @_ZTIPKs = constant -// FUND-HID: @_ZTSs = hidden constant +// FUND-DEF: @_ZTSPKs = constant // FUND-HID: @_ZTIs = hidden constant -// FUND-HID: @_ZTSPs = hidden constant +// FUND-HID: @_ZTSs = hidden constant // FUND-HID: @_ZTIPs = hidden constant -// FUND-HID: @_ZTSPKs = hidden constant +// FUND-HID: @_ZTSPs = hidden constant // FUND-HID: @_ZTIPKs = hidden constant -// FUND-EXP: @_ZTSs = dllexport constant +// FUND-HID: @_ZTSPKs = hidden constant // FUND-EXP: @_ZTIs = dllexport constant -// FUND-EXP: @_ZTSPs = dllexport constant +// FUND-EXP: @_ZTSs = dllexport constant // FUND-EXP: @_ZTIPs = dllexport constant -// FUND-EXP: @_ZTSPKs = dllexport constant +// FUND-EXP: @_ZTSPs = dllexport constant // FUND-EXP: @_ZTIPKs = dllexport constant +// FUND-EXP: @_ZTSPKs = dllexport constant // unsigned short -// FUND-DEF: @_ZTSt = constant // FUND-DEF: @_ZTIt = constant -// FUND-DEF: @_ZTSPt = constant +// FUND-DEF: @_ZTSt = constant // FUND-DEF: @_ZTIPt = constant -// FUND-DEF: @_ZTSPKt = constant +// FUND-DEF: @_ZTSPt = constant // FUND-DEF: @_ZTIPKt = constant -// FUND-HID: @_ZTSt = hidden constant +// FUND-DEF: @_ZTSPKt = constant // FUND-HID: @_ZTIt = hidden constant -// FUND-HID: @_ZTSPt = hidden constant +// FUND-HID: @_ZTSt = hidden constant // FUND-HID: @_ZTIPt = hidden constant -// FUND-HID: @_ZTSPKt = hidden constant +// FUND-HID: @_ZTSPt = hidden constant // FUND-HID: @_ZTIPKt = hidden constant -// FUND-EXP: @_ZTSt = dllexport constant +// FUND-HID: @_ZTSPKt = hidden constant // FUND-EXP: @_ZTIt = dllexport constant -// FUND-EXP: @_ZTSPt = dllexport constant +// FUND-EXP: @_ZTSt = dllexport constant // FUND-EXP: @_ZTIPt = dllexport constant -// FUND-EXP: @_ZTSPKt = dllexport constant +// FUND-EXP: @_ZTSPt = dllexport constant // FUND-EXP: @_ZTIPKt = dllexport constant +// FUND-EXP: @_ZTSPKt = dllexport constant // int -// FUND-DEF: @_ZTSi = constant // FUND-DEF: @_ZTIi = constant -// FUND-DEF: @_ZTSPi = constant +// FUND-DEF: @_ZTSi = constant // FUND-DEF: @_ZTIPi = constant -// FUND-DEF: @_ZTSPKi = constant +// FUND-DEF: @_ZTSPi = constant // FUND-DEF: @_ZTIPKi = constant -// FUND-HID: @_ZTSi = hidden constant +// FUND-DEF: @_ZTSPKi = constant // FUND-HID: @_ZTIi = hidden constant -// FUND-HID: @_ZTSPi = hidden constant +// FUND-HID: @_ZTSi = hidden constant // FUND-HID: @_ZTIPi = hidden constant -// FUND-HID: @_ZTSPKi = hidden constant +// FUND-HID: @_ZTSPi = hidden constant // FUND-HID: @_ZTIPKi = hidden constant -// FUND-EXP: @_ZTSi = dllexport constant +// FUND-HID: @_ZTSPKi = hidden constant // FUND-EXP: @_ZTIi = dllexport constant -// FUND-EXP: @_ZTSPi = dllexport constant +// FUND-EXP: @_ZTSi = dllexport constant // FUND-EXP: @_ZTIPi = dllexport constant -// FUND-EXP: @_ZTSPKi = dllexport constant +// FUND-EXP: @_ZTSPi = dllexport constant // FUND-EXP: @_ZTIPKi = dllexport constant +// FUND-EXP: @_ZTSPKi = dllexport constant // unsigned int -// FUND-DEF: @_ZTSj = constant // FUND-DEF: @_ZTIj = constant -// FUND-DEF: @_ZTSPj = constant +// FUND-DEF: @_ZTSj = constant // FUND-DEF: @_ZTIPj = constant -// FUND-DEF: @_ZTSPKj = constant +// FUND-DEF: @_ZTSPj = constant // FUND-DEF: @_ZTIPKj = constant -// FUND-HID: @_ZTSj = hidden constant +// FUND-DEF: @_ZTSPKj = constant // FUND-HID: @_ZTIj = hidden constant -// FUND-HID: 
@_ZTSPj = hidden constant +// FUND-HID: @_ZTSj = hidden constant // FUND-HID: @_ZTIPj = hidden constant -// FUND-HID: @_ZTSPKj = hidden constant +// FUND-HID: @_ZTSPj = hidden constant // FUND-HID: @_ZTIPKj = hidden constant -// FUND-EXP: @_ZTSj = dllexport constant +// FUND-HID: @_ZTSPKj = hidden constant // FUND-EXP: @_ZTIj = dllexport constant -// FUND-EXP: @_ZTSPj = dllexport constant +// FUND-EXP: @_ZTSj = dllexport constant // FUND-EXP: @_ZTIPj = dllexport constant -// FUND-EXP: @_ZTSPKj = dllexport constant +// FUND-EXP: @_ZTSPj = dllexport constant // FUND-EXP: @_ZTIPKj = dllexport constant +// FUND-EXP: @_ZTSPKj = dllexport constant // long -// FUND-DEF: @_ZTSl = constant // FUND-DEF: @_ZTIl = constant -// FUND-DEF: @_ZTSPl = constant +// FUND-DEF: @_ZTSl = constant // FUND-DEF: @_ZTIPl = constant -// FUND-DEF: @_ZTSPKl = constant +// FUND-DEF: @_ZTSPl = constant // FUND-DEF: @_ZTIPKl = constant -// FUND-HID: @_ZTSl = hidden constant +// FUND-DEF: @_ZTSPKl = constant // FUND-HID: @_ZTIl = hidden constant -// FUND-HID: @_ZTSPl = hidden constant +// FUND-HID: @_ZTSl = hidden constant // FUND-HID: @_ZTIPl = hidden constant -// FUND-HID: @_ZTSPKl = hidden constant +// FUND-HID: @_ZTSPl = hidden constant // FUND-HID: @_ZTIPKl = hidden constant -// FUND-EXP: @_ZTSl = dllexport constant +// FUND-HID: @_ZTSPKl = hidden constant // FUND-EXP: @_ZTIl = dllexport constant -// FUND-EXP: @_ZTSPl = dllexport constant +// FUND-EXP: @_ZTSl = dllexport constant // FUND-EXP: @_ZTIPl = dllexport constant -// FUND-EXP: @_ZTSPKl = dllexport constant +// FUND-EXP: @_ZTSPl = dllexport constant // FUND-EXP: @_ZTIPKl = dllexport constant +// FUND-EXP: @_ZTSPKl = dllexport constant // unsigned long -// FUND-DEF: @_ZTSm = constant // FUND-DEF: @_ZTIm = constant -// FUND-DEF: @_ZTSPm = constant +// FUND-DEF: @_ZTSm = constant // FUND-DEF: @_ZTIPm = constant -// FUND-DEF: @_ZTSPKm = constant +// FUND-DEF: @_ZTSPm = constant // FUND-DEF: @_ZTIPKm = constant -// FUND-HID: @_ZTSm = hidden constant +// FUND-DEF: @_ZTSPKm = constant // FUND-HID: @_ZTIm = hidden constant -// FUND-HID: @_ZTSPm = hidden constant +// FUND-HID: @_ZTSm = hidden constant // FUND-HID: @_ZTIPm = hidden constant -// FUND-HID: @_ZTSPKm = hidden constant +// FUND-HID: @_ZTSPm = hidden constant // FUND-HID: @_ZTIPKm = hidden constant -// FUND-EXP: @_ZTSm = dllexport constant +// FUND-HID: @_ZTSPKm = hidden constant // FUND-EXP: @_ZTIm = dllexport constant -// FUND-EXP: @_ZTSPm = dllexport constant +// FUND-EXP: @_ZTSm = dllexport constant // FUND-EXP: @_ZTIPm = dllexport constant -// FUND-EXP: @_ZTSPKm = dllexport constant +// FUND-EXP: @_ZTSPm = dllexport constant // FUND-EXP: @_ZTIPKm = dllexport constant +// FUND-EXP: @_ZTSPKm = dllexport constant // long long -// FUND-DEF: @_ZTSx = constant // FUND-DEF: @_ZTIx = constant -// FUND-DEF: @_ZTSPx = constant +// FUND-DEF: @_ZTSx = constant // FUND-DEF: @_ZTIPx = constant -// FUND-DEF: @_ZTSPKx = constant +// FUND-DEF: @_ZTSPx = constant // FUND-DEF: @_ZTIPKx = constant -// FUND-HID: @_ZTSx = hidden constant +// FUND-DEF: @_ZTSPKx = constant // FUND-HID: @_ZTIx = hidden constant -// FUND-HID: @_ZTSPx = hidden constant +// FUND-HID: @_ZTSx = hidden constant // FUND-HID: @_ZTIPx = hidden constant -// FUND-HID: @_ZTSPKx = hidden constant +// FUND-HID: @_ZTSPx = hidden constant // FUND-HID: @_ZTIPKx = hidden constant -// FUND-EXP: @_ZTSx = dllexport constant +// FUND-HID: @_ZTSPKx = hidden constant // FUND-EXP: @_ZTIx = dllexport constant -// FUND-EXP: @_ZTSPx = dllexport constant +// FUND-EXP: 
@_ZTSx = dllexport constant // FUND-EXP: @_ZTIPx = dllexport constant -// FUND-EXP: @_ZTSPKx = dllexport constant +// FUND-EXP: @_ZTSPx = dllexport constant // FUND-EXP: @_ZTIPKx = dllexport constant +// FUND-EXP: @_ZTSPKx = dllexport constant // unsigned long long -// FUND-DEF: @_ZTSy = constant // FUND-DEF: @_ZTIy = constant -// FUND-DEF: @_ZTSPy = constant +// FUND-DEF: @_ZTSy = constant // FUND-DEF: @_ZTIPy = constant -// FUND-DEF: @_ZTSPKy = constant +// FUND-DEF: @_ZTSPy = constant // FUND-DEF: @_ZTIPKy = constant -// FUND-HID: @_ZTSy = hidden constant +// FUND-DEF: @_ZTSPKy = constant // FUND-HID: @_ZTIy = hidden constant -// FUND-HID: @_ZTSPy = hidden constant +// FUND-HID: @_ZTSy = hidden constant // FUND-HID: @_ZTIPy = hidden constant -// FUND-HID: @_ZTSPKy = hidden constant +// FUND-HID: @_ZTSPy = hidden constant // FUND-HID: @_ZTIPKy = hidden constant -// FUND-EXP: @_ZTSy = dllexport constant +// FUND-HID: @_ZTSPKy = hidden constant // FUND-EXP: @_ZTIy = dllexport constant -// FUND-EXP: @_ZTSPy = dllexport constant +// FUND-EXP: @_ZTSy = dllexport constant // FUND-EXP: @_ZTIPy = dllexport constant -// FUND-EXP: @_ZTSPKy = dllexport constant +// FUND-EXP: @_ZTSPy = dllexport constant // FUND-EXP: @_ZTIPKy = dllexport constant +// FUND-EXP: @_ZTSPKy = dllexport constant // __int128 -// FUND-DEF: @_ZTSn = constant // FUND-DEF: @_ZTIn = constant -// FUND-DEF: @_ZTSPn = constant +// FUND-DEF: @_ZTSn = constant // FUND-DEF: @_ZTIPn = constant -// FUND-DEF: @_ZTSPKn = constant +// FUND-DEF: @_ZTSPn = constant // FUND-DEF: @_ZTIPKn = constant -// FUND-HID: @_ZTSn = hidden constant +// FUND-DEF: @_ZTSPKn = constant // FUND-HID: @_ZTIn = hidden constant -// FUND-HID: @_ZTSPn = hidden constant +// FUND-HID: @_ZTSn = hidden constant // FUND-HID: @_ZTIPn = hidden constant -// FUND-HID: @_ZTSPKn = hidden constant +// FUND-HID: @_ZTSPn = hidden constant // FUND-HID: @_ZTIPKn = hidden constant -// FUND-EXP: @_ZTSn = dllexport constant +// FUND-HID: @_ZTSPKn = hidden constant // FUND-EXP: @_ZTIn = dllexport constant -// FUND-EXP: @_ZTSPn = dllexport constant +// FUND-EXP: @_ZTSn = dllexport constant // FUND-EXP: @_ZTIPn = dllexport constant -// FUND-EXP: @_ZTSPKn = dllexport constant +// FUND-EXP: @_ZTSPn = dllexport constant // FUND-EXP: @_ZTIPKn = dllexport constant +// FUND-EXP: @_ZTSPKn = dllexport constant // unsigned __int128 -// FUND-DEF: @_ZTSo = constant // FUND-DEF: @_ZTIo = constant -// FUND-DEF: @_ZTSPo = constant +// FUND-DEF: @_ZTSo = constant // FUND-DEF: @_ZTIPo = constant -// FUND-DEF: @_ZTSPKo = constant +// FUND-DEF: @_ZTSPo = constant // FUND-DEF: @_ZTIPKo = constant -// FUND-HID: @_ZTSo = hidden constant +// FUND-DEF: @_ZTSPKo = constant // FUND-HID: @_ZTIo = hidden constant -// FUND-HID: @_ZTSPo = hidden constant +// FUND-HID: @_ZTSo = hidden constant // FUND-HID: @_ZTIPo = hidden constant -// FUND-HID: @_ZTSPKo = hidden constant +// FUND-HID: @_ZTSPo = hidden constant // FUND-HID: @_ZTIPKo = hidden constant -// FUND-EXP: @_ZTSo = dllexport constant +// FUND-HID: @_ZTSPKo = hidden constant // FUND-EXP: @_ZTIo = dllexport constant -// FUND-EXP: @_ZTSPo = dllexport constant +// FUND-EXP: @_ZTSo = dllexport constant // FUND-EXP: @_ZTIPo = dllexport constant -// FUND-EXP: @_ZTSPKo = dllexport constant +// FUND-EXP: @_ZTSPo = dllexport constant // FUND-EXP: @_ZTIPKo = dllexport constant +// FUND-EXP: @_ZTSPKo = dllexport constant // half -// FUND-DEF: @_ZTSDh = constant // FUND-DEF: @_ZTIDh = constant -// FUND-DEF: @_ZTSPDh = constant +// FUND-DEF: @_ZTSDh = constant // 
FUND-DEF: @_ZTIPDh = constant -// FUND-DEF: @_ZTSPKDh = constant +// FUND-DEF: @_ZTSPDh = constant // FUND-DEF: @_ZTIPKDh = constant -// FUND-HID: @_ZTSDh = hidden constant +// FUND-DEF: @_ZTSPKDh = constant // FUND-HID: @_ZTIDh = hidden constant -// FUND-HID: @_ZTSPDh = hidden constant +// FUND-HID: @_ZTSDh = hidden constant // FUND-HID: @_ZTIPDh = hidden constant -// FUND-HID: @_ZTSPKDh = hidden constant +// FUND-HID: @_ZTSPDh = hidden constant // FUND-HID: @_ZTIPKDh = hidden constant -// FUND-EXP: @_ZTSDh = dllexport constant +// FUND-HID: @_ZTSPKDh = hidden constant // FUND-EXP: @_ZTIDh = dllexport constant -// FUND-EXP: @_ZTSPDh = dllexport constant +// FUND-EXP: @_ZTSDh = dllexport constant // FUND-EXP: @_ZTIPDh = dllexport constant -// FUND-EXP: @_ZTSPKDh = dllexport constant +// FUND-EXP: @_ZTSPDh = dllexport constant // FUND-EXP: @_ZTIPKDh = dllexport constant +// FUND-EXP: @_ZTSPKDh = dllexport constant // float -// FUND-DEF: @_ZTSf = constant // FUND-DEF: @_ZTIf = constant -// FUND-DEF: @_ZTSPf = constant +// FUND-DEF: @_ZTSf = constant // FUND-DEF: @_ZTIPf = constant -// FUND-DEF: @_ZTSPKf = constant +// FUND-DEF: @_ZTSPf = constant // FUND-DEF: @_ZTIPKf = constant -// FUND-HID: @_ZTSf = hidden constant +// FUND-DEF: @_ZTSPKf = constant // FUND-HID: @_ZTIf = hidden constant -// FUND-HID: @_ZTSPf = hidden constant +// FUND-HID: @_ZTSf = hidden constant // FUND-HID: @_ZTIPf = hidden constant -// FUND-HID: @_ZTSPKf = hidden constant +// FUND-HID: @_ZTSPf = hidden constant // FUND-HID: @_ZTIPKf = hidden constant -// FUND-EXP: @_ZTSf = dllexport constant +// FUND-HID: @_ZTSPKf = hidden constant // FUND-EXP: @_ZTIf = dllexport constant -// FUND-EXP: @_ZTSPf = dllexport constant +// FUND-EXP: @_ZTSf = dllexport constant // FUND-EXP: @_ZTIPf = dllexport constant -// FUND-EXP: @_ZTSPKf = dllexport constant +// FUND-EXP: @_ZTSPf = dllexport constant // FUND-EXP: @_ZTIPKf = dllexport constant +// FUND-EXP: @_ZTSPKf = dllexport constant // double -// FUND-DEF: @_ZTSd = constant // FUND-DEF: @_ZTId = constant -// FUND-DEF: @_ZTSPd = constant +// FUND-DEF: @_ZTSd = constant // FUND-DEF: @_ZTIPd = constant -// FUND-DEF: @_ZTSPKd = constant +// FUND-DEF: @_ZTSPd = constant // FUND-DEF: @_ZTIPKd = constant -// FUND-HID: @_ZTSd = hidden constant +// FUND-DEF: @_ZTSPKd = constant // FUND-HID: @_ZTId = hidden constant -// FUND-HID: @_ZTSPd = hidden constant +// FUND-HID: @_ZTSd = hidden constant // FUND-HID: @_ZTIPd = hidden constant -// FUND-HID: @_ZTSPKd = hidden constant +// FUND-HID: @_ZTSPd = hidden constant // FUND-HID: @_ZTIPKd = hidden constant -// FUND-EXP: @_ZTSd = dllexport constant +// FUND-HID: @_ZTSPKd = hidden constant // FUND-EXP: @_ZTId = dllexport constant -// FUND-EXP: @_ZTSPd = dllexport constant +// FUND-EXP: @_ZTSd = dllexport constant // FUND-EXP: @_ZTIPd = dllexport constant -// FUND-EXP: @_ZTSPKd = dllexport constant +// FUND-EXP: @_ZTSPd = dllexport constant // FUND-EXP: @_ZTIPKd = dllexport constant +// FUND-EXP: @_ZTSPKd = dllexport constant // long double -// FUND-DEF: @_ZTSe = constant // FUND-DEF: @_ZTIe = constant -// FUND-DEF: @_ZTSPe = constant +// FUND-DEF: @_ZTSe = constant // FUND-DEF: @_ZTIPe = constant -// FUND-DEF: @_ZTSPKe = constant +// FUND-DEF: @_ZTSPe = constant // FUND-DEF: @_ZTIPKe = constant -// FUND-HID: @_ZTSe = hidden constant +// FUND-DEF: @_ZTSPKe = constant // FUND-HID: @_ZTIe = hidden constant -// FUND-HID: @_ZTSPe = hidden constant +// FUND-HID: @_ZTSe = hidden constant // FUND-HID: @_ZTIPe = hidden constant -// FUND-HID: @_ZTSPKe = hidden 
constant +// FUND-HID: @_ZTSPe = hidden constant // FUND-HID: @_ZTIPKe = hidden constant -// FUND-EXP: @_ZTSe = dllexport constant +// FUND-HID: @_ZTSPKe = hidden constant // FUND-EXP: @_ZTIe = dllexport constant -// FUND-EXP: @_ZTSPe = dllexport constant +// FUND-EXP: @_ZTSe = dllexport constant // FUND-EXP: @_ZTIPe = dllexport constant -// FUND-EXP: @_ZTSPKe = dllexport constant +// FUND-EXP: @_ZTSPe = dllexport constant // FUND-EXP: @_ZTIPKe = dllexport constant +// FUND-EXP: @_ZTSPKe = dllexport constant // __ieee128 -// FUND-DEF: @_ZTSu9__ieee128 = constant // FUND-DEF: @_ZTIu9__ieee128 = constant -// FUND-DEF: @_ZTSPu9__ieee128 = constant +// FUND-DEF: @_ZTSu9__ieee128 = constant // FUND-DEF: @_ZTIPu9__ieee128 = constant -// FUND-DEF: @_ZTSPKu9__ieee128 = constant +// FUND-DEF: @_ZTSPu9__ieee128 = constant // FUND-DEF: @_ZTIPKu9__ieee128 = constant -// FUND-HID: @_ZTSu9__ieee128 = hidden constant +// FUND-DEF: @_ZTSPKu9__ieee128 = constant // FUND-HID: @_ZTIu9__ieee128 = hidden constant -// FUND-HID: @_ZTSPu9__ieee128 = hidden constant +// FUND-HID: @_ZTSu9__ieee128 = hidden constant // FUND-HID: @_ZTIPu9__ieee128 = hidden constant -// FUND-HID: @_ZTSPKu9__ieee128 = hidden constant +// FUND-HID: @_ZTSPu9__ieee128 = hidden constant // FUND-HID: @_ZTIPKu9__ieee128 = hidden constant -// FUND-EXP: @_ZTSu9__ieee128 = dllexport constant +// FUND-HID: @_ZTSPKu9__ieee128 = hidden constant // FUND-EXP: @_ZTIu9__ieee128 = dllexport constant -// FUND-EXP: @_ZTSPu9__ieee128 = dllexport constant +// FUND-EXP: @_ZTSu9__ieee128 = dllexport constant // FUND-EXP: @_ZTIPu9__ieee128 = dllexport constant -// FUND-EXP: @_ZTSPKu9__ieee128 = dllexport constant +// FUND-EXP: @_ZTSPu9__ieee128 = dllexport constant // FUND-EXP: @_ZTIPKu9__ieee128 = dllexport constant +// FUND-EXP: @_ZTSPKu9__ieee128 = dllexport constant // char8_t -// FUND-DEF: @_ZTSDu = constant // FUND-DEF: @_ZTIDu = constant -// FUND-DEF: @_ZTSPDu = constant +// FUND-DEF: @_ZTSDu = constant // FUND-DEF: @_ZTIPDu = constant -// FUND-DEF: @_ZTSPKDu = constant +// FUND-DEF: @_ZTSPDu = constant // FUND-DEF: @_ZTIPKDu = constant -// FUND-HID: @_ZTSDu = hidden constant +// FUND-DEF: @_ZTSPKDu = constant // FUND-HID: @_ZTIDu = hidden constant -// FUND-HID: @_ZTSPDu = hidden constant +// FUND-HID: @_ZTSDu = hidden constant // FUND-HID: @_ZTIPDu = hidden constant -// FUND-HID: @_ZTSPKDu = hidden constant +// FUND-HID: @_ZTSPDu = hidden constant // FUND-HID: @_ZTIPKDu = hidden constant -// FUND-EXP: @_ZTSDu = dllexport constant +// FUND-HID: @_ZTSPKDu = hidden constant // FUND-EXP: @_ZTIDu = dllexport constant -// FUND-EXP: @_ZTSPDu = dllexport constant +// FUND-EXP: @_ZTSDu = dllexport constant // FUND-EXP: @_ZTIPDu = dllexport constant -// FUND-EXP: @_ZTSPKDu = dllexport constant +// FUND-EXP: @_ZTSPDu = dllexport constant // FUND-EXP: @_ZTIPKDu = dllexport constant +// FUND-EXP: @_ZTSPKDu = dllexport constant // char16_t -// FUND-DEF: @_ZTSDs = constant // FUND-DEF: @_ZTIDs = constant -// FUND-DEF: @_ZTSPDs = constant +// FUND-DEF: @_ZTSDs = constant // FUND-DEF: @_ZTIPDs = constant -// FUND-DEF: @_ZTSPKDs = constant +// FUND-DEF: @_ZTSPDs = constant // FUND-DEF: @_ZTIPKDs = constant -// FUND-HID: @_ZTSDs = hidden constant +// FUND-DEF: @_ZTSPKDs = constant // FUND-HID: @_ZTIDs = hidden constant -// FUND-HID: @_ZTSPDs = hidden constant +// FUND-HID: @_ZTSDs = hidden constant // FUND-HID: @_ZTIPDs = hidden constant -// FUND-HID: @_ZTSPKDs = hidden constant +// FUND-HID: @_ZTSPDs = hidden constant // FUND-HID: @_ZTIPKDs = hidden constant -// 
FUND-EXP: @_ZTSDs = dllexport constant +// FUND-HID: @_ZTSPKDs = hidden constant // FUND-EXP: @_ZTIDs = dllexport constant -// FUND-EXP: @_ZTSPDs = dllexport constant +// FUND-EXP: @_ZTSDs = dllexport constant // FUND-EXP: @_ZTIPDs = dllexport constant -// FUND-EXP: @_ZTSPKDs = dllexport constant +// FUND-EXP: @_ZTSPDs = dllexport constant // FUND-EXP: @_ZTIPKDs = dllexport constant +// FUND-EXP: @_ZTSPKDs = dllexport constant // char32_t -// FUND-DEF: @_ZTSDi = constant // FUND-DEF: @_ZTIDi = constant -// FUND-DEF: @_ZTSPDi = constant +// FUND-DEF: @_ZTSDi = constant // FUND-DEF: @_ZTIPDi = constant -// FUND-DEF: @_ZTSPKDi = constant +// FUND-DEF: @_ZTSPDi = constant // FUND-DEF: @_ZTIPKDi = constant -// FUND-HID: @_ZTSDi = hidden constant +// FUND-DEF: @_ZTSPKDi = constant // FUND-HID: @_ZTIDi = hidden constant -// FUND-HID: @_ZTSPDi = hidden constant +// FUND-HID: @_ZTSDi = hidden constant // FUND-HID: @_ZTIPDi = hidden constant -// FUND-HID: @_ZTSPKDi = hidden constant +// FUND-HID: @_ZTSPDi = hidden constant // FUND-HID: @_ZTIPKDi = hidden constant -// FUND-EXP: @_ZTSDi = dllexport constant +// FUND-HID: @_ZTSPKDi = hidden constant // FUND-EXP: @_ZTIDi = dllexport constant -// FUND-EXP: @_ZTSPDi = dllexport constant +// FUND-EXP: @_ZTSDi = dllexport constant // FUND-EXP: @_ZTIPDi = dllexport constant -// FUND-EXP: @_ZTSPKDi = dllexport constant +// FUND-EXP: @_ZTSPDi = dllexport constant // FUND-EXP: @_ZTIPKDi = dllexport constant +// FUND-EXP: @_ZTSPKDi = dllexport constant diff --git a/clang/test/CodeGenCXX/modules-vtable.cppm b/clang/test/CodeGenCXX/modules-vtable.cppm index 5cc3504d72628f216d774dc3d021f86a7a4930d1..6589b9f3c5d6491a5e8bc4cfb7d257daa7554354 100644 --- a/clang/test/CodeGenCXX/modules-vtable.cppm +++ b/clang/test/CodeGenCXX/modules-vtable.cppm @@ -40,13 +40,13 @@ inline Base::~Base() {} // CHECK: @_ZTVW3Mod4Base = unnamed_addr constant -// CHECK: @_ZTSW3Mod4Base = constant // CHECK: @_ZTIW3Mod4Base = constant +// CHECK: @_ZTSW3Mod4Base = constant // With the new Itanium C++ ABI, the linkage of vtables in modules doesn't need to be linkonce ODR. 
// CHECK-INLINE: @_ZTVW3Mod4Base = {{.*}}unnamed_addr constant -// CHECK-INLINE: @_ZTSW3Mod4Base = {{.*}}constant // CHECK-INLINE: @_ZTIW3Mod4Base = {{.*}}constant +// CHECK-INLINE: @_ZTSW3Mod4Base = {{.*}}constant module :private; int private_use() { @@ -61,12 +61,12 @@ int use() { return 43; } -// CHECK-NOT: @_ZTSW3Mod4Base // CHECK-NOT: @_ZTIW3Mod4Base +// CHECK-NOT: @_ZTSW3Mod4Base // CHECK: @_ZTVW3Mod4Base = external -// CHECK-INLINE-NOT: @_ZTSW3Mod4Base // CHECK-INLINE-NOT: @_ZTIW3Mod4Base +// CHECK-INLINE-NOT: @_ZTSW3Mod4Base // CHECK-INLINE: @_ZTVW3Mod4Base = external // Check the case that the declaration of the key function comes from another @@ -86,8 +86,8 @@ int a_use() { } // CHECK: @_ZTVW1M1C = unnamed_addr constant -// CHECK: @_ZTSW1M1C = constant // CHECK: @_ZTIW1M1C = constant +// CHECK: @_ZTSW1M1C = constant //--- M-B.cppm export module M:B; @@ -101,5 +101,5 @@ int b_use() { } // CHECK: @_ZTVW1M1C = external -// CHECK-NOT: @_ZTSW1M1C // CHECK-NOT: @_ZTIW1M1C +// CHECK-NOT: @_ZTSW1M1C diff --git a/clang/test/CodeGenCXX/ptrauth-rtti-layout.cpp b/clang/test/CodeGenCXX/ptrauth-rtti-layout.cpp index 2b633addd677e004976fdc06a46a8edb5198653a..b50e0908f9db88ee519ae80b7e2bfdc7b37d8293 100644 --- a/clang/test/CodeGenCXX/ptrauth-rtti-layout.cpp +++ b/clang/test/CodeGenCXX/ptrauth-rtti-layout.cpp @@ -5,12 +5,12 @@ struct A { int a; }; +// DARWIN: @_ZTI1A = linkonce_odr hidden constant { ptr, ptr } { ptr ptrauth (ptr getelementptr inbounds (ptr, ptr @_ZTVN10__cxxabiv117__class_type_infoE, i64 2), i32 2), ptr inttoptr (i64 add (i64 ptrtoint (ptr @_ZTS1A to i64), i64 -9223372036854775808) to ptr) } // DARWIN: @_ZTVN10__cxxabiv117__class_type_infoE = external global [0 x ptr] // DARWIN: @_ZTS1A = linkonce_odr hidden constant [3 x i8] c"1A\00" -// DARWIN: @_ZTI1A = linkonce_odr hidden constant { ptr, ptr } { ptr ptrauth (ptr getelementptr inbounds (ptr, ptr @_ZTVN10__cxxabiv117__class_type_infoE, i64 2), i32 2), ptr inttoptr (i64 add (i64 ptrtoint (ptr @_ZTS1A to i64), i64 -9223372036854775808) to ptr) } +// ELF: @_ZTI1A = linkonce_odr constant { ptr, ptr } { ptr ptrauth (ptr getelementptr inbounds (ptr, ptr @_ZTVN10__cxxabiv117__class_type_infoE, i64 2), i32 2), ptr @_ZTS1A } // ELF: @_ZTVN10__cxxabiv117__class_type_infoE = external global [0 x ptr] // ELF: @_ZTS1A = linkonce_odr constant [3 x i8] c"1A\00" -// ELF: @_ZTI1A = linkonce_odr constant { ptr, ptr } { ptr ptrauth (ptr getelementptr inbounds (ptr, ptr @_ZTVN10__cxxabiv117__class_type_infoE, i64 2), i32 2), ptr @_ZTS1A } auto ATI = typeid(A); diff --git a/clang/test/CodeGenCXX/ptrauth-type-info-vtable.cpp b/clang/test/CodeGenCXX/ptrauth-type-info-vtable.cpp index 174aeda89d1755a9720ef1f439c034a3f4018ec7..f4396e402703999c97883750ad7a8cfb883c1a17 100644 --- a/clang/test/CodeGenCXX/ptrauth-type-info-vtable.cpp +++ b/clang/test/CodeGenCXX/ptrauth-type-info-vtable.cpp @@ -60,12 +60,13 @@ static_assert(__has_feature(ptrauth_type_info_vtable_pointer_discrimination) == extern "C" int disc_std_type_info = __builtin_ptrauth_string_discriminator("_ZTVSt9type_info"); // CHECK: @_ZTV10TestStruct = unnamed_addr constant { [4 x ptr] } { [4 x ptr] [ptr null, ptr @_ZTI10TestStruct, ptr ptrauth (ptr @_ZN10TestStructD1Ev, i32 0, i64 52216, ptr getelementptr inbounds ({ [4 x ptr] }, ptr @_ZTV10TestStruct, i32 0, i32 0, i32 2)), ptr ptrauth (ptr @_ZN10TestStructD0Ev, i32 0, i64 39671, ptr getelementptr inbounds ({ [4 x ptr] }, ptr @_ZTV10TestStruct, i32 0, i32 0, i32 3))] }, align 8 -// CHECK: @_ZTVN10__cxxabiv117__class_type_infoE = external global 
[0 x ptr] -// CHECK: @_ZTS10TestStruct = constant [13 x i8] c"10TestStruct\00", align 1 // NODISC: @_ZTI10TestStruct = constant { ptr, ptr } { ptr ptrauth (ptr getelementptr inbounds (ptr, ptr @_ZTVN10__cxxabiv117__class_type_infoE, i64 2), i32 2), ptr @_ZTS10TestStruct }, align 8 -// DISC: @_ZTI10TestStruct = constant { ptr, ptr } { ptr ptrauth (ptr getelementptr inbounds (ptr, ptr @_ZTVN10__cxxabiv117__class_type_infoE, i64 2), i32 2, i64 [[STDTYPEINFO_DISC]]), ptr @_ZTS10TestStruct }, align 8 +// DISC: @_ZTI10TestStruct = constant { ptr, ptr } { ptr ptrauth (ptr getelementptr inbounds (ptr, ptr @_ZTVN10__cxxabiv117__class_type_infoE, i64 2), i32 2, i64 [[STDTYPEINFO_DISC]], ptr @_ZTI10TestStruct), ptr @_ZTS10TestStruct }, align 8 + +// CHECK: @_ZTVN10__cxxabiv117__class_type_infoE = external global [0 x ptr] +// CHECK: @_ZTS10TestStruct = constant [13 x i8] c"10TestStruct\00", align 1 struct TestStruct { virtual ~TestStruct(); diff --git a/clang/test/CodeGenCXX/ptrauth-vtable-virtual-inheritance-thunk.cpp b/clang/test/CodeGenCXX/ptrauth-vtable-virtual-inheritance-thunk.cpp index 031bb48608af7c28b1a6e406378767dfa091e945..b5c15a29eb6b95c895f388d10fefb28ed806cfd1 100644 --- a/clang/test/CodeGenCXX/ptrauth-vtable-virtual-inheritance-thunk.cpp +++ b/clang/test/CodeGenCXX/ptrauth-vtable-virtual-inheritance-thunk.cpp @@ -94,30 +94,30 @@ // CHECK-SAME: ptr ptrauth (ptr @_ZN1AD1Ev, i32 0, i64 2043, ptr getelementptr inbounds ({ [7 x ptr] }, ptr @_ZTV1A, i32 0, i32 0, i32 5)), // CHECK-SAME: ptr ptrauth (ptr @_ZN1AD0Ev, i32 0, i64 63674, ptr getelementptr inbounds ({ [7 x ptr] }, ptr @_ZTV1A, i32 0, i32 0, i32 6))] }, align 8 +// CHECK: @_ZTI1A = constant { ptr, ptr } { ptr ptrauth (ptr getelementptr inbounds (ptr, ptr @_ZTVN10__cxxabiv117__class_type_infoE, i64 2), i32 2), ptr @_ZTS1A }, align 8 + // CHECK: @_ZTVN10__cxxabiv117__class_type_infoE = external global [0 x ptr] // CHECK: @_ZTS1A = constant [3 x i8] c"1A\00", align 1 -// CHECK: @_ZTI1A = constant { ptr, ptr } { ptr ptrauth (ptr getelementptr inbounds (ptr, ptr @_ZTVN10__cxxabiv117__class_type_infoE, i64 2), i32 2), ptr @_ZTS1A }, align 8 +// CHECK: @_ZTI1C = constant { ptr, ptr, i32, i32, ptr, i64 } { ptr ptrauth (ptr getelementptr inbounds (ptr, ptr @_ZTVN10__cxxabiv121__vmi_class_type_infoE, i64 2), i32 2), ptr @_ZTS1C, i32 0, i32 1, ptr @_ZTI1B, i64 -6141 }, align 8 // CHECK: @_ZTVN10__cxxabiv121__vmi_class_type_infoE = external global [0 x ptr] // CHECK: @_ZTS1C = constant [3 x i8] c"1C\00", align 1 +// DARWIN: @_ZTI1B = linkonce_odr hidden constant { ptr, ptr, ptr } { ptr ptrauth (ptr getelementptr inbounds (ptr, ptr @_ZTVN10__cxxabiv120__si_class_type_infoE, i64 2), i32 2), ptr inttoptr (i64 add (i64 ptrtoint (ptr @_ZTS1B to i64), i64 -9223372036854775808) to ptr), ptr @_ZTI1A }, align 8 +// ELF: @_ZTI1B = linkonce_odr constant { ptr, ptr, ptr } { ptr ptrauth (ptr getelementptr inbounds (ptr, ptr @_ZTVN10__cxxabiv120__si_class_type_infoE, i64 2), i32 2), ptr @_ZTS1B, ptr @_ZTI1A }, comdat, align 8 + // CHECK: @_ZTVN10__cxxabiv120__si_class_type_infoE = external global [0 x ptr] // DARWIN: @_ZTS1B = linkonce_odr hidden constant [3 x i8] c"1B\00", align 1 // ELF: @_ZTS1B = linkonce_odr constant [3 x i8] c"1B\00", comdat, align 1 -// DARWIN: @_ZTI1B = linkonce_odr hidden constant { ptr, ptr, ptr } { ptr ptrauth (ptr getelementptr inbounds (ptr, ptr @_ZTVN10__cxxabiv120__si_class_type_infoE, i64 2), i32 2), ptr inttoptr (i64 add (i64 ptrtoint (ptr @_ZTS1B to i64), i64 -9223372036854775808) to ptr), ptr @_ZTI1A }, align 8 -// ELF: 
@_ZTI1B = linkonce_odr constant { ptr, ptr, ptr } { ptr ptrauth (ptr getelementptr inbounds (ptr, ptr @_ZTVN10__cxxabiv120__si_class_type_infoE, i64 2), i32 2), ptr @_ZTS1B, ptr @_ZTI1A }, comdat, align 8 - -// CHECK: @_ZTI1C = constant { ptr, ptr, i32, i32, ptr, i64 } { ptr ptrauth (ptr getelementptr inbounds (ptr, ptr @_ZTVN10__cxxabiv121__vmi_class_type_infoE, i64 2), i32 2), ptr @_ZTS1C, i32 0, i32 1, ptr @_ZTI1B, i64 -6141 }, align 8 +// CHECK: @_ZTI1D = constant { ptr, ptr, i32, i32, ptr, i64 } { ptr ptrauth (ptr getelementptr inbounds (ptr, ptr @_ZTVN10__cxxabiv121__vmi_class_type_infoE, i64 2), i32 2), ptr @_ZTS1D, i32 0, i32 1, ptr @_ZTI1B, i64 -6141 }, align 8 // CHECK: @_ZTS1D = constant [3 x i8] c"1D\00", align 1 -// CHECK: @_ZTI1D = constant { ptr, ptr, i32, i32, ptr, i64 } { ptr ptrauth (ptr getelementptr inbounds (ptr, ptr @_ZTVN10__cxxabiv121__vmi_class_type_infoE, i64 2), i32 2), ptr @_ZTS1D, i32 0, i32 1, ptr @_ZTI1B, i64 -6141 }, align 8 - // CHECK: @_ZTV1E = unnamed_addr constant { [7 x ptr] } { [7 x ptr] [ptr null, ptr @_ZTI1E, // CHECK-SAME: ptr ptrauth (ptr @_ZN1E1fEv, i32 0, i64 28408, ptr getelementptr inbounds ({ [7 x ptr] }, ptr @_ZTV1E, i32 0, i32 0, i32 2)), // CHECK-SAME: ptr ptrauth (ptr @_ZN1E1gEv, i32 0, i64 22926, ptr getelementptr inbounds ({ [7 x ptr] }, ptr @_ZTV1E, i32 0, i32 0, i32 3)), @@ -125,10 +125,10 @@ // CHECK-SAME: ptr ptrauth (ptr @_ZN1ED1Ev, i32 0, i64 5817, ptr getelementptr inbounds ({ [7 x ptr] }, ptr @_ZTV1E, i32 0, i32 0, i32 5)), // CHECK-SAME: ptr ptrauth (ptr @_ZN1ED0Ev, i32 0, i64 26464, ptr getelementptr inbounds ({ [7 x ptr] }, ptr @_ZTV1E, i32 0, i32 0, i32 6))] }, align 8 -// CHECK: @_ZTS1E = constant [3 x i8] c"1E\00", align 1 - // CHECK: @_ZTI1E = constant { ptr, ptr } { ptr ptrauth (ptr getelementptr inbounds (ptr, ptr @_ZTVN10__cxxabiv117__class_type_infoE, i64 2), i32 2), ptr @_ZTS1E }, align 8 +// CHECK: @_ZTS1E = constant [3 x i8] c"1E\00", align 1 + // CHECK: @_ZTC1F0_1C = unnamed_addr constant { [5 x ptr], [11 x ptr] } { [5 x ptr] [ptr inttoptr (i64 16 to ptr), ptr null, ptr @_ZTI1C, // CHECK-SAME: ptr ptrauth (ptr @_ZN1CD1Ev, i32 0, i64 31214, ptr getelementptr inbounds ({ [5 x ptr], [11 x ptr] }, ptr @_ZTC1F0_1C, i32 0, i32 0, i32 3)), // CHECK-SAME: ptr ptrauth (ptr @_ZN1CD0Ev, i32 0, i64 8507, ptr getelementptr inbounds ({ [5 x ptr], [11 x ptr] }, ptr @_ZTC1F0_1C, i32 0, i32 0, i32 4))], [11 x ptr] [ptr inttoptr (i64 -16 to ptr), ptr null, ptr null, ptr null, ptr inttoptr (i64 -16 to ptr), ptr @_ZTI1C, @@ -149,10 +149,10 @@ // CHECK-SAME: ptr ptrauth (ptr @_ZTv0_n48_N1DD1Ev, i32 0, i64 2043, ptr getelementptr inbounds ({ [7 x ptr], [11 x ptr] }, ptr @_ZTC1F8_1D, i32 0, i32 1, i32 9)), // CHECK-SAME: ptr ptrauth (ptr @_ZTv0_n48_N1DD0Ev, i32 0, i64 63674, ptr getelementptr inbounds ({ [7 x ptr], [11 x ptr] }, ptr @_ZTC1F8_1D, i32 0, i32 1, i32 10))] }, align 8 -// CHECK: @_ZTS1F = constant [3 x i8] c"1F\00", align 1 - // CHECK: @_ZTI1F = constant { ptr, ptr, i32, i32, ptr, i64, ptr, i64, ptr, i64 } { ptr ptrauth (ptr getelementptr inbounds (ptr, ptr @_ZTVN10__cxxabiv121__vmi_class_type_infoE, i64 2), i32 2), ptr @_ZTS1F, i32 3, i32 3, ptr @_ZTI1C, i64 2, ptr @_ZTI1D, i64 2050, ptr @_ZTI1E, i64 -8189 }, align 8 +// CHECK: @_ZTS1F = constant [3 x i8] c"1F\00", align 1 + // CHECK: @_ZTC1G0_1C = unnamed_addr constant { [5 x ptr], [11 x ptr] } { [5 x ptr] [ptr inttoptr (i64 24 to ptr), ptr null, ptr @_ZTI1C, // CHECK-SAME: ptr ptrauth (ptr @_ZN1CD1Ev, i32 0, i64 31214, ptr getelementptr inbounds ({ [5 x ptr], [11 x ptr] }, 
ptr @_ZTC1G0_1C, i32 0, i32 0, i32 3)), // CHECK-SAME: ptr ptrauth (ptr @_ZN1CD0Ev, i32 0, i64 8507, ptr getelementptr inbounds ({ [5 x ptr], [11 x ptr] }, ptr @_ZTC1G0_1C, i32 0, i32 0, i32 4))], [11 x ptr] [ptr inttoptr (i64 -24 to ptr), ptr null, ptr null, ptr null, ptr inttoptr (i64 -24 to ptr), ptr @_ZTI1C, @@ -173,10 +173,10 @@ // CHECK-SAME: ptr ptrauth (ptr @_ZTv0_n48_N1DD1Ev, i32 0, i64 2043, ptr getelementptr inbounds ({ [7 x ptr], [11 x ptr] }, ptr @_ZTC1G8_1D, i32 0, i32 1, i32 9)), // CHECK-SAME: ptr ptrauth (ptr @_ZTv0_n48_N1DD0Ev, i32 0, i64 63674, ptr getelementptr inbounds ({ [7 x ptr], [11 x ptr] }, ptr @_ZTC1G8_1D, i32 0, i32 1, i32 10))] }, align 8 -// CHECK: @_ZTS1G = constant [3 x i8] c"1G\00", align 1 - // CHECK: @_ZTI1G = constant { ptr, ptr, i32, i32, ptr, i64, ptr, i64, ptr, i64 } { ptr ptrauth (ptr getelementptr inbounds (ptr, ptr @_ZTVN10__cxxabiv121__vmi_class_type_infoE, i64 2), i32 2), ptr @_ZTS1G, i32 3, i32 3, ptr @_ZTI1E, i64 -8189, ptr @_ZTI1C, i64 2, ptr @_ZTI1D, i64 2050 }, align 8 +// CHECK: @_ZTS1G = constant [3 x i8] c"1G\00", align 1 + // CHECK: @_ZTV1B = linkonce_odr unnamed_addr constant { [7 x ptr] } { [7 x ptr] [ptr null, ptr @_ZTI1B, // CHECK-SAME: ptr ptrauth (ptr @_ZN1A1fEv, i32 0, i64 55636, ptr getelementptr inbounds ({ [7 x ptr] }, ptr @_ZTV1B, i32 0, i32 0, i32 2)), // CHECK-SAME: ptr ptrauth (ptr @_ZN1A1gEv, i32 0, i64 19402, ptr getelementptr inbounds ({ [7 x ptr] }, ptr @_ZTV1B, i32 0, i32 0, i32 3)), diff --git a/clang/test/CodeGenCXX/rtti-linkage.cpp b/clang/test/CodeGenCXX/rtti-linkage.cpp index ca50a1bc6f01a719ff24d072457f6f320c1eea65..03e7cdedd3462ce64d21124ce2797005ba54b59e 100644 --- a/clang/test/CodeGenCXX/rtti-linkage.cpp +++ b/clang/test/CodeGenCXX/rtti-linkage.cpp @@ -3,73 +3,73 @@ #include +// CHECK-BOTH: _ZTIP1C = internal constant // CHECK-BOTH: _ZTSP1C = internal constant -// CHECK-BOTH: _ZTS1C = internal constant // CHECK-BOTH: _ZTI1C = internal constant -// CHECK-BOTH: _ZTIP1C = internal constant -// CHECK-BOTH: _ZTSPP1C = internal constant +// CHECK-BOTH: _ZTS1C = internal constant // CHECK-BOTH: _ZTIPP1C = internal constant -// CHECK-BOTH: _ZTSM1Ci = internal constant +// CHECK-BOTH: _ZTSPP1C = internal constant // CHECK-BOTH: _ZTIM1Ci = internal constant -// CHECK-BOTH: _ZTSPM1Ci = internal constant +// CHECK-BOTH: _ZTSM1Ci = internal constant // CHECK-BOTH: _ZTIPM1Ci = internal constant -// CHECK-BOTH: _ZTSM1CS_ = internal constant +// CHECK-BOTH: _ZTSPM1Ci = internal constant // CHECK-BOTH: _ZTIM1CS_ = internal constant -// CHECK-BOTH: _ZTSM1CPS_ = internal constant +// CHECK-BOTH: _ZTSM1CS_ = internal constant // CHECK-BOTH: _ZTIM1CPS_ = internal constant +// CHECK-BOTH: _ZTSM1CPS_ = internal constant +// CHECK-BOTH: _ZTIM1A1C = internal constant // CHECK-BOTH: _ZTSM1A1C = internal constant -// CHECK: _ZTS1A = linkonce_odr constant -// CHECK-WITH-HIDDEN: _ZTS1A = linkonce_odr hidden constant // CHECK: _ZTI1A = linkonce_odr constant // CHECK-WITH-HIDDEN: _ZTI1A = linkonce_odr hidden constant -// CHECK-BOTH: _ZTIM1A1C = internal constant -// CHECK-BOTH: _ZTSM1AP1C = internal constant +// CHECK: _ZTS1A = linkonce_odr constant +// CHECK-WITH-HIDDEN: _ZTS1A = linkonce_odr hidden constant // CHECK-BOTH: _ZTIM1AP1C = internal constant +// CHECK-BOTH: _ZTSM1AP1C = internal constant // CHECK-WITH-HIDDEN: _ZTSFN12_GLOBAL__N_11DEvE = internal constant -// CHECK-WITH-HIDDEN: @_ZTSPK2T4 = linkonce_odr hidden constant -// CHECK-WITH-HIDDEN: @_ZTS2T4 = linkonce_odr hidden constant -// CHECK-WITH-HIDDEN: @_ZTI2T4 = linkonce_odr 
hidden constant -// CHECK-WITH-HIDDEN: @_ZTIPK2T4 = linkonce_odr hidden constant -// CHECK-WITH-HIDDEN: @_ZTSZ2t5vE1A = internal constant +// CHECK-WITH-HIDDEN: @_ZTIPK2T4 = linkonce_odr hidden constant +// CHECK-WITH-HIDDEN: @_ZTSPK2T4 = linkonce_odr hidden constant +// CHECK-WITH-HIDDEN: @_ZTI2T4 = linkonce_odr hidden constant +// CHECK-WITH-HIDDEN: @_ZTS2T4 = linkonce_odr hidden constant // CHECK-WITH-HIDDEN: @_ZTIZ2t5vE1A = internal constant -// CHECK-WITH-HIDDEN: @_ZTSZ2t6vE1A = linkonce_odr hidden constant +// CHECK-WITH-HIDDEN: @_ZTSZ2t5vE1A = internal constant // CHECK-WITH-HIDDEN: @_ZTIZ2t6vE1A = linkonce_odr hidden constant +// CHECK-WITH-HIDDEN: @_ZTSZ2t6vE1A = linkonce_odr hidden constant +// CHECK-WITH-HIDDEN: @_ZTIPZ2t7vE1A = linkonce_odr hidden constant // CHECK-WITH-HIDDEN: @_ZTSPZ2t7vE1A = linkonce_odr hidden constant -// CHECK-WITH-HIDDEN: @_ZTSZ2t7vE1A = linkonce_odr hidden constant // CHECK-WITH-HIDDEN: @_ZTIZ2t7vE1A = linkonce_odr hidden constant -// CHECK-WITH-HIDDEN: @_ZTIPZ2t7vE1A = linkonce_odr hidden constant +// CHECK-WITH-HIDDEN: @_ZTSZ2t7vE1A = linkonce_odr hidden constant -// CHECK: _ZTSN12_GLOBAL__N_11DE = internal constant // CHECK: _ZTIN12_GLOBAL__N_11DE = internal constant -// CHECK: _ZTSPN12_GLOBAL__N_11DE = internal constant +// CHECK: _ZTSN12_GLOBAL__N_11DE = internal constant // CHECK: _ZTIPN12_GLOBAL__N_11DE = internal constant -// CHECK: _ZTSFN12_GLOBAL__N_11DEvE = internal constant +// CHECK: _ZTSPN12_GLOBAL__N_11DE = internal constant // CHECK: _ZTIFN12_GLOBAL__N_11DEvE = internal constant -// CHECK: _ZTSFvN12_GLOBAL__N_11DEE = internal constant +// CHECK: _ZTSFN12_GLOBAL__N_11DEvE = internal constant // CHECK: _ZTIFvN12_GLOBAL__N_11DEE = internal constant +// CHECK: _ZTSFvN12_GLOBAL__N_11DEE = internal constant +// CHECK: _ZTIPFvvE = linkonce_odr constant // CHECK: _ZTSPFvvE = linkonce_odr constant -// CHECK: _ZTSFvvE = linkonce_odr constant // CHECK: _ZTIFvvE = linkonce_odr constant -// CHECK: _ZTIPFvvE = linkonce_odr constant -// CHECK: _ZTSN12_GLOBAL__N_11EE = internal constant +// CHECK: _ZTSFvvE = linkonce_odr constant // CHECK: _ZTIN12_GLOBAL__N_11EE = internal constant -// CHECK: _ZTSA10_i = linkonce_odr constant +// CHECK: _ZTSN12_GLOBAL__N_11EE = internal constant // CHECK: _ZTIA10_i = linkonce_odr constant +// CHECK: _ZTSA10_i = linkonce_odr constant // CHECK: _ZTI1TILj0EE = linkonce_odr constant // CHECK: _ZTI1TILj1EE = weak_odr constant // CHECK: _ZTI1TILj2EE = external constant -// CHECK: _ZTSZ2t5vE1A = internal constant // CHECK: _ZTIZ2t5vE1A = internal constant -// CHECK: _ZTS1B ={{.*}} constant +// CHECK: _ZTSZ2t5vE1A = internal constant // CHECK: _ZTI1B ={{.*}} constant +// CHECK: _ZTS1B ={{.*}} constant // CHECK: _ZTS1F = linkonce_odr constant -// CHECK: _ZTSZ2t6vE1A = linkonce_odr constant // CHECK: _ZTIZ2t6vE1A = linkonce_odr constant +// CHECK: _ZTSZ2t6vE1A = linkonce_odr constant +// CHECK: _ZTIPZ2t7vE1A = linkonce_odr constant // CHECK: _ZTSPZ2t7vE1A = linkonce_odr constant -// CHECK: _ZTSZ2t7vE1A = linkonce_odr constant // CHECK: _ZTIZ2t7vE1A = linkonce_odr constant -// CHECK: _ZTIPZ2t7vE1A = linkonce_odr constant +// CHECK: _ZTSZ2t7vE1A = linkonce_odr constant // CHECK: _ZTIN12_GLOBAL__N_11DE diff --git a/clang/test/CodeGenCXX/rtti-visibility.cpp b/clang/test/CodeGenCXX/rtti-visibility.cpp index 5945be5c73a26064c9300b06ffb704b7254bd546..1813fee658c72e242d6e6dc82c2f18da23c00837 100644 --- a/clang/test/CodeGenCXX/rtti-visibility.cpp +++ b/clang/test/CodeGenCXX/rtti-visibility.cpp @@ -6,10 +6,10 @@ namespace Test1 { // A is 
explicitly marked hidden, so all RTTI data should also be marked hidden. - // CHECK-TEST1: @_ZTSN5Test11AE = linkonce_odr hidden constant // CHECK-TEST1: @_ZTIN5Test11AE = linkonce_odr hidden constant - // CHECK-TEST1: @_ZTSPN5Test11AE = linkonce_odr hidden constant + // CHECK-TEST1: @_ZTSN5Test11AE = linkonce_odr hidden constant // CHECK-TEST1: @_ZTIPN5Test11AE = linkonce_odr hidden constant + // CHECK-TEST1: @_ZTSPN5Test11AE = linkonce_odr hidden constant struct __attribute__((visibility("hidden"))) A { }; void f() { @@ -20,8 +20,8 @@ namespace Test1 { namespace Test2 { // A is weak, so its linkage should be linkonce_odr, but not marked hidden. - // CHECK-TEST2: @_ZTSN5Test21AE = linkonce_odr constant // CHECK-TEST2: @_ZTIN5Test21AE = linkonce_odr constant + // CHECK-TEST2: @_ZTSN5Test21AE = linkonce_odr constant struct A { }; void f() { (void)typeid(A); } diff --git a/clang/test/CodeGenCXX/symbol-partition.cpp b/clang/test/CodeGenCXX/symbol-partition.cpp index ecc58e2a847dc137927ae5a391b32b88dac89545..cefeeac63f0147aaa0b68a3c83b8b989073eb2bc 100644 --- a/clang/test/CodeGenCXX/symbol-partition.cpp +++ b/clang/test/CodeGenCXX/symbol-partition.cpp @@ -2,8 +2,8 @@ // CHECK: @gv = {{.*}}, partition "foo" // CHECK: @_ZTV1S = {{.*}}, partition "foo" -// CHECK: @_ZTS1S = {{.*}}, partition "foo" // CHECK: @_ZTI1S = {{.*}}, partition "foo" +// CHECK: @_ZTS1S = {{.*}}, partition "foo" // CHECK: @_Z5ifuncv = {{.*}}, partition "foo" diff --git a/clang/test/CodeGenCXX/type_visibility.cpp b/clang/test/CodeGenCXX/type_visibility.cpp index 13aafcff0fa13e4fcae76d95a3633f80e3747c33..00833e36944df261839eb25f6e3cfe6edb938b10 100644 --- a/clang/test/CodeGenCXX/type_visibility.cpp +++ b/clang/test/CodeGenCXX/type_visibility.cpp @@ -26,12 +26,12 @@ namespace temp0 { template struct B; // FUNS-LABEL: define weak_odr void @_ZN5temp01BINS_1AEE3fooEv( // VARS: @_ZTVN5temp01BINS_1AEEE = weak_odr unnamed_addr constant - // VARS: @_ZTSN5temp01BINS_1AEEE = weak_odr constant // VARS: @_ZTIN5temp01BINS_1AEEE = weak_odr constant + // VARS: @_ZTSN5temp01BINS_1AEEE = weak_odr constant // FUNS-HIDDEN-LABEL: define weak_odr hidden void @_ZN5temp01BINS_1AEE3fooEv( // VARS-HIDDEN: @_ZTVN5temp01BINS_1AEEE = weak_odr hidden unnamed_addr constant - // VARS-HIDDEN: @_ZTSN5temp01BINS_1AEEE = weak_odr hidden constant // VARS-HIDDEN: @_ZTIN5temp01BINS_1AEEE = weak_odr hidden constant + // VARS-HIDDEN: @_ZTSN5temp01BINS_1AEEE = weak_odr hidden constant } namespace temp1 { @@ -43,12 +43,12 @@ namespace temp1 { template struct B; // FUNS-LABEL: define weak_odr void @_ZN5temp11BINS_1AEE3fooEv( // VARS: @_ZTVN5temp11BINS_1AEEE = weak_odr unnamed_addr constant - // VARS: @_ZTSN5temp11BINS_1AEEE = weak_odr constant // VARS: @_ZTIN5temp11BINS_1AEEE = weak_odr constant + // VARS: @_ZTSN5temp11BINS_1AEEE = weak_odr constant // FUNS-HIDDEN-LABEL: define weak_odr hidden void @_ZN5temp11BINS_1AEE3fooEv( // VARS-HIDDEN: @_ZTVN5temp11BINS_1AEEE = weak_odr unnamed_addr constant - // VARS-HIDDEN: @_ZTSN5temp11BINS_1AEEE = weak_odr constant // VARS-HIDDEN: @_ZTIN5temp11BINS_1AEEE = weak_odr constant + // VARS-HIDDEN: @_ZTSN5temp11BINS_1AEEE = weak_odr constant } namespace temp2 { @@ -60,12 +60,12 @@ namespace temp2 { template struct B; // FUNS-LABEL: define weak_odr void @_ZN5temp21BINS_1AEE3fooEv( // VARS: @_ZTVN5temp21BINS_1AEEE = weak_odr unnamed_addr constant - // VARS: @_ZTSN5temp21BINS_1AEEE = weak_odr constant // VARS: @_ZTIN5temp21BINS_1AEEE = weak_odr constant + // VARS: @_ZTSN5temp21BINS_1AEEE = weak_odr constant // FUNS-HIDDEN-LABEL: define 
weak_odr hidden void @_ZN5temp21BINS_1AEE3fooEv( // VARS-HIDDEN: @_ZTVN5temp21BINS_1AEEE = weak_odr hidden unnamed_addr constant - // VARS-HIDDEN: @_ZTSN5temp21BINS_1AEEE = weak_odr hidden constant // VARS-HIDDEN: @_ZTIN5temp21BINS_1AEEE = weak_odr hidden constant + // VARS-HIDDEN: @_ZTSN5temp21BINS_1AEEE = weak_odr hidden constant } namespace temp3 { @@ -77,12 +77,12 @@ namespace temp3 { template struct B; // FUNS-LABEL: define weak_odr hidden void @_ZN5temp31BINS_1AEE3fooEv( // VARS: @_ZTVN5temp31BINS_1AEEE = weak_odr hidden unnamed_addr constant - // VARS: @_ZTSN5temp31BINS_1AEEE = weak_odr hidden constant // VARS: @_ZTIN5temp31BINS_1AEEE = weak_odr hidden constant + // VARS: @_ZTSN5temp31BINS_1AEEE = weak_odr hidden constant // FUNS-HIDDEN-LABEL: define weak_odr hidden void @_ZN5temp31BINS_1AEE3fooEv( // VARS-HIDDEN: @_ZTVN5temp31BINS_1AEEE = weak_odr hidden unnamed_addr constant - // VARS-HIDDEN: @_ZTSN5temp31BINS_1AEEE = weak_odr hidden constant // VARS-HIDDEN: @_ZTIN5temp31BINS_1AEEE = weak_odr hidden constant + // VARS-HIDDEN: @_ZTSN5temp31BINS_1AEEE = weak_odr hidden constant } namespace temp4 { @@ -94,12 +94,12 @@ namespace temp4 { template struct B; // FUNS-LABEL: define weak_odr void @_ZN5temp41BINS_1AEE3fooEv( // VARS: @_ZTVN5temp41BINS_1AEEE = weak_odr hidden unnamed_addr constant - // VARS: @_ZTSN5temp41BINS_1AEEE = weak_odr hidden constant // VARS: @_ZTIN5temp41BINS_1AEEE = weak_odr hidden constant + // VARS: @_ZTSN5temp41BINS_1AEEE = weak_odr hidden constant // FUNS-HIDDEN-LABEL: define weak_odr hidden void @_ZN5temp41BINS_1AEE3fooEv( // VARS-HIDDEN: @_ZTVN5temp41BINS_1AEEE = weak_odr hidden unnamed_addr constant - // VARS-HIDDEN: @_ZTSN5temp41BINS_1AEEE = weak_odr hidden constant // VARS-HIDDEN: @_ZTIN5temp41BINS_1AEEE = weak_odr hidden constant + // VARS-HIDDEN: @_ZTSN5temp41BINS_1AEEE = weak_odr hidden constant } namespace type0 { @@ -110,12 +110,12 @@ namespace type0 { void A::foo() {} // FUNS-LABEL: define void @_ZN5type01A3fooEv( // VARS: @_ZTVN5type01AE = unnamed_addr constant - // VARS: @_ZTSN5type01AE = constant // VARS: @_ZTIN5type01AE = constant + // VARS: @_ZTSN5type01AE = constant // FUNS-HIDDEN-LABEL: define hidden void @_ZN5type01A3fooEv( // VARS-HIDDEN: @_ZTVN5type01AE = unnamed_addr constant - // VARS-HIDDEN: @_ZTSN5type01AE = constant // VARS-HIDDEN: @_ZTIN5type01AE = constant + // VARS-HIDDEN: @_ZTSN5type01AE = constant } namespace type1 { @@ -126,12 +126,12 @@ namespace type1 { void A::foo() {} // FUNS-LABEL: define hidden void @_ZN5type11A3fooEv( // VARS: @_ZTVN5type11AE = unnamed_addr constant - // VARS: @_ZTSN5type11AE = constant // VARS: @_ZTIN5type11AE = constant + // VARS: @_ZTSN5type11AE = constant // FUNS-HIDDEN-LABEL: define hidden void @_ZN5type11A3fooEv( // VARS-HIDDEN: @_ZTVN5type11AE = unnamed_addr constant - // VARS-HIDDEN: @_ZTSN5type11AE = constant // VARS-HIDDEN: @_ZTIN5type11AE = constant + // VARS-HIDDEN: @_ZTSN5type11AE = constant } namespace type2 { @@ -142,12 +142,12 @@ namespace type2 { void A::foo() {} // FUNS-LABEL: define void @_ZN5type21A3fooEv( // VARS: @_ZTVN5type21AE = hidden unnamed_addr constant - // VARS: @_ZTSN5type21AE = hidden constant // VARS: @_ZTIN5type21AE = hidden constant + // VARS: @_ZTSN5type21AE = hidden constant // FUNS-HIDDEN-LABEL: define hidden void @_ZN5type21A3fooEv( // VARS-HIDDEN: @_ZTVN5type21AE = hidden unnamed_addr constant - // VARS-HIDDEN: @_ZTSN5type21AE = hidden constant // VARS-HIDDEN: @_ZTIN5type21AE = hidden constant + // VARS-HIDDEN: @_ZTSN5type21AE = hidden constant } namespace type3 { @@ 
-158,11 +158,11 @@ namespace type3 { void A::foo() {} // FUNS-LABEL: define void @_ZN5type31A3fooEv( // VARS: @_ZTVN5type31AE = hidden unnamed_addr constant - // VARS: @_ZTSN5type31AE = hidden constant // VARS: @_ZTIN5type31AE = hidden constant + // VARS: @_ZTSN5type31AE = hidden constant // FUNS-HIDDEN-LABEL: define void @_ZN5type31A3fooEv( // VARS-HIDDEN: @_ZTVN5type31AE = hidden unnamed_addr constant - // VARS-HIDDEN: @_ZTSN5type31AE = hidden constant // VARS-HIDDEN: @_ZTIN5type31AE = hidden constant + // VARS-HIDDEN: @_ZTSN5type31AE = hidden constant } diff --git a/clang/test/CodeGenCXX/typeinfo-with-address-space.cpp b/clang/test/CodeGenCXX/typeinfo-with-address-space.cpp index 60eb8f17f91fd125ab662ed3464deab98646a514..68eb5cb4864765dbf0ace10389627ec7c18fe494 100644 --- a/clang/test/CodeGenCXX/typeinfo-with-address-space.cpp +++ b/clang/test/CodeGenCXX/typeinfo-with-address-space.cpp @@ -15,12 +15,12 @@ class B : A { // NO-AS: @_ZTISt9type_info = external constant ptr // AS: @_ZTIi = external addrspace(1) constant ptr addrspace(1) // NO-AS: @_ZTIi = external constant ptr +// AS: @_ZTI1A = linkonce_odr addrspace(1) constant { ptr addrspace(1), ptr addrspace(1) } { ptr addrspace(1) getelementptr inbounds (ptr addrspace(1), ptr addrspace(1) @_ZTVN10__cxxabiv117__class_type_infoE, i64 2), ptr addrspace(1) @_ZTS1A }, comdat, align 8 +// NO-AS: @_ZTI1A = linkonce_odr constant { ptr, ptr } { ptr getelementptr inbounds (ptr, ptr @_ZTVN10__cxxabiv117__class_type_infoE, i64 2), ptr @_ZTS1A }, comdat, align 8 // AS: @_ZTVN10__cxxabiv117__class_type_infoE = external addrspace(1) global [0 x ptr addrspace(1)] // NO-AS: @_ZTVN10__cxxabiv117__class_type_infoE = external global [0 x ptr] // AS: @_ZTS1A = linkonce_odr addrspace(1) constant [3 x i8] c"1A\00", comdat, align 1 // NO-AS: @_ZTS1A = linkonce_odr constant [3 x i8] c"1A\00", comdat, align 1 -// AS: @_ZTI1A = linkonce_odr addrspace(1) constant { ptr addrspace(1), ptr addrspace(1) } { ptr addrspace(1) getelementptr inbounds (ptr addrspace(1), ptr addrspace(1) @_ZTVN10__cxxabiv117__class_type_infoE, i64 2), ptr addrspace(1) @_ZTS1A }, comdat, align 8 -// NO-AS: @_ZTI1A = linkonce_odr constant { ptr, ptr } { ptr getelementptr inbounds (ptr, ptr @_ZTVN10__cxxabiv117__class_type_infoE, i64 2), ptr @_ZTS1A }, comdat, align 8 // AS: @_ZTIf = external addrspace(1) constant ptr addrspace(1) // NO-AS: @_ZTIf = external constant ptr diff --git a/clang/test/CodeGenCXX/visibility-ms-compat.cpp b/clang/test/CodeGenCXX/visibility-ms-compat.cpp index 525691358832f8781fc910c9e38f489e8b267bdb..0344803909cd44849641bda18f235da6bf761c17 100644 --- a/clang/test/CodeGenCXX/visibility-ms-compat.cpp +++ b/clang/test/CodeGenCXX/visibility-ms-compat.cpp @@ -24,8 +24,8 @@ namespace test0 { // CHECK: declare void @_ZN5test01A3barEv() const std::type_info &ti = typeid(A); - // CHECK-GLOBAL: @_ZTSN5test01AE = linkonce_odr constant // CHECK-GLOBAL: @_ZTIN5test01AE = linkonce_odr constant + // CHECK-GLOBAL: @_ZTSN5test01AE = linkonce_odr constant // CHECK-GLOBAL: @_ZN5test02tiE = hidden constant } @@ -40,8 +40,8 @@ namespace test1 { // CHECK: declare hidden void @_ZN5test11A3barEv() const std::type_info &ti = typeid(A); - // CHECK-GLOBAL: @_ZTSN5test11AE = linkonce_odr hidden constant // CHECK-GLOBAL: @_ZTIN5test11AE = linkonce_odr hidden constant + // CHECK-GLOBAL: @_ZTSN5test11AE = linkonce_odr hidden constant // CHECK-GLOBAL: @_ZN5test12tiE = hidden constant } @@ -56,8 +56,8 @@ namespace test2 { // CHECK: declare void @_ZN5test21A3barEv() const std::type_info &ti = 
typeid(A); - // CHECK-GLOBAL: @_ZTSN5test21AE = linkonce_odr constant // CHECK-GLOBAL: @_ZTIN5test21AE = linkonce_odr constant + // CHECK-GLOBAL: @_ZTSN5test21AE = linkonce_odr constant // CHECK-GLOBAL: @_ZN5test22tiE = hidden constant } @@ -73,8 +73,8 @@ namespace test3 { // CHECK: declare void @_ZN5test31BINS_1AEE3barEv() const std::type_info &ti = typeid(B); - // CHECK-GLOBAL: @_ZTSN5test31BINS_1AEEE = linkonce_odr constant // CHECK-GLOBAL: @_ZTIN5test31BINS_1AEEE = linkonce_odr constant + // CHECK-GLOBAL: @_ZTSN5test31BINS_1AEEE = linkonce_odr constant } namespace test4 { @@ -89,8 +89,8 @@ namespace test4 { // CHECK: declare void @_ZN5test41BINS_1AEE3barEv() const std::type_info &ti = typeid(B); - // CHECK-GLOBAL: @_ZTSN5test41BINS_1AEEE = linkonce_odr constant // CHECK-GLOBAL: @_ZTIN5test41BINS_1AEEE = linkonce_odr constant + // CHECK-GLOBAL: @_ZTSN5test41BINS_1AEEE = linkonce_odr constant } namespace test5 { @@ -105,6 +105,6 @@ namespace test5 { // CHECK: declare hidden void @_ZN5test51BINS_1AEE3barEv() const std::type_info &ti = typeid(B); - // CHECK-GLOBAL: @_ZTSN5test51BINS_1AEEE = linkonce_odr hidden constant // CHECK-GLOBAL: @_ZTIN5test51BINS_1AEEE = linkonce_odr hidden constant + // CHECK-GLOBAL: @_ZTSN5test51BINS_1AEEE = linkonce_odr hidden constant } diff --git a/clang/test/CodeGenCXX/vtable-align-address-space.cpp b/clang/test/CodeGenCXX/vtable-align-address-space.cpp index 5eac0bd75dc5efa1049da61e79ccd9c84dee5aff..5eccf0a0d77d829539e741aff176ab53ae3b215b 100644 --- a/clang/test/CodeGenCXX/vtable-align-address-space.cpp +++ b/clang/test/CodeGenCXX/vtable-align-address-space.cpp @@ -9,5 +9,5 @@ struct A { void A::f() {} // CHECK: @_ZTV1A ={{.*}} unnamed_addr addrspace(1) constant { [5 x ptr addrspace(1)] } { [5 x ptr addrspace(1)] [ptr addrspace(1) null, ptr addrspace(1) @_ZTI1A, ptr addrspace(1) addrspacecast (ptr @_ZN1A1fEv to ptr addrspace(1)), ptr addrspace(1) addrspacecast (ptr @_ZN1A1gEv to ptr addrspace(1)), ptr addrspace(1) addrspacecast (ptr @_ZN1A1hEv to ptr addrspace(1))] -// CHECK: @_ZTS1A ={{.*}} constant [3 x i8] c"1A\00", align 1 // CHECK: @_ZTI1A ={{.*}} addrspace(1) constant { ptr addrspace(1), ptr addrspace(1) } { ptr addrspace(1) getelementptr inbounds (ptr addrspace(1), ptr addrspace(1) @_ZTVN10__cxxabiv117__class_type_infoE, i64 2), ptr addrspace(1) @_ZTS1A }, align 8 +// CHECK: @_ZTS1A ={{.*}} constant [3 x i8] c"1A\00", align 1 diff --git a/clang/test/CodeGenCXX/vtable-align.cpp b/clang/test/CodeGenCXX/vtable-align.cpp index fb8ff1a582ec8301aee6d811ddd093b4307ef640..f1d5e09b9730b211dd8fae9fc438e2c7f668ccff 100644 --- a/clang/test/CodeGenCXX/vtable-align.cpp +++ b/clang/test/CodeGenCXX/vtable-align.cpp @@ -10,8 +10,8 @@ struct A { void A::f() {} // CHECK-32: @_ZTV1A ={{.*}} unnamed_addr constant { [5 x ptr] } { [5 x ptr] [ptr null, ptr @_ZTI1A, ptr @_ZN1A1fEv, ptr @_ZN1A1gEv, ptr @_ZN1A1hEv] }, align 4 -// CHECK-32: @_ZTS1A ={{.*}} constant [3 x i8] c"1A\00", align 1 // CHECK-32: @_ZTI1A ={{.*}} constant { ptr, ptr } { ptr getelementptr inbounds (ptr, ptr @_ZTVN10__cxxabiv117__class_type_infoE, i32 2), ptr @_ZTS1A }, align 4 +// CHECK-32: @_ZTS1A ={{.*}} constant [3 x i8] c"1A\00", align 1 // CHECK-64: @_ZTV1A ={{.*}} unnamed_addr constant { [5 x ptr] } { [5 x ptr] [ptr null, ptr @_ZTI1A, ptr @_ZN1A1fEv, ptr @_ZN1A1gEv, ptr @_ZN1A1hEv] }, align 8 -// CHECK-64: @_ZTS1A ={{.*}} constant [3 x i8] c"1A\00", align 1 // CHECK-64: @_ZTI1A ={{.*}} constant { ptr, ptr } { ptr getelementptr inbounds (ptr, ptr @_ZTVN10__cxxabiv117__class_type_infoE, i64 2), ptr 
@_ZTS1A }, align 8 +// CHECK-64: @_ZTS1A ={{.*}} constant [3 x i8] c"1A\00", align 1 diff --git a/clang/test/CodeGenCXX/vtable-available-externally.cpp b/clang/test/CodeGenCXX/vtable-available-externally.cpp index ab105260bc75aa0925c56d14840615e12d9f18df..4415e24f3f1cb60ac5cb087a497cae15b5b5ab98 100644 --- a/clang/test/CodeGenCXX/vtable-available-externally.cpp +++ b/clang/test/CodeGenCXX/vtable-available-externally.cpp @@ -49,8 +49,8 @@ void g() { // This tests mainly that the typeinfo and typename constants have their linkage // updated correctly. -// CHECK-TEST2: @_ZTSN5Test21AE ={{.*}} constant // CHECK-TEST2: @_ZTIN5Test21AE ={{.*}} constant +// CHECK-TEST2: @_ZTSN5Test21AE ={{.*}} constant // CHECK-TEST2: @_ZTVN5Test21AE ={{.*}} unnamed_addr constant namespace Test2 { struct A { diff --git a/clang/test/CodeGenCXX/vtable-key-function-arm.cpp b/clang/test/CodeGenCXX/vtable-key-function-arm.cpp index a054fd87c8ea73f1578e98752dd64e1a7933916b..83889bf9f8dbc5a8536dfe5b98459a06c3a7622a 100644 --- a/clang/test/CodeGenCXX/vtable-key-function-arm.cpp +++ b/clang/test/CodeGenCXX/vtable-key-function-arm.cpp @@ -90,8 +90,8 @@ struct Test2a { // V-table should be defined with strong linkage. Test2a::Test2a() { use(typeid(Test2a)); } // CHECK: @_ZTV6Test2a ={{.*}} unnamed_addr constant -// CHECK-LATE: @_ZTS6Test2a ={{.*}} constant // CHECK-LATE: @_ZTI6Test2a ={{.*}} constant +// CHECK-LATE: @_ZTS6Test2a ={{.*}} constant // 'bar' becomes the key function when 'foo' is defined inline. void Test2a::bar() {} @@ -111,8 +111,8 @@ void Test2b::bar() {} // V-table should be defined with strong linkage. Test2b::Test2b() { use(typeid(Test2b)); } // CHECK: @_ZTV6Test2b ={{.*}} unnamed_addr constant -// CHECK-LATE: @_ZTS6Test2b ={{.*}} constant // CHECK-LATE: @_ZTI6Test2b ={{.*}} constant +// CHECK-LATE: @_ZTS6Test2b ={{.*}} constant inline void Test2b::foo() {} @@ -131,8 +131,8 @@ inline void Test2c::foo() {} // V-table should be defined with strong linkage. Test2c::Test2c() { use(typeid(Test2c)); } // CHECK: @_ZTV6Test2c ={{.*}} unnamed_addr constant -// CHECK: @_ZTS6Test2c ={{.*}} constant // CHECK: @_ZTI6Test2c ={{.*}} constant +// CHECK: @_ZTS6Test2c ={{.*}} constant /*** Test3a ******************************************************************/ @@ -145,8 +145,8 @@ struct Test3a { // V-table should be defined with weak linkage. Test3a::Test3a() { use(typeid(Test3a)); } // CHECK: @_ZTV6Test3a = linkonce_odr unnamed_addr constant -// CHECK-LATE: @_ZTS6Test3a = linkonce_odr constant // CHECK-LATE: @_ZTI6Test3a = linkonce_odr constant +// CHECK-LATE: @_ZTS6Test3a = linkonce_odr constant // There ceases to be a key function after these declarations. inline void Test3a::bar() {} @@ -166,8 +166,8 @@ inline void Test3b::bar() {} // V-table should be defined with weak linkage. Test3b::Test3b() { use(typeid(Test3b)); } // CHECK: @_ZTV6Test3b = linkonce_odr unnamed_addr constant -// CHECK-LATE: @_ZTS6Test3b = linkonce_odr constant // CHECK-LATE: @_ZTI6Test3b = linkonce_odr constant +// CHECK-LATE: @_ZTS6Test3b = linkonce_odr constant inline void Test3b::foo() {} @@ -186,8 +186,8 @@ inline void Test3c::foo() {} // V-table should be defined with weak linkage. 
Test3c::Test3c() { use(typeid(Test3c)); } // CHECK: @_ZTV6Test3c = linkonce_odr unnamed_addr constant -// CHECK: @_ZTS6Test3c = linkonce_odr constant // CHECK: @_ZTI6Test3c = linkonce_odr constant +// CHECK: @_ZTS6Test3c = linkonce_odr constant /*** Test4a ******************************************************************/ @@ -200,8 +200,8 @@ template struct Test4a { // V-table should be defined with weak linkage. template <> Test4a::Test4a() { use(typeid(Test4a)); } // CHECK: @_ZTV6Test4aIiE = linkonce_odr unnamed_addr constant -// CHECK: @_ZTS6Test4aIiE = linkonce_odr constant // CHECK: @_ZTI6Test4aIiE = linkonce_odr constant +// CHECK: @_ZTS6Test4aIiE = linkonce_odr constant // There ceases to be a key function after these declarations. template <> inline void Test4a::bar() {} @@ -221,8 +221,8 @@ template <> inline void Test4b::bar() {} // V-table should be defined with weak linkage. template <> Test4b::Test4b() { use(typeid(Test4b)); } // CHECK: @_ZTV6Test4bIiE = linkonce_odr unnamed_addr constant -// CHECK: @_ZTS6Test4bIiE = linkonce_odr constant // CHECK: @_ZTI6Test4bIiE = linkonce_odr constant +// CHECK: @_ZTS6Test4bIiE = linkonce_odr constant template <> inline void Test4b::foo() {} @@ -241,8 +241,8 @@ template <> inline void Test4c::foo() {} // V-table should be defined with weak linkage. template <> Test4c::Test4c() { use(typeid(Test4c)); } // CHECK: @_ZTV6Test4cIiE = linkonce_odr unnamed_addr constant -// CHECK: @_ZTS6Test4cIiE = linkonce_odr constant // CHECK: @_ZTI6Test4cIiE = linkonce_odr constant +// CHECK: @_ZTS6Test4cIiE = linkonce_odr constant /*** Test5a ******************************************************************/ @@ -258,8 +258,8 @@ template <> inline void Test5a::foo(); // V-table should be defined with weak linkage. template <> Test5a::Test5a() { use(typeid(Test5a)); } // CHECK: @_ZTV6Test5aIiE = linkonce_odr unnamed_addr constant -// CHECK: @_ZTS6Test5aIiE = linkonce_odr constant // CHECK: @_ZTI6Test5aIiE = linkonce_odr constant +// CHECK: @_ZTS6Test5aIiE = linkonce_odr constant // There ceases to be a key function after these declarations. template <> inline void Test5a::bar() {} @@ -280,8 +280,8 @@ template <> inline void Test5b::bar() {} // V-table should be defined with weak linkage. template <> Test5b::Test5b() { use(typeid(Test5b)); } // CHECK: @_ZTV6Test5bIiE = linkonce_odr unnamed_addr constant -// CHECK: @_ZTS6Test5bIiE = linkonce_odr constant // CHECK: @_ZTI6Test5bIiE = linkonce_odr constant +// CHECK: @_ZTS6Test5bIiE = linkonce_odr constant template <> inline void Test5a::foo(); template <> inline void Test5b::foo() {} @@ -303,5 +303,5 @@ template <> inline void Test5c::foo() {} // V-table should be defined with weak linkage. template <> Test5c::Test5c() { use(typeid(Test5c)); } // CHECK: @_ZTV6Test5cIiE = linkonce_odr unnamed_addr constant -// CHECK: @_ZTS6Test5cIiE = linkonce_odr constant // CHECK: @_ZTI6Test5cIiE = linkonce_odr constant +// CHECK: @_ZTS6Test5cIiE = linkonce_odr constant diff --git a/clang/test/CodeGenCXX/vtable-key-function-ios.cpp b/clang/test/CodeGenCXX/vtable-key-function-ios.cpp index ff2793ad51f948f3025a467a21e3e761de8cc442..43abfb62c73a6c2296e510e31d3242e1d8eedfb1 100644 --- a/clang/test/CodeGenCXX/vtable-key-function-ios.cpp +++ b/clang/test/CodeGenCXX/vtable-key-function-ios.cpp @@ -63,8 +63,8 @@ struct Test1a { // V-table needs to be defined weakly. 
Test1a::Test1a() { use(typeid(Test1a)); } // CHECK: @_ZTV6Test1a = linkonce_odr {{(dso_local )?}}unnamed_addr constant -// CHECK-LATE: @_ZTS6Test1a = linkonce_odr {{(dso_local )?}}constant // CHECK-LATE: @_ZTI6Test1a = linkonce_odr {{(dso_local )?}}constant +// CHECK-LATE: @_ZTS6Test1a = linkonce_odr {{(dso_local )?}}constant // This defines the key function. inline void Test1a::foo() {} @@ -83,8 +83,8 @@ inline void Test1b::foo() {} // V-table should be defined weakly. Test1b::Test1b() { use(typeid(Test1b)); } // CHECK: @_ZTV6Test1b = linkonce_odr {{(dso_local )?}}unnamed_addr constant -// CHECK: @_ZTS6Test1b = linkonce_odr {{(dso_local )?}}constant // CHECK: @_ZTI6Test1b = linkonce_odr {{(dso_local )?}}constant +// CHECK: @_ZTS6Test1b = linkonce_odr {{(dso_local )?}}constant /*** Test2a ******************************************************************/ @@ -97,8 +97,8 @@ struct Test2a { // V-table should be defined with weak linkage. Test2a::Test2a() { use(typeid(Test2a)); } // CHECK: @_ZTV6Test2a = linkonce_odr {{(dso_local )?}}unnamed_addr constant -// CHECK-LATE: @_ZTS6Test2a = linkonce_odr {{(dso_local )?}}constant // CHECK-LATE: @_ZTI6Test2a = linkonce_odr {{(dso_local )?}}constant +// CHECK-LATE: @_ZTS6Test2a = linkonce_odr {{(dso_local )?}}constant void Test2a::bar() {} inline void Test2a::foo() {} @@ -116,8 +116,8 @@ void Test2b::bar() {} // V-table should be defined with weak linkage. Test2b::Test2b() { use(typeid(Test2b)); } // CHECK: @_ZTV6Test2b = linkonce_odr {{(dso_local )?}}unnamed_addr constant -// CHECK-LATE: @_ZTS6Test2b = linkonce_odr {{(dso_local )?}}constant // CHECK-LATE: @_ZTI6Test2b = linkonce_odr {{(dso_local )?}}constant +// CHECK-LATE: @_ZTS6Test2b = linkonce_odr {{(dso_local )?}}constant inline void Test2b::foo() {} @@ -135,8 +135,8 @@ inline void Test2c::foo() {} // V-table should be defined with weak linkage. Test2c::Test2c() { use(typeid(Test2c)); } // CHECK: @_ZTV6Test2c = linkonce_odr {{(dso_local )?}}unnamed_addr constant -// CHECK: @_ZTS6Test2c = linkonce_odr {{(dso_local )?}}constant // CHECK: @_ZTI6Test2c = linkonce_odr {{(dso_local )?}}constant +// CHECK: @_ZTS6Test2c = linkonce_odr {{(dso_local )?}}constant /*** Test3a ******************************************************************/ @@ -149,8 +149,8 @@ struct Test3a { // V-table should be defined with weak linkage. Test3a::Test3a() { use(typeid(Test3a)); } // CHECK: @_ZTV6Test3a = linkonce_odr {{(dso_local )?}}unnamed_addr constant -// CHECK-LATE: @_ZTS6Test3a = linkonce_odr {{(dso_local )?}}constant // CHECK-LATE: @_ZTI6Test3a = linkonce_odr {{(dso_local )?}}constant +// CHECK-LATE: @_ZTS6Test3a = linkonce_odr {{(dso_local )?}}constant // This defines the key function. inline void Test3a::bar() {} @@ -169,8 +169,8 @@ inline void Test3b::bar() {} // V-table should be defined with weak linkage. Test3b::Test3b() { use(typeid(Test3b)); } // CHECK: @_ZTV6Test3b = linkonce_odr {{(dso_local )?}}unnamed_addr constant -// CHECK-LATE: @_ZTS6Test3b = linkonce_odr {{(dso_local )?}}constant // CHECK-LATE: @_ZTI6Test3b = linkonce_odr {{(dso_local )?}}constant +// CHECK-LATE: @_ZTS6Test3b = linkonce_odr {{(dso_local )?}}constant // This defines the key function. inline void Test3b::foo() {} @@ -190,5 +190,5 @@ inline void Test3c::foo() {} // V-table should be defined with weak linkage. 
Test3c::Test3c() { use(typeid(Test3c)); } // CHECK: @_ZTV6Test3c = linkonce_odr {{(dso_local )?}}unnamed_addr constant -// CHECK: @_ZTS6Test3c = linkonce_odr {{(dso_local )?}}constant // CHECK: @_ZTI6Test3c = linkonce_odr {{(dso_local )?}}constant +// CHECK: @_ZTS6Test3c = linkonce_odr {{(dso_local )?}}constant diff --git a/clang/test/CodeGenCXX/vtable-key-function-win-comdat.cpp b/clang/test/CodeGenCXX/vtable-key-function-win-comdat.cpp index dd4fd9f8754a8d309ba726041743ff8b55eb6c3b..b3de2f63499995bc8252ceceac5d67df6a05eee0 100644 --- a/clang/test/CodeGenCXX/vtable-key-function-win-comdat.cpp +++ b/clang/test/CodeGenCXX/vtable-key-function-win-comdat.cpp @@ -15,11 +15,11 @@ Test1a::Test1a() { use(typeid(Test1a)); } inline void Test1a::foo() {} // CHECK: $_ZTV6Test1a = comdat any -// CHECK: $_ZTS6Test1a = comdat any // CHECK: $_ZTI6Test1a = comdat any -// CHECK-NOT: $_ZTS6Test1a.1 = comdat any +// CHECK: $_ZTS6Test1a = comdat any // CHECK-NOT: $_ZTI6Test1a.1 = comdat any +// CHECK-NOT: $_ZTS6Test1a.1 = comdat any // CHECK: @_ZTV6Test1a = linkonce_odr dso_local unnamed_addr constant {{.*}} ptr @_ZTI6Test1a -// CHECK: @_ZTS6Test1a = linkonce_odr dso_local constant // CHECK: @_ZTI6Test1a = linkonce_odr dso_local constant {{.*}} ptr @_ZTS6Test1a +// CHECK: @_ZTS6Test1a = linkonce_odr dso_local constant diff --git a/clang/test/CodeGenCXX/weak-extern-typeinfo.cpp b/clang/test/CodeGenCXX/weak-extern-typeinfo.cpp index 932d36f4abbd2b187b3c3a042af68cdd78ff5bce..8c948d16c90ec543b471f1dc3f123b21941e3b19 100644 --- a/clang/test/CodeGenCXX/weak-extern-typeinfo.cpp +++ b/clang/test/CodeGenCXX/weak-extern-typeinfo.cpp @@ -30,17 +30,17 @@ class V2 : public virtual V1 { void V1::foo() { } void V2::foo() { } -// CHECK: @_ZTS1A = weak_odr {{(dso_local |hidden )?}}constant // CHECK: @_ZTI1A = weak_odr {{(dso_local |hidden )?}}constant -// CHECK: @_ZTS1B = weak_odr {{(dso_local |hidden )?}}constant +// CHECK: @_ZTS1A = weak_odr {{(dso_local |hidden )?}}constant // CHECK: @_ZTI1B = weak_odr {{(dso_local |hidden )?}}constant +// CHECK: @_ZTS1B = weak_odr {{(dso_local |hidden )?}}constant +// CHECK: @_ZTI1C = weak_odr {{(dso_local |hidden )?}}constant // CHECK: @_ZTS1C = weak_odr {{(dso_local |hidden )?}}constant -// CHECK: @_ZTS2T1 = linkonce_odr {{(dso_local |hidden )?}}constant // CHECK: @_ZTI2T1 = linkonce_odr {{(dso_local |hidden )?}}constant -// CHECK: @_ZTS1T = linkonce_odr {{(dso_local |hidden )?}}constant +// CHECK: @_ZTS2T1 = linkonce_odr {{(dso_local |hidden )?}}constant // CHECK: @_ZTI1T = linkonce_odr {{(dso_local |hidden )?}}constant -// CHECK: @_ZTI1C = weak_odr {{(dso_local |hidden )?}}constant -// CHECK: @_ZTS2V1 = weak_odr {{(dso_local |hidden )?}}constant +// CHECK: @_ZTS1T = linkonce_odr {{(dso_local |hidden )?}}constant // CHECK: @_ZTI2V1 = weak_odr {{(dso_local |hidden )?}}constant -// CHECK: @_ZTS2V2 = weak_odr {{(dso_local |hidden )?}}constant +// CHECK: @_ZTS2V1 = weak_odr {{(dso_local |hidden )?}}constant // CHECK: @_ZTI2V2 = weak_odr {{(dso_local |hidden )?}}constant +// CHECK: @_ZTS2V2 = weak_odr {{(dso_local |hidden )?}}constant diff --git a/clang/test/CodeGenCXX/windows-itanium-type-info.cpp b/clang/test/CodeGenCXX/windows-itanium-type-info.cpp index 20bd78df5098372ba44814387f183e5ea97dbdaa..95b7b3a4b29e2564af66b92675f666164706966e 100644 --- a/clang/test/CodeGenCXX/windows-itanium-type-info.cpp +++ b/clang/test/CodeGenCXX/windows-itanium-type-info.cpp @@ -33,8 +33,8 @@ void f() { // CHECK-DAG: @_ZTI4base = external dllimport constant -// CHECK-EH-IMPORT: @_ZTS4base = linkonce_odr 
dso_local constant // CHECK-EH-IMPORT: @_ZTI4base = linkonce_odr dso_local constant +// CHECK-EH-IMPORT: @_ZTS4base = linkonce_odr dso_local constant struct __declspec(dllimport) gatekeeper {}; struct zuul : gatekeeper { diff --git a/clang/test/CodeGenHLSL/builtins/RWBuffer-constructor.hlsl b/clang/test/CodeGenHLSL/builtins/RWBuffer-constructor.hlsl index 19699dcf14d9f4f108b1e5be208699d47b6d99af..3949f7b943cfe04920eb38c4409444a119b180a7 100644 --- a/clang/test/CodeGenHLSL/builtins/RWBuffer-constructor.hlsl +++ b/clang/test/CodeGenHLSL/builtins/RWBuffer-constructor.hlsl @@ -1,19 +1,25 @@ -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -x hlsl -emit-llvm -disable-llvm-passes -o - %s | FileCheck %s -// RUN: %clang_cc1 -triple spirv-vulkan-library -x hlsl -emit-llvm -disable-llvm-passes -o - %s | FileCheck %s --check-prefix=CHECK-SPIRV +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -x hlsl -emit-llvm -disable-llvm-passes -o - %s | FileCheck %s --check-prefixes=CHECK,CHECK-DXIL +// FIXME: SPIR-V codegen of llvm.spv.handle.fromBinding is not yet implemented +// RUN-DISABLED: %clang_cc1 -triple spirv-vulkan-library -x hlsl -emit-llvm -disable-llvm-passes -o - %s | FileCheck %s --check-prefixes=CHECK,CHECK-SPIRV -// XFAIL: * -// This expectedly fails because create.handle is no longer called -// from RWBuffer constructor and the replacement has not been -// implemented yet. This test should be updated to expect -// dx.create.handleFromBinding as part of issue #105076. +// NOTE: SPIRV codegen for resource types is not yet implemented -RWBuffer Buf; +RWBuffer Buf : register(u5, space3); -// CHECK: define linkonce_odr noundef ptr @"??0?$RWBuffer@M@hlsl@@QAA@XZ" +// CHECK: %"class.hlsl::RWBuffer" = type { target("dx.TypedBuffer", float, 1, 0, 0), float } +// CHECK: @Buf = global %"class.hlsl::RWBuffer" zeroinitializer, align 4 + +// CHECK: define linkonce_odr void @_ZN4hlsl8RWBufferIfEC2Ev(ptr noundef nonnull align 4 dereferenceable(8) %this) // CHECK-NEXT: entry: -// CHECK: %[[HandleRes:[0-9]+]] = call ptr @llvm.dx.create.handle(i8 1) -// CHECK: store ptr %[[HandleRes]], ptr %h, align 4 +// CHECK: define internal void @_GLOBAL__sub_I_RWBuffer_constructor.hlsl() +// CHECK-NEXT: entry: +// CHECK-NEXT: call void @__cxx_global_var_init() +// CHECK-NEXT: call void @_init_resource_bindings() -// CHECK-SPIRV: %[[HandleRes:[0-9]+]] = call ptr @llvm.spv.create.handle(i8 1) -// CHECK-SPIRV: store ptr %[[HandleRes]], ptr %h, align 8 +// CHECK: define internal void @_init_resource_bindings() { +// CHECK-NEXT: entry: +// CHECK-DXIL-NEXT: %Buf_h = call target("dx.TypedBuffer", float, 1, 0, 0) @llvm.dx.handle.fromBinding.tdx.TypedBuffer_f32_1_0_0t(i32 3, i32 5, i32 1, i32 0, i1 false) +// CHECK-DXIL-NEXT: store target("dx.TypedBuffer", float, 1, 0, 0) %Buf_h, ptr @Buf, align 4 +// CHECK-SPIRV-NEXT: %Buf_h = call target("dx.TypedBuffer", float, 1, 0, 0) @llvm.spv.handle.fromBinding.tdx.TypedBuffer_f32_1_0_0t(i32 3, i32 5, i32 1, i32 0, i1 false) +// CHECK-SPIRV-NEXT: store target("dx.TypedBuffer", float, 1, 0, 0) %Buf_h, ptr @Buf, align 4 diff --git a/clang/test/CodeGenHLSL/builtins/RWBuffer-elementtype.hlsl b/clang/test/CodeGenHLSL/builtins/RWBuffer-elementtype.hlsl index eca4f1598fd65810bfe258513ca409ecbb0d284f..fa81b53fd9bddcb1215fb4e971c77326f69bd8be 100644 --- a/clang/test/CodeGenHLSL/builtins/RWBuffer-elementtype.hlsl +++ b/clang/test/CodeGenHLSL/builtins/RWBuffer-elementtype.hlsl @@ -1,5 +1,23 @@ // RUN: %clang_cc1 -triple dxil-pc-shadermodel6.2-compute -finclude-default-header 
-fnative-half-type -emit-llvm -o - %s | FileCheck %s +// NOTE: The type name number and whether the struct is packed or not will mostly +// likely change once subscript operators are properly implemented (llvm/llvm-project#95956) +// and theinterim field of the contained type is removed. + +// CHECK: %"class.hlsl::RWBuffer" = type <{ target("dx.TypedBuffer", i16, 1, 0, 1) +// CHECK: %"class.hlsl::RWBuffer.0" = type <{ target("dx.TypedBuffer", i16, 1, 0, 0) +// CHECK: %"class.hlsl::RWBuffer.2" = type { target("dx.TypedBuffer", i32, 1, 0, 1) +// CHECK: %"class.hlsl::RWBuffer.3" = type { target("dx.TypedBuffer", i32, 1, 0, 0) +// CHECK: %"class.hlsl::RWBuffer.4" = type { target("dx.TypedBuffer", i64, 1, 0, 1) +// CHECK: %"class.hlsl::RWBuffer.5" = type { target("dx.TypedBuffer", i64, 1, 0, 0) +// CHECK: %"class.hlsl::RWBuffer.6" = type <{ target("dx.TypedBuffer", half, 1, 0, 0) +// CHECK: %"class.hlsl::RWBuffer.8" = type { target("dx.TypedBuffer", float, 1, 0, 0) +// CHECK: %"class.hlsl::RWBuffer.9" = type { target("dx.TypedBuffer", double, 1, 0, 0) +// CHECK: %"class.hlsl::RWBuffer.10" = type { target("dx.TypedBuffer", <4 x i16>, 1, 0, 0) +// CHECK: %"class.hlsl::RWBuffer.11" = type { target("dx.TypedBuffer", <3 x i32>, 1, 0, 0) +// CHECK: %"class.hlsl::RWBuffer.12" = type { target("dx.TypedBuffer", <2 x half>, 1, 0, 0) +// CHECK: %"class.hlsl::RWBuffer.13" = type { target("dx.TypedBuffer", <3 x float>, 1, 0, 0) + RWBuffer BufI16; RWBuffer BufU16; RWBuffer BufI32; diff --git a/clang/test/CodeGenHLSL/builtins/StructuredBuffer-annotations.hlsl b/clang/test/CodeGenHLSL/builtins/StructuredBuffer-annotations.hlsl index 4d3d4908c396e6d7a3fd1dbe70bfe00299b0ff00..81c5837d8f2077e31405a409d1162850feed1829 100644 --- a/clang/test/CodeGenHLSL/builtins/StructuredBuffer-annotations.hlsl +++ b/clang/test/CodeGenHLSL/builtins/StructuredBuffer-annotations.hlsl @@ -1,22 +1,22 @@ -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -x hlsl -emit-llvm -disable-llvm-passes -o - %s | FileCheck %s - -StructuredBuffer Buffer1; -StructuredBuffer > BufferArray[4]; - -StructuredBuffer Buffer2 : register(u3); -StructuredBuffer > BufferArray2[4] : register(u4); - -StructuredBuffer Buffer3 : register(u3, space1); -StructuredBuffer > BufferArray3[4] : register(u4, space1); - -[numthreads(1,1,1)] -void main() { -} - -// CHECK: !hlsl.uavs = !{![[Single:[0-9]+]], ![[Array:[0-9]+]], ![[SingleAllocated:[0-9]+]], ![[ArrayAllocated:[0-9]+]], ![[SingleSpace:[0-9]+]], ![[ArraySpace:[0-9]+]]} -// CHECK-DAG: ![[Single]] = !{ptr @Buffer1, i32 10, i32 9, i1 false, i32 -1, i32 0} -// CHECK-DAG: ![[Array]] = !{ptr @BufferArray, i32 10, i32 9, i1 false, i32 -1, i32 0} -// CHECK-DAG: ![[SingleAllocated]] = !{ptr @Buffer2, i32 10, i32 9, i1 false, i32 3, i32 0} -// CHECK-DAG: ![[ArrayAllocated]] = !{ptr @BufferArray2, i32 10, i32 9, i1 false, i32 4, i32 0} -// CHECK-DAG: ![[SingleSpace]] = !{ptr @Buffer3, i32 10, i32 9, i1 false, i32 3, i32 1} -// CHECK-DAG: ![[ArraySpace]] = !{ptr @BufferArray3, i32 10, i32 9, i1 false, i32 4, i32 1} +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -x hlsl -emit-llvm -disable-llvm-passes -o - %s | FileCheck %s + +StructuredBuffer Buffer1; +StructuredBuffer > BufferArray[4]; + +StructuredBuffer Buffer2 : register(u3); +StructuredBuffer > BufferArray2[4] : register(u4); + +StructuredBuffer Buffer3 : register(u3, space1); +StructuredBuffer > BufferArray3[4] : register(u4, space1); + +[numthreads(1,1,1)] +void main() { +} + +// CHECK: !hlsl.uavs = !{![[Single:[0-9]+]], ![[Array:[0-9]+]], 
![[SingleAllocated:[0-9]+]], ![[ArrayAllocated:[0-9]+]], ![[SingleSpace:[0-9]+]], ![[ArraySpace:[0-9]+]]} +// CHECK-DAG: ![[Single]] = !{ptr @Buffer1, i32 10, i32 9, i1 false, i32 -1, i32 0} +// CHECK-DAG: ![[Array]] = !{ptr @BufferArray, i32 10, i32 9, i1 false, i32 -1, i32 0} +// CHECK-DAG: ![[SingleAllocated]] = !{ptr @Buffer2, i32 10, i32 9, i1 false, i32 3, i32 0} +// CHECK-DAG: ![[ArrayAllocated]] = !{ptr @BufferArray2, i32 10, i32 9, i1 false, i32 4, i32 0} +// CHECK-DAG: ![[SingleSpace]] = !{ptr @Buffer3, i32 10, i32 9, i1 false, i32 3, i32 1} +// CHECK-DAG: ![[ArraySpace]] = !{ptr @BufferArray3, i32 10, i32 9, i1 false, i32 4, i32 1} diff --git a/clang/test/CodeGenHLSL/builtins/StructuredBuffer-constructor.hlsl b/clang/test/CodeGenHLSL/builtins/StructuredBuffer-constructor.hlsl index 178332d03e64047eb07740992c2ed33e21687a46..4dbca9bc0a4d9360bf47192cc528208a1610717b 100644 --- a/clang/test/CodeGenHLSL/builtins/StructuredBuffer-constructor.hlsl +++ b/clang/test/CodeGenHLSL/builtins/StructuredBuffer-constructor.hlsl @@ -1,19 +1,24 @@ -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -x hlsl -emit-llvm -disable-llvm-passes -o - %s | FileCheck %s -// RUN: %clang_cc1 -triple spirv-vulkan-library -x hlsl -emit-llvm -disable-llvm-passes -o - %s | FileCheck %s --check-prefix=CHECK-SPIRV - -// XFAIL: * -// This expectedly fails because create.handle is no longer invoked -// from StructuredBuffer constructor and the replacement has not been -// implemented yet. This test should be updated to expect -// dx.create.handleFromBinding as part of issue #105076. - -StructuredBuffer Buf; - -// CHECK: define linkonce_odr noundef ptr @"??0?$StructuredBuffer@M@hlsl@@QAA@XZ" -// CHECK-NEXT: entry: - -// CHECK: %[[HandleRes:[0-9]+]] = call ptr @llvm.dx.create.handle(i8 1) -// CHECK: store ptr %[[HandleRes]], ptr %h, align 4 - -// CHECK-SPIRV: %[[HandleRes:[0-9]+]] = call ptr @llvm.spv.create.handle(i8 1) -// CHECK-SPIRV: store ptr %[[HandleRes]], ptr %h, align 8 +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -x hlsl -emit-llvm -disable-llvm-passes -o - %s | FileCheck %s --check-prefixes=CHECK,CHECK-DXIL +// RUN-DISABLED: %clang_cc1 -triple spirv-vulkan-library -x hlsl -emit-llvm -disable-llvm-passes -o - %s | FileCheck %s --check-prefixes=CHECK,CHECK-SPIRV + +// NOTE: SPIRV codegen for resource types is not yet implemented + +StructuredBuffer Buf : register(u10); + +// CHECK: %"class.hlsl::StructuredBuffer" = type { target("dx.RawBuffer", float, 1, 0), float } +// CHECK: @Buf = global %"class.hlsl::StructuredBuffer" zeroinitializer, align 4 + +// CHECK: define linkonce_odr void @_ZN4hlsl16StructuredBufferIfEC2Ev(ptr noundef nonnull align 4 dereferenceable(8) %this) +// CHECK-NEXT: entry: + +// CHECK: define internal void @_GLOBAL__sub_I_StructuredBuffer_constructor.hlsl() +// CHECK-NEXT: entry: +// CHECK-NEXT: call void @__cxx_global_var_init() +// CHECK-NEXT: call void @_init_resource_bindings() + +// CHECK: define internal void @_init_resource_bindings() { +// CHECK-NEXT: entry: +// CHECK-DXIL-NEXT: %Buf_h = call target("dx.RawBuffer", float, 1, 0) @llvm.dx.handle.fromBinding.tdx.RawBuffer_f32_1_0t(i32 0, i32 10, i32 1, i32 0, i1 false) +// CHECK-DXIL-NEXT: store target("dx.RawBuffer", float, 1, 0) %Buf_h, ptr @Buf, align 4 +// CHECK-SPIRV-NEXT: %Buf_h = call target("dx.RawBuffer", float, 1, 0) @llvm.spv.handle.fromBinding.tdx.RawBuffer_f32_1_0t(i32 0, i32 10, i32 1, i32 0, i1 false) +// CHECK-SPIRV-NEXT: store target("dx.RawBuffer", float, 1, 0) %Buf_h, ptr @Buf", align 4 diff 
diff --git a/clang/test/CodeGenHLSL/builtins/StructuredBuffer-elementtype.hlsl b/clang/test/CodeGenHLSL/builtins/StructuredBuffer-elementtype.hlsl
index 326885efbeeabaa329bd5b416a3e68578a86ff86..435a904327a26ab73e6fa986a97ba64bfa79aade 100644
--- a/clang/test/CodeGenHLSL/builtins/StructuredBuffer-elementtype.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/StructuredBuffer-elementtype.hlsl
@@ -1,52 +1,70 @@
-// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.2-compute -finclude-default-header -fnative-half-type -emit-llvm -o - %s | FileCheck %s
-
-StructuredBuffer<int16_t> BufI16;
-StructuredBuffer<uint16_t> BufU16;
-StructuredBuffer<int> BufI32;
-StructuredBuffer<uint> BufU32;
-StructuredBuffer<int64_t> BufI64;
-StructuredBuffer<uint64_t> BufU64;
-StructuredBuffer<half> BufF16;
-StructuredBuffer<float> BufF32;
-StructuredBuffer<double> BufF64;
-StructuredBuffer< vector<int16_t, 4> > BufI16x4;
-StructuredBuffer< vector<uint, 3> > BufU32x3;
-StructuredBuffer<half2> BufF16x2;
-StructuredBuffer<float3> BufF32x3;
-// TODO: StructuredBuffer<snorm half> BufSNormF16; -> 11
-// TODO: StructuredBuffer<unorm half> BufUNormF16; -> 12
-// TODO: StructuredBuffer<snorm float> BufSNormF32; -> 13
-// TODO: StructuredBuffer<unorm float> BufUNormF32; -> 14
-// TODO: StructuredBuffer<snorm double> BufSNormF64; -> 15
-// TODO: StructuredBuffer<unorm double> BufUNormF64; -> 16
-
-[numthreads(1,1,1)]
-void main(int GI : SV_GroupIndex) {
-  BufI16[GI] = 0;
-  BufU16[GI] = 0;
-  BufI32[GI] = 0;
-  BufU32[GI] = 0;
-  BufI64[GI] = 0;
-  BufU64[GI] = 0;
-  BufF16[GI] = 0;
-  BufF32[GI] = 0;
-  BufF64[GI] = 0;
-  BufI16x4[GI] = 0;
-  BufU32x3[GI] = 0;
-  BufF16x2[GI] = 0;
-  BufF32x3[GI] = 0;
-}
-
-// CHECK: !{{[0-9]+}} = !{ptr @BufI16, i32 10, i32 2,
-// CHECK: !{{[0-9]+}} = !{ptr @BufU16, i32 10, i32 3,
-// CHECK: !{{[0-9]+}} = !{ptr @BufI32, i32 10, i32 4,
-// CHECK: !{{[0-9]+}} = !{ptr @BufU32, i32 10, i32 5,
-// CHECK: !{{[0-9]+}} = !{ptr @BufI64, i32 10, i32 6,
-// CHECK: !{{[0-9]+}} = !{ptr @BufU64, i32 10, i32 7,
-// CHECK: !{{[0-9]+}} = !{ptr @BufF16, i32 10, i32 8,
-// CHECK: !{{[0-9]+}} = !{ptr @BufF32, i32 10, i32 9,
-// CHECK: !{{[0-9]+}} = !{ptr @BufF64, i32 10, i32 10,
-// CHECK: !{{[0-9]+}} = !{ptr @BufI16x4, i32 10, i32 2,
-// CHECK: !{{[0-9]+}} = !{ptr @BufU32x3, i32 10, i32 5,
-// CHECK: !{{[0-9]+}} = !{ptr @BufF16x2, i32 10, i32 8,
-// CHECK: !{{[0-9]+}} = !{ptr @BufF32x3, i32 10, i32 9,
+// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.2-compute -finclude-default-header -fnative-half-type -emit-llvm -o - %s | FileCheck %s
+
+// NOTE: The number in the type name and whether the struct is packed or not will most
+// likely change once subscript operators are properly implemented (llvm/llvm-project#95956)
+// and the interim field of the contained type is removed.
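+// For illustration only (not part of the checked output): with the interim
+// field still present, each buffer lowers to a struct of the resource handle
+// plus one field of the contained type, so StructuredBuffer<int16_t> becomes
+// roughly <{ target("dx.RawBuffer", i16, 1, 0), i16 }>; the packed "<{ ... }>"
+// spelling appears only for element types narrower than 4 bytes.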
+
+// CHECK: %"class.hlsl::StructuredBuffer" = type <{ target("dx.RawBuffer", i16, 1, 0)
+// CHECK: %"class.hlsl::StructuredBuffer.0" = type <{ target("dx.RawBuffer", i16, 1, 0)
+// CHECK: %"class.hlsl::StructuredBuffer.2" = type { target("dx.RawBuffer", i32, 1, 0)
+// CHECK: %"class.hlsl::StructuredBuffer.3" = type { target("dx.RawBuffer", i32, 1, 0)
+// CHECK: %"class.hlsl::StructuredBuffer.4" = type { target("dx.RawBuffer", i64, 1, 0)
+// CHECK: %"class.hlsl::StructuredBuffer.5" = type { target("dx.RawBuffer", i64, 1, 0)
+// CHECK: %"class.hlsl::StructuredBuffer.6" = type <{ target("dx.RawBuffer", half, 1, 0)
+// CHECK: %"class.hlsl::StructuredBuffer.8" = type { target("dx.RawBuffer", float, 1, 0)
+// CHECK: %"class.hlsl::StructuredBuffer.9" = type { target("dx.RawBuffer", double, 1, 0)
+// CHECK: %"class.hlsl::StructuredBuffer.10" = type { target("dx.RawBuffer", <4 x i16>, 1, 0)
+// CHECK: %"class.hlsl::StructuredBuffer.11" = type { target("dx.RawBuffer", <3 x i32>, 1, 0)
+// CHECK: %"class.hlsl::StructuredBuffer.12" = type { target("dx.RawBuffer", <2 x half>, 1, 0)
+// CHECK: %"class.hlsl::StructuredBuffer.13" = type { target("dx.RawBuffer", <3 x float>, 1, 0)
+
+StructuredBuffer<int16_t> BufI16;
+StructuredBuffer<uint16_t> BufU16;
+StructuredBuffer<int> BufI32;
+StructuredBuffer<uint> BufU32;
+StructuredBuffer<int64_t> BufI64;
+StructuredBuffer<uint64_t> BufU64;
+StructuredBuffer<half> BufF16;
+StructuredBuffer<float> BufF32;
+StructuredBuffer<double> BufF64;
+StructuredBuffer< vector<int16_t, 4> > BufI16x4;
+StructuredBuffer< vector<uint, 3> > BufU32x3;
+StructuredBuffer<half2> BufF16x2;
+StructuredBuffer<float3> BufF32x3;
+// TODO: StructuredBuffer<snorm half> BufSNormF16; -> 11
+// TODO: StructuredBuffer<unorm half> BufUNormF16; -> 12
+// TODO: StructuredBuffer<snorm float> BufSNormF32; -> 13
+// TODO: StructuredBuffer<unorm float> BufUNormF32; -> 14
+// TODO: StructuredBuffer<snorm double> BufSNormF64; -> 15
+// TODO: StructuredBuffer<unorm double> BufUNormF64; -> 16
+
+[numthreads(1,1,1)]
+void main(int GI : SV_GroupIndex) {
+  BufI16[GI] = 0;
+  BufU16[GI] = 0;
+  BufI32[GI] = 0;
+  BufU32[GI] = 0;
+  BufI64[GI] = 0;
+  BufU64[GI] = 0;
+  BufF16[GI] = 0;
+  BufF32[GI] = 0;
+  BufF64[GI] = 0;
+  BufI16x4[GI] = 0;
+  BufU32x3[GI] = 0;
+  BufF16x2[GI] = 0;
+  BufF32x3[GI] = 0;
+}
+
+// CHECK: !{{[0-9]+}} = !{ptr @BufI16, i32 10, i32 2,
+// CHECK: !{{[0-9]+}} = !{ptr @BufU16, i32 10, i32 3,
+// CHECK: !{{[0-9]+}} = !{ptr @BufI32, i32 10, i32 4,
+// CHECK: !{{[0-9]+}} = !{ptr @BufU32, i32 10, i32 5,
+// CHECK: !{{[0-9]+}} = !{ptr @BufI64, i32 10, i32 6,
+// CHECK: !{{[0-9]+}} = !{ptr @BufU64, i32 10, i32 7,
+// CHECK: !{{[0-9]+}} = !{ptr @BufF16, i32 10, i32 8,
+// CHECK: !{{[0-9]+}} = !{ptr @BufF32, i32 10, i32 9,
+// CHECK: !{{[0-9]+}} = !{ptr @BufF64, i32 10, i32 10,
+// CHECK: !{{[0-9]+}} = !{ptr @BufI16x4, i32 10, i32 2,
+// CHECK: !{{[0-9]+}} = !{ptr @BufU32x3, i32 10, i32 5,
+// CHECK: !{{[0-9]+}} = !{ptr @BufF16x2, i32 10, i32 8,
+// CHECK: !{{[0-9]+}} = !{ptr @BufF32x3, i32 10, i32 9,
diff --git a/clang/test/CodeGenHLSL/builtins/StructuredBuffer-subscript.hlsl b/clang/test/CodeGenHLSL/builtins/StructuredBuffer-subscript.hlsl
index 155749ec4f94a9cb800dc11e6505dbc8fffa9206..89bde9236288fc34b56e915fbd96e51d132adb86 100644
--- a/clang/test/CodeGenHLSL/builtins/StructuredBuffer-subscript.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/StructuredBuffer-subscript.hlsl
@@ -1,17 +1,17 @@
-// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -emit-llvm -o - -O0 %s | FileCheck %s
-
-StructuredBuffer In;
-StructuredBuffer Out;
-
-[numthreads(1,1,1)]
-void main(unsigned GI : SV_GroupIndex) {
-  Out[GI] = In[GI];
-}
-
-// Even at -O0 the subscript operators get inlined. The -O0 IR is a bit messy
-// and confusing to follow so the match here is pretty weak.
-
-// CHECK: define void @main()
-// Verify inlining leaves only calls to "llvm." intrinsics
-// CHECK-NOT: call {{[^@]*}} @{{[^l][^l][^v][^m][^\.]}}
-// CHECK: ret void
+// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -emit-llvm -o - -O0 %s | FileCheck %s
+
+StructuredBuffer In;
+StructuredBuffer Out;
+
+[numthreads(1,1,1)]
+void main(unsigned GI : SV_GroupIndex) {
+  Out[GI] = In[GI];
+}
+
+// Even at -O0 the subscript operators get inlined. The -O0 IR is a bit messy
+// and confusing to follow so the match here is pretty weak.
+
+// CHECK: define void @main()
+// Verify inlining leaves only calls to "llvm." intrinsics
+// CHECK-NOT: call {{[^@]*}} @{{[^l][^l][^v][^m][^\.]}}
+// CHECK: ret void
diff --git a/clang/test/CodeGenHLSL/builtins/WaveReadLaneAt.hlsl b/clang/test/CodeGenHLSL/builtins/WaveReadLaneAt.hlsl
new file mode 100644
index 0000000000000000000000000000000000000000..093a199a32bdc840215d2347a3254efa89ef3ee4
--- /dev/null
+++ b/clang/test/CodeGenHLSL/builtins/WaveReadLaneAt.hlsl
@@ -0,0 +1,74 @@
+// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -fnative-half-type -triple \
+// RUN:   dxil-pc-shadermodel6.3-compute %s -emit-llvm -disable-llvm-passes -o - | \
+// RUN:   FileCheck %s --check-prefixes=CHECK,CHECK-DXIL
+// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -fnative-half-type -triple \
+// RUN:   spirv-pc-vulkan-compute %s -emit-llvm -disable-llvm-passes -o - | \
+// RUN:   FileCheck %s --check-prefixes=CHECK,CHECK-SPIRV
+
+// Test basic lowering to runtime function call for int values.
+
+// CHECK-LABEL: test_int
+int test_int(int expr, uint idx) {
+  // CHECK-SPIRV: %[[#entry_tok0:]] = call token @llvm.experimental.convergence.entry()
+  // CHECK-SPIRV: %[[RET:.*]] = call spir_func [[TY:.*]] @llvm.spv.wave.readlane.i32([[TY]] %[[#]], i32 %[[#]]) [ "convergencectrl"(token %[[#entry_tok0]]) ]
+  // CHECK-DXIL: %[[RET:.*]] = call [[TY:.*]] @llvm.dx.wave.readlane.i32([[TY]] %[[#]], i32 %[[#]])
+  // CHECK: ret [[TY]] %[[RET]]
+  return WaveReadLaneAt(expr, idx);
+}
+
+// CHECK-DXIL: declare [[TY]] @llvm.dx.wave.readlane.i32([[TY]], i32) #[[#attr:]]
+// CHECK-SPIRV: declare spir_func [[TY]] @llvm.spv.wave.readlane.i32([[TY]], i32) #[[#attr:]]
+
+#ifdef __HLSL_ENABLE_16_BIT
+// CHECK-LABEL: test_int16
+int16_t test_int16(int16_t expr, uint idx) {
+  // CHECK-SPIRV: %[[#entry_tok1:]] = call token @llvm.experimental.convergence.entry()
+  // CHECK-SPIRV: %[[RET:.*]] = call spir_func [[TY:.*]] @llvm.spv.wave.readlane.i16([[TY]] %[[#]], i32 %[[#]]) [ "convergencectrl"(token %[[#entry_tok1]]) ]
+  // CHECK-DXIL: %[[RET:.*]] = call [[TY:.*]] @llvm.dx.wave.readlane.i16([[TY]] %[[#]], i32 %[[#]])
+  // CHECK: ret [[TY]] %[[RET]]
+  return WaveReadLaneAt(expr, idx);
+}
+
+// CHECK-DXIL: declare [[TY]] @llvm.dx.wave.readlane.i16([[TY]], i32) #[[#attr:]]
+// CHECK-SPIRV: declare spir_func [[TY]] @llvm.spv.wave.readlane.i16([[TY]], i32) #[[#attr:]]
#endif
+
+// Test basic lowering to runtime function call with float and vector values.
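+// The HLSL side has the same shape in every case, e.g. (hypothetical value):
+//   half h = WaveReadLaneAt(someHalf, 0);  // broadcast lane 0's value
+// On SPIR-V each block below first captures the token from
+// @llvm.experimental.convergence.entry() and requires the intrinsic call to
+// carry the matching "convergencectrl" bundle; the DXIL checks expect a
+// plain call with no bundle.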
+ +// CHECK-LABEL: test_half +half test_half(half expr, uint idx) { + // CHECK-SPIRV: %[[#entry_tok2:]] = call token @llvm.experimental.convergence.entry() + // CHECK-SPIRV: %[[RET:.*]] = call spir_func [[TY:.*]] @llvm.spv.wave.readlane.f16([[TY]] %[[#]], i32 %[[#]]) [ "convergencectrl"(token %[[#entry_tok2]]) ] + // CHECK-DXIL: %[[RET:.*]] = call [[TY:.*]] @llvm.dx.wave.readlane.f16([[TY]] %[[#]], i32 %[[#]]) + // CHECK: ret [[TY]] %[[RET]] + return WaveReadLaneAt(expr, idx); +} + +// CHECK-DXIL: declare [[TY]] @llvm.dx.wave.readlane.f16([[TY]], i32) #[[#attr:]] +// CHECK-SPIRV: declare spir_func [[TY]] @llvm.spv.wave.readlane.f16([[TY]], i32) #[[#attr:]] + +// CHECK-LABEL: test_double +double test_double(double expr, uint idx) { + // CHECK-SPIRV: %[[#entry_tok3:]] = call token @llvm.experimental.convergence.entry() + // CHECK-SPIRV: %[[RET:.*]] = call spir_func [[TY:.*]] @llvm.spv.wave.readlane.f64([[TY]] %[[#]], i32 %[[#]]) [ "convergencectrl"(token %[[#entry_tok3]]) ] + // CHECK-DXIL: %[[RET:.*]] = call [[TY:.*]] @llvm.dx.wave.readlane.f64([[TY]] %[[#]], i32 %[[#]]) + // CHECK: ret [[TY]] %[[RET]] + return WaveReadLaneAt(expr, idx); +} + +// CHECK-DXIL: declare [[TY]] @llvm.dx.wave.readlane.f64([[TY]], i32) #[[#attr:]] +// CHECK-SPIRV: declare spir_func [[TY]] @llvm.spv.wave.readlane.f64([[TY]], i32) #[[#attr:]] + +// CHECK-LABEL: test_floatv4 +float4 test_floatv4(float4 expr, uint idx) { + // CHECK-SPIRV: %[[#entry_tok4:]] = call token @llvm.experimental.convergence.entry() + // CHECK-SPIRV: %[[RET1:.*]] = call spir_func [[TY1:.*]] @llvm.spv.wave.readlane.v4f32([[TY1]] %[[#]], i32 %[[#]]) [ "convergencectrl"(token %[[#entry_tok4]]) ] + // CHECK-DXIL: %[[RET1:.*]] = call [[TY1:.*]] @llvm.dx.wave.readlane.v4f32([[TY1]] %[[#]], i32 %[[#]]) + // CHECK: ret [[TY1]] %[[RET1]] + return WaveReadLaneAt(expr, idx); +} + +// CHECK-DXIL: declare [[TY1]] @llvm.dx.wave.readlane.v4f32([[TY1]], i32) #[[#attr]] +// CHECK-SPIRV: declare spir_func [[TY1]] @llvm.spv.wave.readlane.v4f32([[TY1]], i32) #[[#attr]] + +// CHECK: attributes #[[#attr]] = {{{.*}} convergent {{.*}}} diff --git a/clang/test/CodeGenHLSL/builtins/atan2.hlsl b/clang/test/CodeGenHLSL/builtins/atan2.hlsl index 40796052e608fe914e84bd091e554be96e55e08c..ada269db2f00d381759e969af909429eacf197b5 100644 --- a/clang/test/CodeGenHLSL/builtins/atan2.hlsl +++ b/clang/test/CodeGenHLSL/builtins/atan2.hlsl @@ -1,59 +1,59 @@ -// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ -// RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type \ -// RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ -// RUN: --check-prefixes=CHECK,NATIVE_HALF -// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ -// RUN: spirv-unknown-vulkan-compute %s -emit-llvm -disable-llvm-passes \ -// RUN: -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF - -// CHECK-LABEL: test_atan2_half -// NATIVE_HALF: call half @llvm.atan2.f16 -// NO_HALF: call float @llvm.atan2.f32 -half test_atan2_half (half p0, half p1) { - return atan2(p0, p1); -} - -// CHECK-LABEL: test_atan2_half2 -// NATIVE_HALF: call <2 x half> @llvm.atan2.v2f16 -// NO_HALF: call <2 x float> @llvm.atan2.v2f32 -half2 test_atan2_half2 (half2 p0, half2 p1) { - return atan2(p0, p1); -} - -// CHECK-LABEL: test_atan2_half3 -// NATIVE_HALF: call <3 x half> @llvm.atan2.v3f16 -// NO_HALF: call <3 x float> @llvm.atan2.v3f32 -half3 test_atan2_half3 (half3 p0, half3 p1) { - return atan2(p0, p1); -} - -// CHECK-LABEL: test_atan2_half4 -// NATIVE_HALF: call <4 x half> @llvm.atan2.v4f16 -// NO_HALF: call <4 
x float> @llvm.atan2.v4f32 -half4 test_atan2_half4 (half4 p0, half4 p1) { - return atan2(p0, p1); -} - -// CHECK-LABEL: test_atan2_float -// CHECK: call float @llvm.atan2.f32 -float test_atan2_float (float p0, float p1) { - return atan2(p0, p1); -} - -// CHECK-LABEL: test_atan2_float2 -// CHECK: call <2 x float> @llvm.atan2.v2f32 -float2 test_atan2_float2 (float2 p0, float2 p1) { - return atan2(p0, p1); -} - -// CHECK-LABEL: test_atan2_float3 -// CHECK: call <3 x float> @llvm.atan2.v3f32 -float3 test_atan2_float3 (float3 p0, float3 p1) { - return atan2(p0, p1); -} - -// CHECK-LABEL: test_atan2_float4 -// CHECK: call <4 x float> @llvm.atan2.v4f32 -float4 test_atan2_float4 (float4 p0, float4 p1) { - return atan2(p0, p1); -} +// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ +// RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type \ +// RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ +// RUN: --check-prefixes=CHECK,NATIVE_HALF +// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ +// RUN: spirv-unknown-vulkan-compute %s -emit-llvm -disable-llvm-passes \ +// RUN: -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF + +// CHECK-LABEL: test_atan2_half +// NATIVE_HALF: call half @llvm.atan2.f16 +// NO_HALF: call float @llvm.atan2.f32 +half test_atan2_half (half p0, half p1) { + return atan2(p0, p1); +} + +// CHECK-LABEL: test_atan2_half2 +// NATIVE_HALF: call <2 x half> @llvm.atan2.v2f16 +// NO_HALF: call <2 x float> @llvm.atan2.v2f32 +half2 test_atan2_half2 (half2 p0, half2 p1) { + return atan2(p0, p1); +} + +// CHECK-LABEL: test_atan2_half3 +// NATIVE_HALF: call <3 x half> @llvm.atan2.v3f16 +// NO_HALF: call <3 x float> @llvm.atan2.v3f32 +half3 test_atan2_half3 (half3 p0, half3 p1) { + return atan2(p0, p1); +} + +// CHECK-LABEL: test_atan2_half4 +// NATIVE_HALF: call <4 x half> @llvm.atan2.v4f16 +// NO_HALF: call <4 x float> @llvm.atan2.v4f32 +half4 test_atan2_half4 (half4 p0, half4 p1) { + return atan2(p0, p1); +} + +// CHECK-LABEL: test_atan2_float +// CHECK: call float @llvm.atan2.f32 +float test_atan2_float (float p0, float p1) { + return atan2(p0, p1); +} + +// CHECK-LABEL: test_atan2_float2 +// CHECK: call <2 x float> @llvm.atan2.v2f32 +float2 test_atan2_float2 (float2 p0, float2 p1) { + return atan2(p0, p1); +} + +// CHECK-LABEL: test_atan2_float3 +// CHECK: call <3 x float> @llvm.atan2.v3f32 +float3 test_atan2_float3 (float3 p0, float3 p1) { + return atan2(p0, p1); +} + +// CHECK-LABEL: test_atan2_float4 +// CHECK: call <4 x float> @llvm.atan2.v4f32 +float4 test_atan2_float4 (float4 p0, float4 p1) { + return atan2(p0, p1); +} diff --git a/clang/test/CodeGenHLSL/builtins/cross.hlsl b/clang/test/CodeGenHLSL/builtins/cross.hlsl index 514e57d36b2016101172970733a68beb25e27c0e..eba710c905bf46c063213db1b3fa42ea4f042507 100644 --- a/clang/test/CodeGenHLSL/builtins/cross.hlsl +++ b/clang/test/CodeGenHLSL/builtins/cross.hlsl @@ -1,37 +1,37 @@ -// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ -// RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type \ -// RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ -// RUN: --check-prefixes=CHECK,NATIVE_HALF \ -// RUN: -DFNATTRS=noundef -DTARGET=dx -// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ -// RUN: dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \ -// RUN: -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF \ -// RUN: -DFNATTRS=noundef -DTARGET=dx -// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ -// RUN: spirv-unknown-vulkan-compute %s 
-fnative-half-type \ -// RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ -// RUN: --check-prefixes=CHECK,NATIVE_HALF \ -// RUN: -DFNATTRS="spir_func noundef" -DTARGET=spv -// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ -// RUN: spirv-unknown-vulkan-compute %s -emit-llvm -disable-llvm-passes \ -// RUN: -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF \ -// RUN: -DFNATTRS="spir_func noundef" -DTARGET=spv - -// NATIVE_HALF: define [[FNATTRS]] <3 x half> @ -// NATIVE_HALF: call <3 x half> @llvm.[[TARGET]].cross.v3f16(<3 x half> -// NATIVE_HALF: ret <3 x half> %hlsl.cross -// NO_HALF: define [[FNATTRS]] <3 x float> @ -// NO_HALF: call <3 x float> @llvm.[[TARGET]].cross.v3f32(<3 x float> -// NO_HALF: ret <3 x float> %hlsl.cross -half3 test_cross_half3(half3 p0, half3 p1) -{ - return cross(p0, p1); -} - -// CHECK: define [[FNATTRS]] <3 x float> @ -// CHECK: %hlsl.cross = call <3 x float> @llvm.[[TARGET]].cross.v3f32( -// CHECK: ret <3 x float> %hlsl.cross -float3 test_cross_float3(float3 p0, float3 p1) -{ - return cross(p0, p1); -} +// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ +// RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type \ +// RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ +// RUN: --check-prefixes=CHECK,NATIVE_HALF \ +// RUN: -DFNATTRS=noundef -DTARGET=dx +// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ +// RUN: dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \ +// RUN: -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF \ +// RUN: -DFNATTRS=noundef -DTARGET=dx +// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ +// RUN: spirv-unknown-vulkan-compute %s -fnative-half-type \ +// RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ +// RUN: --check-prefixes=CHECK,NATIVE_HALF \ +// RUN: -DFNATTRS="spir_func noundef" -DTARGET=spv +// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ +// RUN: spirv-unknown-vulkan-compute %s -emit-llvm -disable-llvm-passes \ +// RUN: -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF \ +// RUN: -DFNATTRS="spir_func noundef" -DTARGET=spv + +// NATIVE_HALF: define [[FNATTRS]] <3 x half> @ +// NATIVE_HALF: call <3 x half> @llvm.[[TARGET]].cross.v3f16(<3 x half> +// NATIVE_HALF: ret <3 x half> %hlsl.cross +// NO_HALF: define [[FNATTRS]] <3 x float> @ +// NO_HALF: call <3 x float> @llvm.[[TARGET]].cross.v3f32(<3 x float> +// NO_HALF: ret <3 x float> %hlsl.cross +half3 test_cross_half3(half3 p0, half3 p1) +{ + return cross(p0, p1); +} + +// CHECK: define [[FNATTRS]] <3 x float> @ +// CHECK: %hlsl.cross = call <3 x float> @llvm.[[TARGET]].cross.v3f32( +// CHECK: ret <3 x float> %hlsl.cross +float3 test_cross_float3(float3 p0, float3 p1) +{ + return cross(p0, p1); +} diff --git a/clang/test/CodeGenHLSL/builtins/hlsl_resource_t.hlsl b/clang/test/CodeGenHLSL/builtins/hlsl_resource_t.hlsl index e735a85b589f87eae850665b2a557025f7a8c177..6751cf2703ce0e493c5370f29d54a9ab2c2c9a12 100644 --- a/clang/test/CodeGenHLSL/builtins/hlsl_resource_t.hlsl +++ b/clang/test/CodeGenHLSL/builtins/hlsl_resource_t.hlsl @@ -1,9 +1,53 @@ -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -x hlsl -emit-llvm -O1 -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -finclude-default-header -x hlsl -emit-llvm -o - %s | FileCheck %s -void foo(__hlsl_resource_t res); +using handle_float_t = __hlsl_resource_t [[hlsl::resource_class(UAV)]] [[hlsl::contained_type(float)]]; -// CHECK: define void 
@_Z3baru17__hlsl_resource_t(target("dx.TypedBuffer", <4 x float>, 1, 0, 0) %[[PARAM:[a-zA-Z0-9]+]])
-// CHECK: call void @_Z3foou17__hlsl_resource_t(target("dx.TypedBuffer", <4 x float>, 1, 0, 0) %[[PARAM]])
-void bar(__hlsl_resource_t a) {
-  foo(a);
+// CHECK: %"class.hlsl::RWBuffer" = type { target("dx.TypedBuffer", <4 x float>, 1, 0, 0)
+// CHECK: %"class.hlsl::StructuredBuffer" = type { target("dx.RawBuffer", %struct.MyStruct = type { <4 x float>, <2 x i32>, [8 x i8] }, 1, 0)
+
+// CHECK: define void @_Z2faU9_Res_u_CTfu17__hlsl_resource_t(target("dx.TypedBuffer", float, 1, 0, 0) %a)
+// CHECK: call void @_Z4foo1U9_Res_u_CTfu17__hlsl_resource_t(target("dx.TypedBuffer", float, 1, 0, 0) %0)
+// CHECK: declare void @_Z4foo1U9_Res_u_CTfu17__hlsl_resource_t(target("dx.TypedBuffer", float, 1, 0, 0))
+
+void foo1(handle_float_t res);
+
+void fa(handle_float_t a) {
+  foo1(a);
+}
+
+// CHECK: define void @_Z2fbU9_Res_u_CTfu17__hlsl_resource_t(target("dx.TypedBuffer", float, 1, 0, 0) %a)
+void fb(handle_float_t a) {
+  handle_float_t b = a;
 }
+
+// CHECK: define void @_Z2fcN4hlsl8RWBufferIDv4_fEE(ptr noundef byval(%"class.hlsl::RWBuffer") align 16 %a)
+// CHECK: call void @_Z4foo2N4hlsl8RWBufferIDv4_fEE(ptr noundef byval(%"class.hlsl::RWBuffer") align 16 %agg.tmp)
+// CHECK: declare void @_Z4foo2N4hlsl8RWBufferIDv4_fEE(ptr noundef byval(%"class.hlsl::RWBuffer") align 16)
+void foo2(RWBuffer<float4> buf);
+
+void fc(RWBuffer<float4> a) {
+  foo2(a);
+}
+
+void fd(RWBuffer<float4> a) {
+  RWBuffer<float4> b = a;
+}
+
+struct MyStruct {
+  float4 f;
+  int2 i;
+};
+
+// CHECK: define void @_Z2feN4hlsl16StructuredBufferI8MyStructEE(ptr noundef byval(%"class.hlsl::StructuredBuffer") align 16 %a)
+// CHECK: call void @_Z4foo3N4hlsl16StructuredBufferI8MyStructEE(ptr noundef byval(%"class.hlsl::StructuredBuffer") align 16 %agg.tmp)
+// CHECK: declare void @_Z4foo3N4hlsl16StructuredBufferI8MyStructEE(ptr noundef byval(%"class.hlsl::StructuredBuffer") align 16)
+void foo3(StructuredBuffer<MyStruct> buf);
+
+void fe(StructuredBuffer<MyStruct> a) {
+  foo3(a);
+}
+
+void ff(StructuredBuffer<MyStruct> a) {
+  StructuredBuffer<MyStruct> b = a;
+}
+
diff --git a/clang/test/CodeGenHLSL/builtins/length.hlsl b/clang/test/CodeGenHLSL/builtins/length.hlsl
index 1c23b0df04df9873629f696f54a9ed5cc47037a7..9b0293c218a5deed0d5e2be5554d08ac0dd6be90 100644
--- a/clang/test/CodeGenHLSL/builtins/length.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/length.hlsl
@@ -1,73 +1,73 @@
-// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
-// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
-// RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \
-// RUN:   --check-prefixes=CHECK,NATIVE_HALF
-// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
-// RUN:   dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \
-// RUN:   -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF
-
-// NATIVE_HALF: define noundef half @
-// NATIVE_HALF: call half @llvm.fabs.f16(half
-// NO_HALF: call float @llvm.fabs.f32(float
-// NATIVE_HALF: ret half
-// NO_HALF: ret float
-half test_length_half(half p0)
-{
-  return length(p0);
-}
-// NATIVE_HALF: define noundef half @
-// NATIVE_HALF: %hlsl.length = call half @llvm.dx.length.v2f16
-// NO_HALF: %hlsl.length = call float @llvm.dx.length.v2f32(
-// NATIVE_HALF: ret half %hlsl.length
-// NO_HALF: ret float %hlsl.length
-half test_length_half2(half2 p0)
-{
-  return length(p0);
-}
-// NATIVE_HALF: define noundef half @
-// NATIVE_HALF: %hlsl.length = call half @llvm.dx.length.v3f16
-// NO_HALF: %hlsl.length = call float @llvm.dx.length.v3f32(
-//
NATIVE_HALF: ret half %hlsl.length -// NO_HALF: ret float %hlsl.length -half test_length_half3(half3 p0) -{ - return length(p0); -} -// NATIVE_HALF: define noundef half @ -// NATIVE_HALF: %hlsl.length = call half @llvm.dx.length.v4f16 -// NO_HALF: %hlsl.length = call float @llvm.dx.length.v4f32( -// NATIVE_HALF: ret half %hlsl.length -// NO_HALF: ret float %hlsl.length -half test_length_half4(half4 p0) -{ - return length(p0); -} - -// CHECK: define noundef float @ -// CHECK: call float @llvm.fabs.f32(float -// CHECK: ret float -float test_length_float(float p0) -{ - return length(p0); -} -// CHECK: define noundef float @ -// CHECK: %hlsl.length = call float @llvm.dx.length.v2f32( -// CHECK: ret float %hlsl.length -float test_length_float2(float2 p0) -{ - return length(p0); -} -// CHECK: define noundef float @ -// CHECK: %hlsl.length = call float @llvm.dx.length.v3f32( -// CHECK: ret float %hlsl.length -float test_length_float3(float3 p0) -{ - return length(p0); -} -// CHECK: define noundef float @ -// CHECK: %hlsl.length = call float @llvm.dx.length.v4f32( -// CHECK: ret float %hlsl.length -float test_length_float4(float4 p0) -{ - return length(p0); -} +// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ +// RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type \ +// RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ +// RUN: --check-prefixes=CHECK,NATIVE_HALF +// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ +// RUN: dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \ +// RUN: -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF + +// NATIVE_HALF: define noundef half @ +// NATIVE_HALF: call half @llvm.fabs.f16(half +// NO_HALF: call float @llvm.fabs.f32(float +// NATIVE_HALF: ret half +// NO_HALF: ret float +half test_length_half(half p0) +{ + return length(p0); +} +// NATIVE_HALF: define noundef half @ +// NATIVE_HALF: %hlsl.length = call half @llvm.dx.length.v2f16 +// NO_HALF: %hlsl.length = call float @llvm.dx.length.v2f32( +// NATIVE_HALF: ret half %hlsl.length +// NO_HALF: ret float %hlsl.length +half test_length_half2(half2 p0) +{ + return length(p0); +} +// NATIVE_HALF: define noundef half @ +// NATIVE_HALF: %hlsl.length = call half @llvm.dx.length.v3f16 +// NO_HALF: %hlsl.length = call float @llvm.dx.length.v3f32( +// NATIVE_HALF: ret half %hlsl.length +// NO_HALF: ret float %hlsl.length +half test_length_half3(half3 p0) +{ + return length(p0); +} +// NATIVE_HALF: define noundef half @ +// NATIVE_HALF: %hlsl.length = call half @llvm.dx.length.v4f16 +// NO_HALF: %hlsl.length = call float @llvm.dx.length.v4f32( +// NATIVE_HALF: ret half %hlsl.length +// NO_HALF: ret float %hlsl.length +half test_length_half4(half4 p0) +{ + return length(p0); +} + +// CHECK: define noundef float @ +// CHECK: call float @llvm.fabs.f32(float +// CHECK: ret float +float test_length_float(float p0) +{ + return length(p0); +} +// CHECK: define noundef float @ +// CHECK: %hlsl.length = call float @llvm.dx.length.v2f32( +// CHECK: ret float %hlsl.length +float test_length_float2(float2 p0) +{ + return length(p0); +} +// CHECK: define noundef float @ +// CHECK: %hlsl.length = call float @llvm.dx.length.v3f32( +// CHECK: ret float %hlsl.length +float test_length_float3(float3 p0) +{ + return length(p0); +} +// CHECK: define noundef float @ +// CHECK: %hlsl.length = call float @llvm.dx.length.v4f32( +// CHECK: ret float %hlsl.length +float test_length_float4(float4 p0) +{ + return length(p0); +} diff --git a/clang/test/CodeGenHLSL/builtins/normalize.hlsl 
b/clang/test/CodeGenHLSL/builtins/normalize.hlsl index 83ad607c14a607e0711bdeadef2411eb463b47c8..d14e7c70ce0653428b950a21f706ba1258704db7 100644 --- a/clang/test/CodeGenHLSL/builtins/normalize.hlsl +++ b/clang/test/CodeGenHLSL/builtins/normalize.hlsl @@ -1,85 +1,85 @@ -// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ -// RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type \ -// RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ -// RUN: --check-prefixes=CHECK,NATIVE_HALF \ -// RUN: -DFNATTRS=noundef -DTARGET=dx -// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ -// RUN: dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \ -// RUN: -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF \ -// RUN: -DFNATTRS=noundef -DTARGET=dx -// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ -// RUN: spirv-unknown-vulkan-compute %s -fnative-half-type \ -// RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ -// RUN: --check-prefixes=CHECK,NATIVE_HALF \ -// RUN: -DFNATTRS="spir_func noundef" -DTARGET=spv -// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ -// RUN: spirv-unknown-vulkan-compute %s -emit-llvm -disable-llvm-passes \ -// RUN: -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF \ -// RUN: -DFNATTRS="spir_func noundef" -DTARGET=spv - -// NATIVE_HALF: define [[FNATTRS]] half @ -// NATIVE_HALF: call half @llvm.[[TARGET]].normalize.f16(half -// NO_HALF: call float @llvm.[[TARGET]].normalize.f32(float -// NATIVE_HALF: ret half -// NO_HALF: ret float -half test_normalize_half(half p0) -{ - return normalize(p0); -} -// NATIVE_HALF: define [[FNATTRS]] <2 x half> @ -// NATIVE_HALF: call <2 x half> @llvm.[[TARGET]].normalize.v2f16(<2 x half> -// NO_HALF: call <2 x float> @llvm.[[TARGET]].normalize.v2f32(<2 x float> -// NATIVE_HALF: ret <2 x half> %hlsl.normalize -// NO_HALF: ret <2 x float> %hlsl.normalize -half2 test_normalize_half2(half2 p0) -{ - return normalize(p0); -} -// NATIVE_HALF: define [[FNATTRS]] <3 x half> @ -// NATIVE_HALF: call <3 x half> @llvm.[[TARGET]].normalize.v3f16(<3 x half> -// NO_HALF: call <3 x float> @llvm.[[TARGET]].normalize.v3f32(<3 x float> -// NATIVE_HALF: ret <3 x half> %hlsl.normalize -// NO_HALF: ret <3 x float> %hlsl.normalize -half3 test_normalize_half3(half3 p0) -{ - return normalize(p0); -} -// NATIVE_HALF: define [[FNATTRS]] <4 x half> @ -// NATIVE_HALF: call <4 x half> @llvm.[[TARGET]].normalize.v4f16(<4 x half> -// NO_HALF: call <4 x float> @llvm.[[TARGET]].normalize.v4f32(<4 x float> -// NATIVE_HALF: ret <4 x half> %hlsl.normalize -// NO_HALF: ret <4 x float> %hlsl.normalize -half4 test_normalize_half4(half4 p0) -{ - return normalize(p0); -} - -// CHECK: define [[FNATTRS]] float @ -// CHECK: call float @llvm.[[TARGET]].normalize.f32(float -// CHECK: ret float -float test_normalize_float(float p0) -{ - return normalize(p0); -} -// CHECK: define [[FNATTRS]] <2 x float> @ -// CHECK: %hlsl.normalize = call <2 x float> @llvm.[[TARGET]].normalize.v2f32(<2 x float> - -// CHECK: ret <2 x float> %hlsl.normalize -float2 test_normalize_float2(float2 p0) -{ - return normalize(p0); -} -// CHECK: define [[FNATTRS]] <3 x float> @ -// CHECK: %hlsl.normalize = call <3 x float> @llvm.[[TARGET]].normalize.v3f32( -// CHECK: ret <3 x float> %hlsl.normalize -float3 test_normalize_float3(float3 p0) -{ - return normalize(p0); -} -// CHECK: define [[FNATTRS]] <4 x float> @ -// CHECK: %hlsl.normalize = call <4 x float> @llvm.[[TARGET]].normalize.v4f32( -// CHECK: ret <4 x float> %hlsl.normalize -float4 
test_length_float4(float4 p0)
-{
-  return normalize(p0);
-}
+// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
+// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
+// RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \
+// RUN:   --check-prefixes=CHECK,NATIVE_HALF \
+// RUN:   -DFNATTRS=noundef -DTARGET=dx
+// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
+// RUN:   dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \
+// RUN:   -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF \
+// RUN:   -DFNATTRS=noundef -DTARGET=dx
+// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
+// RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type \
+// RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \
+// RUN:   --check-prefixes=CHECK,NATIVE_HALF \
+// RUN:   -DFNATTRS="spir_func noundef" -DTARGET=spv
+// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
+// RUN:   spirv-unknown-vulkan-compute %s -emit-llvm -disable-llvm-passes \
+// RUN:   -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF \
+// RUN:   -DFNATTRS="spir_func noundef" -DTARGET=spv
+
+// NATIVE_HALF: define [[FNATTRS]] half @
+// NATIVE_HALF: call half @llvm.[[TARGET]].normalize.f16(half
+// NO_HALF: call float @llvm.[[TARGET]].normalize.f32(float
+// NATIVE_HALF: ret half
+// NO_HALF: ret float
+half test_normalize_half(half p0)
+{
+  return normalize(p0);
+}
+// NATIVE_HALF: define [[FNATTRS]] <2 x half> @
+// NATIVE_HALF: call <2 x half> @llvm.[[TARGET]].normalize.v2f16(<2 x half>
+// NO_HALF: call <2 x float> @llvm.[[TARGET]].normalize.v2f32(<2 x float>
+// NATIVE_HALF: ret <2 x half> %hlsl.normalize
+// NO_HALF: ret <2 x float> %hlsl.normalize
+half2 test_normalize_half2(half2 p0)
+{
+  return normalize(p0);
+}
+// NATIVE_HALF: define [[FNATTRS]] <3 x half> @
+// NATIVE_HALF: call <3 x half> @llvm.[[TARGET]].normalize.v3f16(<3 x half>
+// NO_HALF: call <3 x float> @llvm.[[TARGET]].normalize.v3f32(<3 x float>
+// NATIVE_HALF: ret <3 x half> %hlsl.normalize
+// NO_HALF: ret <3 x float> %hlsl.normalize
+half3 test_normalize_half3(half3 p0)
+{
+  return normalize(p0);
+}
+// NATIVE_HALF: define [[FNATTRS]] <4 x half> @
+// NATIVE_HALF: call <4 x half> @llvm.[[TARGET]].normalize.v4f16(<4 x half>
+// NO_HALF: call <4 x float> @llvm.[[TARGET]].normalize.v4f32(<4 x float>
+// NATIVE_HALF: ret <4 x half> %hlsl.normalize
+// NO_HALF: ret <4 x float> %hlsl.normalize
+half4 test_normalize_half4(half4 p0)
+{
+  return normalize(p0);
+}
+
+// CHECK: define [[FNATTRS]] float @
+// CHECK: call float @llvm.[[TARGET]].normalize.f32(float
+// CHECK: ret float
+float test_normalize_float(float p0)
+{
+  return normalize(p0);
+}
+// CHECK: define [[FNATTRS]] <2 x float> @
+// CHECK: %hlsl.normalize = call <2 x float> @llvm.[[TARGET]].normalize.v2f32(<2 x float>
+
+// CHECK: ret <2 x float> %hlsl.normalize
+float2 test_normalize_float2(float2 p0)
+{
+  return normalize(p0);
+}
+// CHECK: define [[FNATTRS]] <3 x float> @
+// CHECK: %hlsl.normalize = call <3 x float> @llvm.[[TARGET]].normalize.v3f32(
+// CHECK: ret <3 x float> %hlsl.normalize
+float3 test_normalize_float3(float3 p0)
+{
+  return normalize(p0);
+}
+// CHECK: define [[FNATTRS]] <4 x float> @
+// CHECK: %hlsl.normalize = call <4 x float> @llvm.[[TARGET]].normalize.v4f32(
+// CHECK: ret <4 x float> %hlsl.normalize
+float4 test_normalize_float4(float4 p0)
+{
+  return normalize(p0);
+}
diff --git a/clang/test/CodeGenHLSL/builtins/sign.hlsl b/clang/test/CodeGenHLSL/builtins/sign.hlsl
index
0ed9a9468d86c00a07a647f5eee926a79fd86526..1cdefa815b103f7fbf319facbbb9e4dc8dd2bc86 100644 --- a/clang/test/CodeGenHLSL/builtins/sign.hlsl +++ b/clang/test/CodeGenHLSL/builtins/sign.hlsl @@ -202,19 +202,19 @@ int4 test_sign_int64_t4(int64_t4 p0) { return sign(p0); } // CHECK: define [[FNATTRS]] i32 @ // CHECK: [[CMP:%.*]] = icmp eq i64 [[ARG:%.*]], 0 // CHECK: %hlsl.sign = select i1 [[CMP]], i32 0, i32 1 -int test_sign_int64_t(uint64_t p0) { return sign(p0); } +int test_sign_uint64_t(uint64_t p0) { return sign(p0); } // CHECK: define [[FNATTRS]] <2 x i32> @ // CHECK: [[CMP:%.*]] = icmp eq <2 x i64> [[ARG:%.*]], zeroinitializer // CHECK: %hlsl.sign = select <2 x i1> [[CMP]], <2 x i32> zeroinitializer, <2 x i32> -int2 test_sign_int64_t2(uint64_t2 p0) { return sign(p0); } +int2 test_sign_uint64_t2(uint64_t2 p0) { return sign(p0); } // CHECK: define [[FNATTRS]] <3 x i32> @ // CHECK: [[CMP:%.*]] = icmp eq <3 x i64> [[ARG:%.*]], zeroinitializer // CHECK: %hlsl.sign = select <3 x i1> [[CMP]], <3 x i32> zeroinitializer, <3 x i32> -int3 test_sign_int64_t3(uint64_t3 p0) { return sign(p0); } +int3 test_sign_uint64_t3(uint64_t3 p0) { return sign(p0); } // CHECK: define [[FNATTRS]] <4 x i32> @ // CHECK: [[CMP:%.*]] = icmp eq <4 x i64> [[ARG:%.*]], zeroinitializer // CHECK: %hlsl.sign = select <4 x i1> [[CMP]], <4 x i32> zeroinitializer, <4 x i32> -int4 test_sign_int64_t4(uint64_t4 p0) { return sign(p0); } +int4 test_sign_uint64_t4(uint64_t4 p0) { return sign(p0); } diff --git a/clang/test/CodeGenHLSL/builtins/step.hlsl b/clang/test/CodeGenHLSL/builtins/step.hlsl index 442f4930ca579cedf64dc691b7643e772fecd20f..8ef52794a3be5daf44055a27d95d9cd09f7f7277 100644 --- a/clang/test/CodeGenHLSL/builtins/step.hlsl +++ b/clang/test/CodeGenHLSL/builtins/step.hlsl @@ -1,84 +1,84 @@ -// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ -// RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type \ -// RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ -// RUN: --check-prefixes=CHECK,NATIVE_HALF \ -// RUN: -DFNATTRS=noundef -DTARGET=dx -// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ -// RUN: dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \ -// RUN: -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF \ -// RUN: -DFNATTRS=noundef -DTARGET=dx -// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ -// RUN: spirv-unknown-vulkan-compute %s -fnative-half-type \ -// RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ -// RUN: --check-prefixes=CHECK,NATIVE_HALF \ -// RUN: -DFNATTRS="spir_func noundef" -DTARGET=spv -// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ -// RUN: spirv-unknown-vulkan-compute %s -emit-llvm -disable-llvm-passes \ -// RUN: -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF \ -// RUN: -DFNATTRS="spir_func noundef" -DTARGET=spv - -// NATIVE_HALF: define [[FNATTRS]] half @ -// NATIVE_HALF: call half @llvm.[[TARGET]].step.f16(half -// NO_HALF: call float @llvm.[[TARGET]].step.f32(float -// NATIVE_HALF: ret half -// NO_HALF: ret float -half test_step_half(half p0, half p1) -{ - return step(p0, p1); -} -// NATIVE_HALF: define [[FNATTRS]] <2 x half> @ -// NATIVE_HALF: call <2 x half> @llvm.[[TARGET]].step.v2f16(<2 x half> -// NO_HALF: call <2 x float> @llvm.[[TARGET]].step.v2f32(<2 x float> -// NATIVE_HALF: ret <2 x half> %hlsl.step -// NO_HALF: ret <2 x float> %hlsl.step -half2 test_step_half2(half2 p0, half2 p1) -{ - return step(p0, p1); -} -// NATIVE_HALF: define [[FNATTRS]] <3 x half> @ -// NATIVE_HALF: call <3 x half> 
@llvm.[[TARGET]].step.v3f16(<3 x half> -// NO_HALF: call <3 x float> @llvm.[[TARGET]].step.v3f32(<3 x float> -// NATIVE_HALF: ret <3 x half> %hlsl.step -// NO_HALF: ret <3 x float> %hlsl.step -half3 test_step_half3(half3 p0, half3 p1) -{ - return step(p0, p1); -} -// NATIVE_HALF: define [[FNATTRS]] <4 x half> @ -// NATIVE_HALF: call <4 x half> @llvm.[[TARGET]].step.v4f16(<4 x half> -// NO_HALF: call <4 x float> @llvm.[[TARGET]].step.v4f32(<4 x float> -// NATIVE_HALF: ret <4 x half> %hlsl.step -// NO_HALF: ret <4 x float> %hlsl.step -half4 test_step_half4(half4 p0, half4 p1) -{ - return step(p0, p1); -} - -// CHECK: define [[FNATTRS]] float @ -// CHECK: call float @llvm.[[TARGET]].step.f32(float -// CHECK: ret float -float test_step_float(float p0, float p1) -{ - return step(p0, p1); -} -// CHECK: define [[FNATTRS]] <2 x float> @ -// CHECK: %hlsl.step = call <2 x float> @llvm.[[TARGET]].step.v2f32( -// CHECK: ret <2 x float> %hlsl.step -float2 test_step_float2(float2 p0, float2 p1) -{ - return step(p0, p1); -} -// CHECK: define [[FNATTRS]] <3 x float> @ -// CHECK: %hlsl.step = call <3 x float> @llvm.[[TARGET]].step.v3f32( -// CHECK: ret <3 x float> %hlsl.step -float3 test_step_float3(float3 p0, float3 p1) -{ - return step(p0, p1); -} -// CHECK: define [[FNATTRS]] <4 x float> @ -// CHECK: %hlsl.step = call <4 x float> @llvm.[[TARGET]].step.v4f32( -// CHECK: ret <4 x float> %hlsl.step -float4 test_step_float4(float4 p0, float4 p1) -{ - return step(p0, p1); -} +// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ +// RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type \ +// RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ +// RUN: --check-prefixes=CHECK,NATIVE_HALF \ +// RUN: -DFNATTRS=noundef -DTARGET=dx +// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ +// RUN: dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \ +// RUN: -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF \ +// RUN: -DFNATTRS=noundef -DTARGET=dx +// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ +// RUN: spirv-unknown-vulkan-compute %s -fnative-half-type \ +// RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ +// RUN: --check-prefixes=CHECK,NATIVE_HALF \ +// RUN: -DFNATTRS="spir_func noundef" -DTARGET=spv +// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ +// RUN: spirv-unknown-vulkan-compute %s -emit-llvm -disable-llvm-passes \ +// RUN: -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF \ +// RUN: -DFNATTRS="spir_func noundef" -DTARGET=spv + +// NATIVE_HALF: define [[FNATTRS]] half @ +// NATIVE_HALF: call half @llvm.[[TARGET]].step.f16(half +// NO_HALF: call float @llvm.[[TARGET]].step.f32(float +// NATIVE_HALF: ret half +// NO_HALF: ret float +half test_step_half(half p0, half p1) +{ + return step(p0, p1); +} +// NATIVE_HALF: define [[FNATTRS]] <2 x half> @ +// NATIVE_HALF: call <2 x half> @llvm.[[TARGET]].step.v2f16(<2 x half> +// NO_HALF: call <2 x float> @llvm.[[TARGET]].step.v2f32(<2 x float> +// NATIVE_HALF: ret <2 x half> %hlsl.step +// NO_HALF: ret <2 x float> %hlsl.step +half2 test_step_half2(half2 p0, half2 p1) +{ + return step(p0, p1); +} +// NATIVE_HALF: define [[FNATTRS]] <3 x half> @ +// NATIVE_HALF: call <3 x half> @llvm.[[TARGET]].step.v3f16(<3 x half> +// NO_HALF: call <3 x float> @llvm.[[TARGET]].step.v3f32(<3 x float> +// NATIVE_HALF: ret <3 x half> %hlsl.step +// NO_HALF: ret <3 x float> %hlsl.step +half3 test_step_half3(half3 p0, half3 p1) +{ + return step(p0, p1); +} +// NATIVE_HALF: define [[FNATTRS]] <4 x half> @ 
+// NATIVE_HALF: call <4 x half> @llvm.[[TARGET]].step.v4f16(<4 x half> +// NO_HALF: call <4 x float> @llvm.[[TARGET]].step.v4f32(<4 x float> +// NATIVE_HALF: ret <4 x half> %hlsl.step +// NO_HALF: ret <4 x float> %hlsl.step +half4 test_step_half4(half4 p0, half4 p1) +{ + return step(p0, p1); +} + +// CHECK: define [[FNATTRS]] float @ +// CHECK: call float @llvm.[[TARGET]].step.f32(float +// CHECK: ret float +float test_step_float(float p0, float p1) +{ + return step(p0, p1); +} +// CHECK: define [[FNATTRS]] <2 x float> @ +// CHECK: %hlsl.step = call <2 x float> @llvm.[[TARGET]].step.v2f32( +// CHECK: ret <2 x float> %hlsl.step +float2 test_step_float2(float2 p0, float2 p1) +{ + return step(p0, p1); +} +// CHECK: define [[FNATTRS]] <3 x float> @ +// CHECK: %hlsl.step = call <3 x float> @llvm.[[TARGET]].step.v3f32( +// CHECK: ret <3 x float> %hlsl.step +float3 test_step_float3(float3 p0, float3 p1) +{ + return step(p0, p1); +} +// CHECK: define [[FNATTRS]] <4 x float> @ +// CHECK: %hlsl.step = call <4 x float> @llvm.[[TARGET]].step.v4f32( +// CHECK: ret <4 x float> %hlsl.step +float4 test_step_float4(float4 p0, float4 p1) +{ + return step(p0, p1); +} diff --git a/clang/test/CodeGenHLSL/builtins/wave_get_lane_index_do_while.hlsl b/clang/test/CodeGenHLSL/builtins/wave_get_lane_index_do_while.hlsl index 6b053dc6add1f2643de3c440a31166a9b8f14e93..3ab8048146ad3066d2d6a65fb0794f8084532f00 100644 --- a/clang/test/CodeGenHLSL/builtins/wave_get_lane_index_do_while.hlsl +++ b/clang/test/CodeGenHLSL/builtins/wave_get_lane_index_do_while.hlsl @@ -17,7 +17,7 @@ void main() { // CHECK: br i1 {{%.+}}, label %[[LABEL_IF_THEN:.+]], label %[[LABEL_IF_END:.+]] // CHECK: [[LABEL_IF_THEN]]: -// CHECK: call i32 @__hlsl_wave_get_lane_index() [ "convergencectrl"(token %[[CT_LOOP]]) ] +// CHECK: call spir_func i32 @__hlsl_wave_get_lane_index() [ "convergencectrl"(token %[[CT_LOOP]]) ] // CHECK: br label %[[LABEL_WHILE_END:.+]] if (cond == 2) { uint index = WaveGetLaneIndex(); @@ -33,7 +33,7 @@ void main() { // CHECK: ret void } -// CHECK-DAG: declare i32 @__hlsl_wave_get_lane_index() [[A1:#[0-9]+]] +// CHECK-DAG: declare spir_func i32 @__hlsl_wave_get_lane_index() [[A1:#[0-9]+]] // CHECK-DAG: attributes [[A0]] = {{{.*}}convergent{{.*}}} // CHECK-DAG: attributes [[A1]] = {{{.*}}convergent{{.*}}} diff --git a/clang/test/CodeGenHLSL/builtins/wave_get_lane_index_simple.hlsl b/clang/test/CodeGenHLSL/builtins/wave_get_lane_index_simple.hlsl index 06a2715b00e96905e8ce6c02e478cb90aff41a28..8e1f2d69e74329d016b651fc8e4f724e5dc1c2d4 100644 --- a/clang/test/CodeGenHLSL/builtins/wave_get_lane_index_simple.hlsl +++ b/clang/test/CodeGenHLSL/builtins/wave_get_lane_index_simple.hlsl @@ -9,13 +9,13 @@ // CHECK-SPIRV: define spir_func noundef i32 @{{.*test_1.*}}() [[A0:#[0-9]+]] { // CHECK-DXIL: define noundef i32 @{{.*test_1.*}}() [[A0:#[0-9]+]] { // CHECK-SPIRV: %[[CI:[0-9]+]] = call token @llvm.experimental.convergence.entry() -// CHECK-SPIRV: call i32 @__hlsl_wave_get_lane_index() [ "convergencectrl"(token %[[CI]]) ] +// CHECK-SPIRV: call spir_func i32 @__hlsl_wave_get_lane_index() [ "convergencectrl"(token %[[CI]]) ] // CHECK-DXIL: call i32 @llvm.dx.wave.getlaneindex() int test_1() { return WaveGetLaneIndex(); } -// CHECK-SPIRV: declare i32 @__hlsl_wave_get_lane_index() [[A1:#[0-9]+]] +// CHECK-SPIRV: declare spir_func i32 @__hlsl_wave_get_lane_index() [[A1:#[0-9]+]] // CHECK-DXIL: declare i32 @llvm.dx.wave.getlaneindex() [[A1:#[0-9]+]] // CHECK-DAG: attributes [[A0]] = { {{.*}}convergent{{.*}} } diff --git 
a/clang/test/CodeGenHLSL/builtins/wave_get_lane_index_subcall.hlsl b/clang/test/CodeGenHLSL/builtins/wave_get_lane_index_subcall.hlsl index 6ea80d692cd2444608267aafac7413f80f7fcd56..12b120d0c067d0dc7e68e1fa4c74fa44a08f805e 100644 --- a/clang/test/CodeGenHLSL/builtins/wave_get_lane_index_subcall.hlsl +++ b/clang/test/CodeGenHLSL/builtins/wave_get_lane_index_subcall.hlsl @@ -3,12 +3,12 @@ // CHECK: define spir_func noundef i32 @_Z6test_1v() [[A0:#[0-9]+]] { // CHECK: %[[C1:[0-9]+]] = call token @llvm.experimental.convergence.entry() -// CHECK: call i32 @__hlsl_wave_get_lane_index() [ "convergencectrl"(token %[[C1]]) ] +// CHECK: call spir_func i32 @__hlsl_wave_get_lane_index() [ "convergencectrl"(token %[[C1]]) ] uint test_1() { return WaveGetLaneIndex(); } -// CHECK-DAG: declare i32 @__hlsl_wave_get_lane_index() [[A1:#[0-9]+]] +// CHECK-DAG: declare spir_func i32 @__hlsl_wave_get_lane_index() [[A1:#[0-9]+]] // CHECK: define spir_func noundef i32 @_Z6test_2v() [[A0]] { // CHECK: %[[C2:[0-9]+]] = call token @llvm.experimental.convergence.entry() diff --git a/clang/test/CodeGenHLSL/builtins/wave_is_first_lane.hlsl b/clang/test/CodeGenHLSL/builtins/wave_is_first_lane.hlsl index 18860c321eb912b376099bdba56bf45100403fea..2fb6defb896f902f8e3ca2745070d1287697ea82 100644 --- a/clang/test/CodeGenHLSL/builtins/wave_is_first_lane.hlsl +++ b/clang/test/CodeGenHLSL/builtins/wave_is_first_lane.hlsl @@ -13,7 +13,7 @@ void main() { while (true) { // CHECK-DXIL: %[[#]] = call i1 @llvm.dx.wave.is.first.lane() -// CHECK-SPIRV: %[[#]] = call i1 @llvm.spv.wave.is.first.lane() +// CHECK-SPIRV: %[[#]] = call spir_func i1 @llvm.spv.wave.is.first.lane() // CHECK-SPIRV-SAME: [ "convergencectrl"(token %[[#loop_tok]]) ] if (WaveIsFirstLane()) { break; @@ -21,7 +21,7 @@ void main() { } // CHECK-DXIL: %[[#]] = call i1 @llvm.dx.wave.is.first.lane() -// CHECK-SPIRV: %[[#]] = call i1 @llvm.spv.wave.is.first.lane() +// CHECK-SPIRV: %[[#]] = call spir_func i1 @llvm.spv.wave.is.first.lane() // CHECK-SPIRV-SAME: [ "convergencectrl"(token %[[#entry_tok]]) ] if (WaveIsFirstLane()) { return; diff --git a/clang/test/CodeGenObjC/aarch64-sve-types.m b/clang/test/CodeGenObjC/aarch64-sve-types.m index eae734fa4d593111b9a3bb06ec1e1c401b6afc3a..a97ce4b5bd39f44eef6acb3659810b35c7c14b1f 100644 --- a/clang/test/CodeGenObjC/aarch64-sve-types.m +++ b/clang/test/CodeGenObjC/aarch64-sve-types.m @@ -31,5 +31,8 @@ const char f64[] = @encode(__SVFloat64_t); // CHECK: error: cannot yet @encode type __SVBfloat16_t const char bf16[] = @encode(__SVBfloat16_t); +// CHECK: error: cannot yet @encode type __SVMfloat8_t +const char mf8[] = @encode(__SVMfloat8_t); + // CHECK: error: cannot yet @encode type __SVBool_t const char b8[] = @encode(__SVBool_t); diff --git a/clang/test/CodeGenObjCXX/rtti.mm b/clang/test/CodeGenObjCXX/rtti.mm index ee3df349af18d68472387b0fda4f0f33522204ed..2fc6f8722f43984c32e0670bbb4892baaab03ad1 100644 --- a/clang/test/CodeGenObjCXX/rtti.mm +++ b/clang/test/CodeGenObjCXX/rtti.mm @@ -4,19 +4,20 @@ namespace std { class type_info; } -// CHECK: @_ZTI1A = linkonce_odr constant {{.*}}@_ZTVN10__cxxabiv117__class_type_infoE{{.*}}@_ZTS1A @interface A @end -// CHECK: @_ZTI1B = linkonce_odr constant {{.*}}@_ZTVN10__cxxabiv120__si_class_type_infoE{{.*}}@_ZTS1B{{.*}}@_ZTI1A @interface B : A @end // CHECK: @_ZTIP1B = linkonce_odr constant {{.*}}@_ZTVN10__cxxabiv119__pointer_type_infoE{{.*}}@_ZTSP1B{{.*}}, i32 0, {{.*}}@_ZTI1B -// CHECK: @_ZTI11objc_object = linkonce_odr constant 
{{.*}}@_ZTVN10__cxxabiv117__class_type_infoE{{.*}}@_ZTS11objc_object +// CHECK: @_ZTI1B = linkonce_odr constant {{.*}}@_ZTVN10__cxxabiv120__si_class_type_infoE{{.*}}@_ZTS1B{{.*}}@_ZTI1A +// CHECK: @_ZTI1A = linkonce_odr constant {{.*}}@_ZTVN10__cxxabiv117__class_type_infoE{{.*}}@_ZTS1A + // CHECK: @_ZTIP11objc_object = linkonce_odr constant {{.*}}@_ZTVN10__cxxabiv119__pointer_type_infoE{{.*}}@_ZTSP11objc_object{{.*}}@_ZTI11objc_object -// CHECK: @_ZTI10objc_class = linkonce_odr constant {{.*}}@_ZTVN10__cxxabiv117__class_type_infoE{{.*}}@_ZTS10objc_class +// CHECK: @_ZTI11objc_object = linkonce_odr constant {{.*}}@_ZTVN10__cxxabiv117__class_type_infoE{{.*}}@_ZTS11objc_object // CHECK: @_ZTIP10objc_class = linkonce_odr constant {{.*}}@_ZTVN10__cxxabiv119__pointer_type_infoE{{.*}}@_ZTSP10objc_class{{.*}}@_ZTI10objc_class +// CHECK: @_ZTI10objc_class = linkonce_odr constant {{.*}}@_ZTVN10__cxxabiv117__class_type_infoE{{.*}}@_ZTS10objc_class @protocol P; diff --git a/clang/test/CodeGenOpenCL/addr-space-struct-arg.cl b/clang/test/CodeGenOpenCL/addr-space-struct-arg.cl index bab0e21067eeaef5865287dfb91984cacbe77f0d..7377b5bcbc347a817fce41741c2db40ea198628d 100644 --- a/clang/test/CodeGenOpenCL/addr-space-struct-arg.cl +++ b/clang/test/CodeGenOpenCL/addr-space-struct-arg.cl @@ -1,9 +1,10 @@ -// RUN: %clang_cc1 %s -emit-llvm -o - -O0 -ffake-address-space-map -triple i686-pc-darwin | FileCheck -enable-var-scope -check-prefixes=ALL,X86 %s -// RUN: %clang_cc1 %s -emit-llvm -o - -O0 -triple amdgcn | FileCheck -enable-var-scope -check-prefixes=ALL,AMDGCN %s -// RUN: %clang_cc1 %s -emit-llvm -o - -cl-std=CL2.0 -O0 -triple amdgcn | FileCheck -enable-var-scope -check-prefixes=ALL,AMDGCN,AMDGCN20 %s -// RUN: %clang_cc1 %s -emit-llvm -o - -cl-std=CL1.2 -O0 -triple spir-unknown-unknown-unknown | FileCheck -enable-var-scope -check-prefixes=SPIR %s -// RUN: %clang_cc1 %s -emit-llvm -o - -cl-std=CL3.0 -O0 -triple amdgcn -cl-ext=+__opencl_c_program_scope_global_variables | FileCheck -enable-var-scope -check-prefixes=ALL,AMDGCN,AMDGCN20 %s -// RUN: %clang_cc1 %s -emit-llvm -o - -cl-std=CL3.0 -O0 -triple amdgcn | FileCheck -enable-var-scope -check-prefixes=ALL,AMDGCN %s +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// RUN: %clang_cc1 %s -emit-llvm -o - -O0 -ffake-address-space-map -triple i686-pc-darwin | FileCheck -check-prefixes=X86 %s +// RUN: %clang_cc1 %s -emit-llvm -o - -O0 -triple amdgcn | FileCheck -check-prefixes=AMDGCN %s +// RUN: %clang_cc1 %s -emit-llvm -o - -cl-std=CL2.0 -O0 -triple amdgcn | FileCheck -check-prefixes=AMDGCN20 %s +// RUN: %clang_cc1 %s -emit-llvm -o - -cl-std=CL1.2 -O0 -triple spir-unknown-unknown-unknown | FileCheck -check-prefixes=SPIR %s +// RUN: %clang_cc1 %s -emit-llvm -o - -cl-std=CL3.0 -O0 -triple amdgcn -cl-ext=+__opencl_c_program_scope_global_variables | FileCheck -check-prefixes=AMDGCN30-GVAR %s +// RUN: %clang_cc1 %s -emit-llvm -o - -cl-std=CL3.0 -O0 -triple amdgcn | FileCheck -check-prefixes=AMDGCN30 %s typedef int int2 __attribute__((ext_vector_type(2))); @@ -45,147 +46,1236 @@ struct LargeStructTwoMember { struct LargeStructOneMember g_s; #endif -// X86-LABEL: define{{.*}} void @foo(ptr dead_on_unwind noalias writable sret(%struct.Mat4X4) align 4 %agg.result, ptr noundef byval(%struct.Mat3X3) align 4 %in) -// AMDGCN-LABEL: define{{.*}} %struct.Mat4X4 @foo([9 x i32] %in.coerce) +// +// X86-LABEL: define void @foo( +// X86-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_MAT4X4:%.*]]) align 4 [[AGG_RESULT:%.*]], 
ptr noundef byval([[STRUCT_MAT3X3:%.*]]) align 4 [[IN:%.*]]) #[[ATTR0:[0-9]+]] { +// X86-NEXT: [[ENTRY:.*:]] +// X86-NEXT: [[RESULT_PTR:%.*]] = alloca ptr, align 4 +// X86-NEXT: store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4 +// X86-NEXT: ret void +// +// AMDGCN-LABEL: define dso_local %struct.Mat4X4 @foo( +// AMDGCN-SAME: [9 x i32] [[IN_COERCE:%.*]]) #[[ATTR0:[0-9]+]] { +// AMDGCN-NEXT: [[ENTRY:.*:]] +// AMDGCN-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_MAT4X4:%.*]], align 4, addrspace(5) +// AMDGCN-NEXT: [[IN:%.*]] = alloca [[STRUCT_MAT3X3:%.*]], align 4, addrspace(5) +// AMDGCN-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_MAT3X3]], ptr addrspace(5) [[IN]], i32 0, i32 0 +// AMDGCN-NEXT: store [9 x i32] [[IN_COERCE]], ptr addrspace(5) [[COERCE_DIVE]], align 4 +// AMDGCN-NEXT: [[TMP0:%.*]] = load [[STRUCT_MAT4X4]], ptr addrspace(5) [[RETVAL]], align 4 +// AMDGCN-NEXT: ret [[STRUCT_MAT4X4]] [[TMP0]] +// +// AMDGCN20-LABEL: define dso_local %struct.Mat4X4 @foo( +// AMDGCN20-SAME: [9 x i32] [[IN_COERCE:%.*]]) #[[ATTR0:[0-9]+]] { +// AMDGCN20-NEXT: [[ENTRY:.*:]] +// AMDGCN20-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_MAT4X4:%.*]], align 4, addrspace(5) +// AMDGCN20-NEXT: [[IN:%.*]] = alloca [[STRUCT_MAT3X3:%.*]], align 4, addrspace(5) +// AMDGCN20-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_MAT3X3]], ptr addrspace(5) [[IN]], i32 0, i32 0 +// AMDGCN20-NEXT: store [9 x i32] [[IN_COERCE]], ptr addrspace(5) [[COERCE_DIVE]], align 4 +// AMDGCN20-NEXT: [[TMP0:%.*]] = load [[STRUCT_MAT4X4]], ptr addrspace(5) [[RETVAL]], align 4 +// AMDGCN20-NEXT: ret [[STRUCT_MAT4X4]] [[TMP0]] +// +// SPIR-LABEL: define dso_local spir_func void @foo( +// SPIR-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_MAT4X4:%.*]]) align 4 [[AGG_RESULT:%.*]], ptr noundef byval([[STRUCT_MAT3X3:%.*]]) align 4 [[IN:%.*]]) #[[ATTR0:[0-9]+]] { +// SPIR-NEXT: [[ENTRY:.*:]] +// SPIR-NEXT: ret void +// +// AMDGCN30-GVAR-LABEL: define dso_local %struct.Mat4X4 @foo( +// AMDGCN30-GVAR-SAME: [9 x i32] [[IN_COERCE:%.*]]) #[[ATTR0:[0-9]+]] { +// AMDGCN30-GVAR-NEXT: [[ENTRY:.*:]] +// AMDGCN30-GVAR-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_MAT4X4:%.*]], align 4, addrspace(5) +// AMDGCN30-GVAR-NEXT: [[IN:%.*]] = alloca [[STRUCT_MAT3X3:%.*]], align 4, addrspace(5) +// AMDGCN30-GVAR-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_MAT3X3]], ptr addrspace(5) [[IN]], i32 0, i32 0 +// AMDGCN30-GVAR-NEXT: store [9 x i32] [[IN_COERCE]], ptr addrspace(5) [[COERCE_DIVE]], align 4 +// AMDGCN30-GVAR-NEXT: [[TMP0:%.*]] = load [[STRUCT_MAT4X4]], ptr addrspace(5) [[RETVAL]], align 4 +// AMDGCN30-GVAR-NEXT: ret [[STRUCT_MAT4X4]] [[TMP0]] +// +// AMDGCN30-LABEL: define dso_local %struct.Mat4X4 @foo( +// AMDGCN30-SAME: [9 x i32] [[IN_COERCE:%.*]]) #[[ATTR0:[0-9]+]] { +// AMDGCN30-NEXT: [[ENTRY:.*:]] +// AMDGCN30-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_MAT4X4:%.*]], align 4, addrspace(5) +// AMDGCN30-NEXT: [[IN:%.*]] = alloca [[STRUCT_MAT3X3:%.*]], align 4, addrspace(5) +// AMDGCN30-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_MAT3X3]], ptr addrspace(5) [[IN]], i32 0, i32 0 +// AMDGCN30-NEXT: store [9 x i32] [[IN_COERCE]], ptr addrspace(5) [[COERCE_DIVE]], align 4 +// AMDGCN30-NEXT: [[TMP0:%.*]] = load [[STRUCT_MAT4X4]], ptr addrspace(5) [[RETVAL]], align 4 +// AMDGCN30-NEXT: ret [[STRUCT_MAT4X4]] [[TMP0]] +// Mat4X4 __attribute__((noinline)) foo(Mat3X3 in) { Mat4X4 out; return out; } -// ALL-LABEL: define {{.*}} void @ker -// Expect two mem copies: one for the argument "in", and one for 
-// ALL-LABEL: define {{.*}} void @ker
-// Expect two mem copies: one for the argument "in", and one for
-// the return value.
-// X86: call void @llvm.memcpy.p0.p1.i32(ptr
-// X86: call void @llvm.memcpy.p1.p0.i32(ptr addrspace(1)
-
-// AMDGCN: load [9 x i32], ptr addrspace(1)
-// AMDGCN: call %struct.Mat4X4 @foo([9 x i32]
-// AMDGCN: call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1)
+//
+// X86-LABEL: define spir_kernel void @ker(
+// X86-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR1:[0-9]+]] !kernel_arg_addr_space [[META4:![0-9]+]] !kernel_arg_access_qual [[META5:![0-9]+]] !kernel_arg_type [[META6:![0-9]+]] !kernel_arg_base_type [[META6]] !kernel_arg_type_qual [[META7:![0-9]+]] {
+// X86-NEXT: [[ENTRY:.*:]]
+// X86-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 4
+// X86-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 4
+// X86-NEXT: [[TMP:%.*]] = alloca [[STRUCT_MAT4X4:%.*]], align 4
+// X86-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_MAT3X3:%.*]], align 4
+// X86-NEXT: store ptr addrspace(1) [[IN]], ptr [[IN_ADDR]], align 4
+// X86-NEXT: store ptr addrspace(1) [[OUT]], ptr [[OUT_ADDR]], align 4
+// X86-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR]], align 4
+// X86-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_MAT4X4]], ptr addrspace(1) [[TMP0]], i32 0
+// X86-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[IN_ADDR]], align 4
+// X86-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_MAT3X3]], ptr addrspace(1) [[TMP1]], i32 1
+// X86-NEXT: call void @llvm.memcpy.p0.p1.i32(ptr align 4 [[BYVAL_TEMP]], ptr addrspace(1) align 4 [[ARRAYIDX1]], i32 36, i1 false)
+// X86-NEXT: call void @foo(ptr dead_on_unwind writable sret([[STRUCT_MAT4X4]]) align 4 [[TMP]], ptr noundef byval([[STRUCT_MAT3X3]]) align 4 [[BYVAL_TEMP]]) #[[ATTR3:[0-9]+]]
+// X86-NEXT: call void @llvm.memcpy.p1.p0.i32(ptr addrspace(1) align 4 [[ARRAYIDX]], ptr align 4 [[TMP]], i32 64, i1 false)
+// X86-NEXT: ret void
+//
+// AMDGCN-LABEL: define dso_local amdgpu_kernel void @ker(
+// AMDGCN-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR1:[0-9]+]] !kernel_arg_addr_space [[META4:![0-9]+]] !kernel_arg_access_qual [[META5:![0-9]+]] !kernel_arg_type [[META6:![0-9]+]] !kernel_arg_base_type [[META6]] !kernel_arg_type_qual [[META7:![0-9]+]] {
+// AMDGCN-NEXT: [[ENTRY:.*:]]
+// AMDGCN-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// AMDGCN-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// AMDGCN-NEXT: [[TMP:%.*]] = alloca [[STRUCT_MAT4X4:%.*]], align 4, addrspace(5)
+// AMDGCN-NEXT: store ptr addrspace(1) [[IN]], ptr addrspace(5) [[IN_ADDR]], align 8
+// AMDGCN-NEXT: store ptr addrspace(1) [[OUT]], ptr addrspace(5) [[OUT_ADDR]], align 8
+// AMDGCN-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
+// AMDGCN-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_MAT4X4]], ptr addrspace(1) [[TMP0]], i64 0
+// AMDGCN-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[IN_ADDR]], align 8
+// AMDGCN-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_MAT3X3:%.*]], ptr addrspace(1) [[TMP1]], i64 1
+// AMDGCN-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_MAT3X3]], ptr addrspace(1) [[ARRAYIDX1]], i32 0, i32 0
+// AMDGCN-NEXT: [[TMP3:%.*]] = load [9 x i32], ptr addrspace(1) [[TMP2]], align 4
+// AMDGCN-NEXT: [[CALL:%.*]] = call [[STRUCT_MAT4X4]] @[[FOO:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]]([9 x i32] [[TMP3]]) #[[ATTR3:[0-9]+]]
+// AMDGCN-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_MAT4X4]], ptr addrspace(5) [[TMP]], i32 0, i32 0
+// AMDGCN-NEXT: [[TMP5:%.*]] = extractvalue [[STRUCT_MAT4X4]] [[CALL]], 0
+// AMDGCN-NEXT: store [16 x i32] [[TMP5]], ptr addrspace(5) [[TMP4]], align 4
+// AMDGCN-NEXT: call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) align 4 [[ARRAYIDX]], ptr addrspace(5) align 4 [[TMP]], i64 64, i1 false)
+// AMDGCN-NEXT: ret void
+//
+// AMDGCN20-LABEL: define dso_local amdgpu_kernel void @ker(
+// AMDGCN20-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR1:[0-9]+]] !kernel_arg_addr_space [[META4:![0-9]+]] !kernel_arg_access_qual [[META5:![0-9]+]] !kernel_arg_type [[META6:![0-9]+]] !kernel_arg_base_type [[META6]] !kernel_arg_type_qual [[META7:![0-9]+]] {
+// AMDGCN20-NEXT: [[ENTRY:.*:]]
+// AMDGCN20-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// AMDGCN20-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// AMDGCN20-NEXT: [[TMP:%.*]] = alloca [[STRUCT_MAT4X4:%.*]], align 4, addrspace(5)
+// AMDGCN20-NEXT: store ptr addrspace(1) [[IN]], ptr addrspace(5) [[IN_ADDR]], align 8
+// AMDGCN20-NEXT: store ptr addrspace(1) [[OUT]], ptr addrspace(5) [[OUT_ADDR]], align 8
+// AMDGCN20-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
+// AMDGCN20-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_MAT4X4]], ptr addrspace(1) [[TMP0]], i64 0
+// AMDGCN20-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[IN_ADDR]], align 8
+// AMDGCN20-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_MAT3X3:%.*]], ptr addrspace(1) [[TMP1]], i64 1
+// AMDGCN20-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_MAT3X3]], ptr addrspace(1) [[ARRAYIDX1]], i32 0, i32 0
+// AMDGCN20-NEXT: [[TMP3:%.*]] = load [9 x i32], ptr addrspace(1) [[TMP2]], align 4
+// AMDGCN20-NEXT: [[CALL:%.*]] = call [[STRUCT_MAT4X4]] @[[FOO:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]]([9 x i32] [[TMP3]]) #[[ATTR3:[0-9]+]]
+// AMDGCN20-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_MAT4X4]], ptr addrspace(5) [[TMP]], i32 0, i32 0
+// AMDGCN20-NEXT: [[TMP5:%.*]] = extractvalue [[STRUCT_MAT4X4]] [[CALL]], 0
+// AMDGCN20-NEXT: store [16 x i32] [[TMP5]], ptr addrspace(5) [[TMP4]], align 4
+// AMDGCN20-NEXT: call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) align 4 [[ARRAYIDX]], ptr addrspace(5) align 4 [[TMP]], i64 64, i1 false)
+// AMDGCN20-NEXT: ret void
+//
+// SPIR-LABEL: define dso_local spir_kernel void @ker(
+// SPIR-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR1:[0-9]+]] !kernel_arg_addr_space [[META3:![0-9]+]] !kernel_arg_access_qual [[META4:![0-9]+]] !kernel_arg_type [[META5:![0-9]+]] !kernel_arg_base_type [[META5]] !kernel_arg_type_qual [[META6:![0-9]+]] {
+// SPIR-NEXT: [[ENTRY:.*:]]
+// SPIR-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 4
+// SPIR-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 4
+// SPIR-NEXT: [[TMP:%.*]] = alloca [[STRUCT_MAT4X4:%.*]], align 4
+// SPIR-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_MAT3X3:%.*]], align 4
+// SPIR-NEXT: store ptr addrspace(1) [[IN]], ptr [[IN_ADDR]], align 4
+// SPIR-NEXT: store ptr addrspace(1) [[OUT]], ptr [[OUT_ADDR]], align 4
+// SPIR-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR]], align 4
+// SPIR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_MAT4X4]], ptr addrspace(1) [[TMP0]], i32 0
+// SPIR-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[IN_ADDR]], align 4
+// SPIR-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_MAT3X3]], ptr addrspace(1) [[TMP1]], i32 1
+// SPIR-NEXT: call void @llvm.memcpy.p0.p1.i32(ptr align 4 [[BYVAL_TEMP]], ptr addrspace(1) align 4 [[ARRAYIDX1]], i32 36, i1 false)
+// SPIR-NEXT: call spir_func void @foo(ptr dead_on_unwind writable sret([[STRUCT_MAT4X4]]) align 4 [[TMP]], ptr noundef byval([[STRUCT_MAT3X3]]) align 4 [[BYVAL_TEMP]]) #[[ATTR3:[0-9]+]]
+// SPIR-NEXT: call void @llvm.memcpy.p1.p0.i32(ptr addrspace(1) align 4 [[ARRAYIDX]], ptr align 4 [[TMP]], i32 64, i1 false)
+// SPIR-NEXT: ret void
+//
+// AMDGCN30-GVAR-LABEL: define dso_local amdgpu_kernel void @ker(
+// AMDGCN30-GVAR-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR1:[0-9]+]] !kernel_arg_addr_space [[META4:![0-9]+]] !kernel_arg_access_qual [[META5:![0-9]+]] !kernel_arg_type [[META6:![0-9]+]] !kernel_arg_base_type [[META6]] !kernel_arg_type_qual [[META7:![0-9]+]] {
+// AMDGCN30-GVAR-NEXT: [[ENTRY:.*:]]
+// AMDGCN30-GVAR-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// AMDGCN30-GVAR-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// AMDGCN30-GVAR-NEXT: [[TMP:%.*]] = alloca [[STRUCT_MAT4X4:%.*]], align 4, addrspace(5)
+// AMDGCN30-GVAR-NEXT: store ptr addrspace(1) [[IN]], ptr addrspace(5) [[IN_ADDR]], align 8
+// AMDGCN30-GVAR-NEXT: store ptr addrspace(1) [[OUT]], ptr addrspace(5) [[OUT_ADDR]], align 8
+// AMDGCN30-GVAR-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
+// AMDGCN30-GVAR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_MAT4X4]], ptr addrspace(1) [[TMP0]], i64 0
+// AMDGCN30-GVAR-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[IN_ADDR]], align 8
+// AMDGCN30-GVAR-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_MAT3X3:%.*]], ptr addrspace(1) [[TMP1]], i64 1
+// AMDGCN30-GVAR-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_MAT3X3]], ptr addrspace(1) [[ARRAYIDX1]], i32 0, i32 0
+// AMDGCN30-GVAR-NEXT: [[TMP3:%.*]] = load [9 x i32], ptr addrspace(1) [[TMP2]], align 4
+// AMDGCN30-GVAR-NEXT: [[CALL:%.*]] = call [[STRUCT_MAT4X4]] @[[FOO:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]]([9 x i32] [[TMP3]]) #[[ATTR3:[0-9]+]]
+// AMDGCN30-GVAR-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_MAT4X4]], ptr addrspace(5) [[TMP]], i32 0, i32 0
+// AMDGCN30-GVAR-NEXT: [[TMP5:%.*]] = extractvalue [[STRUCT_MAT4X4]] [[CALL]], 0
+// AMDGCN30-GVAR-NEXT: store [16 x i32] [[TMP5]], ptr addrspace(5) [[TMP4]], align 4
+// AMDGCN30-GVAR-NEXT: call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) align 4 [[ARRAYIDX]], ptr addrspace(5) align 4 [[TMP]], i64 64, i1 false)
+// AMDGCN30-GVAR-NEXT: ret void
+//
+// AMDGCN30-LABEL: define dso_local amdgpu_kernel void @ker(
+// AMDGCN30-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR1:[0-9]+]] !kernel_arg_addr_space [[META4:![0-9]+]] !kernel_arg_access_qual [[META5:![0-9]+]] !kernel_arg_type [[META6:![0-9]+]] !kernel_arg_base_type [[META6]] !kernel_arg_type_qual [[META7:![0-9]+]] {
+// AMDGCN30-NEXT: [[ENTRY:.*:]]
+// AMDGCN30-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// AMDGCN30-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// AMDGCN30-NEXT: [[TMP:%.*]] = alloca [[STRUCT_MAT4X4:%.*]], align 4, addrspace(5)
+// AMDGCN30-NEXT: store ptr addrspace(1) [[IN]], ptr addrspace(5) [[IN_ADDR]], align 8
+// AMDGCN30-NEXT: store ptr addrspace(1) [[OUT]], ptr addrspace(5) [[OUT_ADDR]], align 8
+// AMDGCN30-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
+// AMDGCN30-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_MAT4X4]], ptr addrspace(1) [[TMP0]], i64 0
+// AMDGCN30-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[IN_ADDR]], align 8
+// AMDGCN30-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_MAT3X3:%.*]], ptr addrspace(1) [[TMP1]], i64 1
+// AMDGCN30-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_MAT3X3]], ptr addrspace(1) [[ARRAYIDX1]], i32 0, i32 0
+// AMDGCN30-NEXT: [[TMP3:%.*]] = load [9 x i32], ptr addrspace(1) [[TMP2]], align 4
+// AMDGCN30-NEXT: [[CALL:%.*]] = call [[STRUCT_MAT4X4]] @[[FOO:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]]([9 x i32] [[TMP3]]) #[[ATTR3:[0-9]+]]
+// AMDGCN30-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_MAT4X4]], ptr addrspace(5) [[TMP]], i32 0, i32 0
+// AMDGCN30-NEXT: [[TMP5:%.*]] = extractvalue [[STRUCT_MAT4X4]] [[CALL]], 0
+// AMDGCN30-NEXT: store [16 x i32] [[TMP5]], ptr addrspace(5) [[TMP4]], align 4
+// AMDGCN30-NEXT: call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) align 4 [[ARRAYIDX]], ptr addrspace(5) align 4 [[TMP]], i64 64, i1 false)
+// AMDGCN30-NEXT: ret void
+//
 kernel void ker(global Mat3X3 *in, global Mat4X4 *out) {
   out[0] = foo(in[1]);
 }
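The removed "two mem copies" comment still describes the shape of the regenerated X86/SPIR checks: one memcpy builds the private byval temporary from in[1], one copies the sret temporary back to out[0]. Roughly, in C-like pseudocode (foo_sret is a hypothetical name for the lowered callee; the 36/64 byte counts match the i32 36 and i32 64 memcpy operands above):

  Mat3X3 byval_temp;
  __builtin_memcpy(&byval_temp, &in[1], 36);   /* copy #1: global -> private argument copy */
  Mat4X4 tmp;
  foo_sret(&tmp, &byval_temp);                 /* callee writes the result through sret */
  __builtin_memcpy(&out[0], &tmp, 64);         /* copy #2: sret temporary -> out[0] */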
-// X86-LABEL: define{{.*}} void @foo_large(ptr dead_on_unwind noalias writable sret(%struct.Mat64X64) align 4 %agg.result, ptr noundef byval(%struct.Mat32X32) align 4 %in)
-// AMDGCN-LABEL: define{{.*}} void @foo_large(ptr addrspace(5) dead_on_unwind noalias writable sret(%struct.Mat64X64) align 4 %agg.result, ptr addrspace(5) noundef byref(%struct.Mat32X32) align 4 %{{.*}}
-// AMDGCN: %in = alloca %struct.Mat32X32, align 4, addrspace(5)
-// AMDGCN-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 4 %in, ptr addrspace(5) align 4 %{{.*}}, i64 4096, i1 false)
+//
+// X86-LABEL: define void @foo_large(
+// X86-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_MAT64X64:%.*]]) align 4 [[AGG_RESULT:%.*]], ptr noundef byval([[STRUCT_MAT32X32:%.*]]) align 4 [[IN:%.*]]) #[[ATTR0]] {
+// X86-NEXT: [[ENTRY:.*:]]
+// X86-NEXT: [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT: store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT: ret void
+//
+// AMDGCN-LABEL: define dso_local void @foo_large(
+// AMDGCN-SAME: ptr addrspace(5) dead_on_unwind noalias writable sret([[STRUCT_MAT64X64:%.*]]) align 4 [[AGG_RESULT:%.*]], ptr addrspace(5) noundef byref([[STRUCT_MAT32X32:%.*]]) align 4 [[TMP0:%.*]]) #[[ATTR0]] {
+// AMDGCN-NEXT: [[ENTRY:.*:]]
+// AMDGCN-NEXT: [[IN:%.*]] = alloca [[STRUCT_MAT32X32]], align 4, addrspace(5)
+// AMDGCN-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 4 [[IN]], ptr addrspace(5) align 4 [[TMP0]], i64 4096, i1 false)
+// AMDGCN-NEXT: ret void
+//
+// AMDGCN20-LABEL: define dso_local void @foo_large(
+// AMDGCN20-SAME: ptr addrspace(5) dead_on_unwind noalias writable sret([[STRUCT_MAT64X64:%.*]]) align 4 [[AGG_RESULT:%.*]], ptr addrspace(5) noundef byref([[STRUCT_MAT32X32:%.*]]) align 4 [[TMP0:%.*]]) #[[ATTR0]] {
+// AMDGCN20-NEXT: [[ENTRY:.*:]]
+// AMDGCN20-NEXT: [[IN:%.*]] = alloca [[STRUCT_MAT32X32]], align 4, addrspace(5)
+// AMDGCN20-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 4 [[IN]], ptr addrspace(5) align 4 [[TMP0]], i64 4096, i1 false)
+// AMDGCN20-NEXT: ret void
+//
+// SPIR-LABEL: define dso_local spir_func void @foo_large(
+// SPIR-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_MAT64X64:%.*]]) align 4 [[AGG_RESULT:%.*]], ptr noundef byval([[STRUCT_MAT32X32:%.*]]) align 4 [[IN:%.*]]) #[[ATTR0]] {
+// SPIR-NEXT: [[ENTRY:.*:]]
+// SPIR-NEXT: ret void
+//
+// AMDGCN30-GVAR-LABEL: define dso_local void @foo_large(
+// AMDGCN30-GVAR-SAME: ptr addrspace(5) dead_on_unwind noalias writable sret([[STRUCT_MAT64X64:%.*]]) align 4 [[AGG_RESULT:%.*]], ptr addrspace(5) noundef byref([[STRUCT_MAT32X32:%.*]]) align 4 [[TMP0:%.*]]) #[[ATTR0]] {
+// AMDGCN30-GVAR-NEXT: [[ENTRY:.*:]]
+// AMDGCN30-GVAR-NEXT: [[IN:%.*]] = alloca [[STRUCT_MAT32X32]], align 4, addrspace(5)
+// AMDGCN30-GVAR-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 4 [[IN]], ptr addrspace(5) align 4 [[TMP0]], i64 4096, i1 false)
+// AMDGCN30-GVAR-NEXT: ret void
+//
+// AMDGCN30-LABEL: define dso_local void @foo_large(
+// AMDGCN30-SAME: ptr addrspace(5) dead_on_unwind noalias writable sret([[STRUCT_MAT64X64:%.*]]) align 4 [[AGG_RESULT:%.*]], ptr addrspace(5) noundef byref([[STRUCT_MAT32X32:%.*]]) align 4 [[TMP0:%.*]]) #[[ATTR0]] {
+// AMDGCN30-NEXT: [[ENTRY:.*:]]
+// AMDGCN30-NEXT: [[IN:%.*]] = alloca [[STRUCT_MAT32X32]], align 4, addrspace(5)
+// AMDGCN30-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 4 [[IN]], ptr addrspace(5) align 4 [[TMP0]], i64 4096, i1 false)
+// AMDGCN30-NEXT: ret void
+//
 Mat64X64 __attribute__((noinline)) foo_large(Mat32X32 in) {
   Mat64X64 out;
   return out;
 }
-// ALL-LABEL: define {{.*}} void @ker_large
-// Expect two mem copies: one for the argument "in", and one for
-// the return value.
-// X86: call void @llvm.memcpy.p0.p1.i32(ptr
-// X86: call void @llvm.memcpy.p1.p0.i32(ptr addrspace(1)
-// AMDGCN: call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5)
-// AMDGCN: call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1)
+//
+// X86-LABEL: define spir_kernel void @ker_large(
+// X86-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META4]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META8:![0-9]+]] !kernel_arg_base_type [[META8]] !kernel_arg_type_qual [[META7]] {
+// X86-NEXT: [[ENTRY:.*:]]
+// X86-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 4
+// X86-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 4
+// X86-NEXT: [[TMP:%.*]] = alloca [[STRUCT_MAT64X64:%.*]], align 4
+// X86-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_MAT32X32:%.*]], align 4
+// X86-NEXT: store ptr addrspace(1) [[IN]], ptr [[IN_ADDR]], align 4
+// X86-NEXT: store ptr addrspace(1) [[OUT]], ptr [[OUT_ADDR]], align 4
+// X86-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR]], align 4
+// X86-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_MAT64X64]], ptr addrspace(1) [[TMP0]], i32 0
+// X86-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[IN_ADDR]], align 4
+// X86-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_MAT32X32]], ptr addrspace(1) [[TMP1]], i32 1
+// X86-NEXT: call void @llvm.memcpy.p0.p1.i32(ptr align 4 [[BYVAL_TEMP]], ptr addrspace(1) align 4 [[ARRAYIDX1]], i32 4096, i1 false)
+// X86-NEXT: call void @foo_large(ptr dead_on_unwind writable sret([[STRUCT_MAT64X64]]) align 4 [[TMP]], ptr noundef byval([[STRUCT_MAT32X32]]) align 4 [[BYVAL_TEMP]]) #[[ATTR3]]
+// X86-NEXT: call void @llvm.memcpy.p1.p0.i32(ptr addrspace(1) align 4 [[ARRAYIDX]], ptr align 4 [[TMP]], i32 16384, i1 false)
+// X86-NEXT: ret void
+//
+// AMDGCN-LABEL: define dso_local amdgpu_kernel void @ker_large(
+// AMDGCN-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META4]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META8:![0-9]+]] !kernel_arg_base_type [[META8]] !kernel_arg_type_qual [[META7]] {
+// AMDGCN-NEXT: [[ENTRY:.*:]]
+// AMDGCN-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// AMDGCN-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// AMDGCN-NEXT: [[TMP:%.*]] = alloca [[STRUCT_MAT64X64:%.*]], align 4, addrspace(5)
+// AMDGCN-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_MAT32X32:%.*]], align 4, addrspace(5)
+// AMDGCN-NEXT: store ptr addrspace(1) [[IN]], ptr addrspace(5) [[IN_ADDR]], align 8
+// AMDGCN-NEXT: store ptr addrspace(1) [[OUT]], ptr addrspace(5) [[OUT_ADDR]], align 8
+// AMDGCN-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
+// AMDGCN-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_MAT64X64]], ptr addrspace(1) [[TMP0]], i64 0
+// AMDGCN-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[IN_ADDR]], align 8
+// AMDGCN-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_MAT32X32]], ptr addrspace(1) [[TMP1]], i64 1
+// AMDGCN-NEXT: call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) align 4 [[BYVAL_TEMP]], ptr addrspace(1) align 4 [[ARRAYIDX1]], i64 4096, i1 false)
+// AMDGCN-NEXT: call void @foo_large(ptr addrspace(5) dead_on_unwind writable sret([[STRUCT_MAT64X64]]) align 4 [[TMP]], ptr addrspace(5) noundef byref([[STRUCT_MAT32X32]]) align 4 [[BYVAL_TEMP]]) #[[ATTR3]]
+// AMDGCN-NEXT: call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) align 4 [[ARRAYIDX]], ptr addrspace(5) align 4 [[TMP]], i64 16384, i1 false)
+// AMDGCN-NEXT: ret void
+//
+// AMDGCN20-LABEL: define dso_local amdgpu_kernel void @ker_large(
+// AMDGCN20-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META4]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META8:![0-9]+]] !kernel_arg_base_type [[META8]] !kernel_arg_type_qual [[META7]] {
+// AMDGCN20-NEXT: [[ENTRY:.*:]]
+// AMDGCN20-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// AMDGCN20-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// AMDGCN20-NEXT: [[TMP:%.*]] = alloca [[STRUCT_MAT64X64:%.*]], align 4, addrspace(5)
+// AMDGCN20-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_MAT32X32:%.*]], align 4, addrspace(5)
+// AMDGCN20-NEXT: store ptr addrspace(1) [[IN]], ptr addrspace(5) [[IN_ADDR]], align 8
+// AMDGCN20-NEXT: store ptr addrspace(1) [[OUT]], ptr addrspace(5) [[OUT_ADDR]], align 8
+// AMDGCN20-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
+// AMDGCN20-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_MAT64X64]], ptr addrspace(1) [[TMP0]], i64 0
+// AMDGCN20-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[IN_ADDR]], align 8
+// AMDGCN20-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_MAT32X32]], ptr addrspace(1) [[TMP1]], i64 1
+// AMDGCN20-NEXT: call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) align 4 [[BYVAL_TEMP]], ptr addrspace(1) align 4 [[ARRAYIDX1]], i64 4096, i1 false)
+// AMDGCN20-NEXT: call void @foo_large(ptr addrspace(5) dead_on_unwind writable sret([[STRUCT_MAT64X64]]) align 4 [[TMP]], ptr addrspace(5) noundef byref([[STRUCT_MAT32X32]]) align 4 [[BYVAL_TEMP]]) #[[ATTR3]]
+// AMDGCN20-NEXT: call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) align 4 [[ARRAYIDX]], ptr addrspace(5) align 4 [[TMP]], i64 16384, i1 false)
+// AMDGCN20-NEXT: ret void
+//
+// SPIR-LABEL: define dso_local spir_kernel void @ker_large(
+// SPIR-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META3]] !kernel_arg_access_qual [[META4]] !kernel_arg_type [[META7:![0-9]+]] !kernel_arg_base_type [[META7]] !kernel_arg_type_qual [[META6]] {
+// SPIR-NEXT: [[ENTRY:.*:]]
+// SPIR-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 4
+// SPIR-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 4
+// SPIR-NEXT: [[TMP:%.*]] = alloca [[STRUCT_MAT64X64:%.*]], align 4
+// SPIR-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_MAT32X32:%.*]], align 4
+// SPIR-NEXT: store ptr addrspace(1) [[IN]], ptr [[IN_ADDR]], align 4
+// SPIR-NEXT: store ptr addrspace(1) [[OUT]], ptr [[OUT_ADDR]], align 4
+// SPIR-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR]], align 4
+// SPIR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_MAT64X64]], ptr addrspace(1) [[TMP0]], i32 0
+// SPIR-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[IN_ADDR]], align 4
+// SPIR-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_MAT32X32]], ptr addrspace(1) [[TMP1]], i32 1
+// SPIR-NEXT: call void @llvm.memcpy.p0.p1.i32(ptr align 4 [[BYVAL_TEMP]], ptr addrspace(1) align 4 [[ARRAYIDX1]], i32 4096, i1 false)
+// SPIR-NEXT: call spir_func void @foo_large(ptr dead_on_unwind writable sret([[STRUCT_MAT64X64]]) align 4 [[TMP]], ptr noundef byval([[STRUCT_MAT32X32]]) align 4 [[BYVAL_TEMP]]) #[[ATTR3]]
+// SPIR-NEXT: call void @llvm.memcpy.p1.p0.i32(ptr addrspace(1) align 4 [[ARRAYIDX]], ptr align 4 [[TMP]], i32 16384, i1 false)
+// SPIR-NEXT: ret void
+//
+// AMDGCN30-GVAR-LABEL: define dso_local amdgpu_kernel void @ker_large(
+// AMDGCN30-GVAR-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META4]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META8:![0-9]+]] !kernel_arg_base_type [[META8]] !kernel_arg_type_qual [[META7]] {
+// AMDGCN30-GVAR-NEXT: [[ENTRY:.*:]]
+// AMDGCN30-GVAR-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// AMDGCN30-GVAR-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// AMDGCN30-GVAR-NEXT: [[TMP:%.*]] = alloca [[STRUCT_MAT64X64:%.*]], align 4, addrspace(5)
+// AMDGCN30-GVAR-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_MAT32X32:%.*]], align 4, addrspace(5)
+// AMDGCN30-GVAR-NEXT: store ptr addrspace(1) [[IN]], ptr addrspace(5) [[IN_ADDR]], align 8
+// AMDGCN30-GVAR-NEXT: store ptr addrspace(1) [[OUT]], ptr addrspace(5) [[OUT_ADDR]], align 8
+// AMDGCN30-GVAR-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
+// AMDGCN30-GVAR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_MAT64X64]], ptr addrspace(1) [[TMP0]], i64 0
+// AMDGCN30-GVAR-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[IN_ADDR]], align 8
+// AMDGCN30-GVAR-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_MAT32X32]], ptr addrspace(1) [[TMP1]], i64 1
+// AMDGCN30-GVAR-NEXT: call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) align 4 [[BYVAL_TEMP]], ptr addrspace(1) align 4 [[ARRAYIDX1]], i64 4096, i1 false)
+// AMDGCN30-GVAR-NEXT: call void @foo_large(ptr addrspace(5) dead_on_unwind writable sret([[STRUCT_MAT64X64]]) align 4 [[TMP]], ptr addrspace(5) noundef byref([[STRUCT_MAT32X32]]) align 4 [[BYVAL_TEMP]]) #[[ATTR3]]
+// AMDGCN30-GVAR-NEXT: call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) align 4 [[ARRAYIDX]], ptr addrspace(5) align 4 [[TMP]], i64 16384, i1 false)
+// AMDGCN30-GVAR-NEXT: ret void
+//
+// AMDGCN30-LABEL: define dso_local amdgpu_kernel void @ker_large(
+// AMDGCN30-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META4]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META8:![0-9]+]] !kernel_arg_base_type [[META8]] !kernel_arg_type_qual [[META7]] {
+// AMDGCN30-NEXT: [[ENTRY:.*:]]
+// AMDGCN30-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// AMDGCN30-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// AMDGCN30-NEXT: [[TMP:%.*]] = alloca [[STRUCT_MAT64X64:%.*]], align 4, addrspace(5)
+// AMDGCN30-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_MAT32X32:%.*]], align 4, addrspace(5)
+// AMDGCN30-NEXT: store ptr addrspace(1) [[IN]], ptr addrspace(5) [[IN_ADDR]], align 8
+// AMDGCN30-NEXT: store ptr addrspace(1) [[OUT]], ptr addrspace(5) [[OUT_ADDR]], align 8
+// AMDGCN30-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
+// AMDGCN30-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_MAT64X64]], ptr addrspace(1) [[TMP0]], i64 0
+// AMDGCN30-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[IN_ADDR]], align 8
+// AMDGCN30-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_MAT32X32]], ptr addrspace(1) [[TMP1]], i64 1
+// AMDGCN30-NEXT: call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) align 4 [[BYVAL_TEMP]], ptr addrspace(1) align 4 [[ARRAYIDX1]], i64 4096, i1 false)
+// AMDGCN30-NEXT: call void @foo_large(ptr addrspace(5) dead_on_unwind writable sret([[STRUCT_MAT64X64]]) align 4 [[TMP]], ptr addrspace(5) noundef byref([[STRUCT_MAT32X32]]) align 4 [[BYVAL_TEMP]]) #[[ATTR3]]
+// AMDGCN30-NEXT: call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) align 4 [[ARRAYIDX]], ptr addrspace(5) align 4 [[TMP]], i64 16384, i1 false)
+// AMDGCN30-NEXT: ret void
+//
 kernel void ker_large(global Mat32X32 *in, global Mat64X64 *out) {
   out[0] = foo_large(in[1]);
 }
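At these sizes the register-coercion path is off: AMDGCN passes Mat32X32 as ptr addrspace(5) byref instead of a register aggregate, and the copies become explicit memcpys. The byte counts in the checks follow directly from the layouts; an illustrative size check, assuming the 32x32/64x64 int layouts and 4-byte int used throughout this test (C11 _Static_assert, not part of the patch):

  _Static_assert(32 * 32 * sizeof(int) == 4096, "copy-in size of the byval/byref temporary");
  _Static_assert(64 * 64 * sizeof(int) == 16384, "copy-out size of the sret temporary");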
-// AMDGCN-LABEL: define{{.*}} void @FuncOneMember(<2 x i32> %u.coerce)
+//
+// X86-LABEL: define void @FuncOneMember(
+// X86-SAME: ptr noundef byval([[STRUCT_STRUCTONEMEMBER:%.*]]) align 4 [[TMP0:%.*]]) #[[ATTR0]] {
+// X86-NEXT: [[ENTRY:.*:]]
+// X86-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER]], align 8
+// X86-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8
+// X86-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[U]], ptr align 4 [[TMP0]], i32 8, i1 false)
+// X86-NEXT: store <2 x i32> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL]], align 8
+// X86-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[DOTCOMPOUNDLITERAL]], align 8
+// X86-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr [[U]], i32 0, i32 0
+// X86-NEXT: store <2 x i32> [[TMP1]], ptr [[X]], align 8
+// X86-NEXT: ret void
+//
+// AMDGCN-LABEL: define dso_local void @FuncOneMember(
+// AMDGCN-SAME: <2 x i32> [[U_COERCE:%.*]]) #[[ATTR0]] {
+// AMDGCN-NEXT: [[ENTRY:.*:]]
+// AMDGCN-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER:%.*]], align 8, addrspace(5)
+// AMDGCN-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8, addrspace(5)
+// AMDGCN-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN-NEXT: store <2 x i32> [[U_COERCE]], ptr addrspace(5) [[COERCE_DIVE]], align 8
+// AMDGCN-NEXT: store <2 x i32> zeroinitializer, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8
+// AMDGCN-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8
+// AMDGCN-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN-NEXT: store <2 x i32> [[TMP0]], ptr addrspace(5) [[X]], align 8
+// AMDGCN-NEXT: ret void
+//
+// AMDGCN20-LABEL: define dso_local void @FuncOneMember(
+// AMDGCN20-SAME: <2 x i32> [[U_COERCE:%.*]]) #[[ATTR0]] {
+// AMDGCN20-NEXT: [[ENTRY:.*:]]
+// AMDGCN20-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER:%.*]], align 8, addrspace(5)
+// AMDGCN20-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8, addrspace(5)
+// AMDGCN20-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN20-NEXT: store <2 x i32> [[U_COERCE]], ptr addrspace(5) [[COERCE_DIVE]], align 8
+// AMDGCN20-NEXT: store <2 x i32> zeroinitializer, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8
+// AMDGCN20-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8
+// AMDGCN20-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN20-NEXT: store <2 x i32> [[TMP0]], ptr addrspace(5) [[X]], align 8
+// AMDGCN20-NEXT: ret void
+//
+// SPIR-LABEL: define dso_local spir_func void @FuncOneMember(
+// SPIR-SAME: ptr noundef byval([[STRUCT_STRUCTONEMEMBER:%.*]]) align 8 [[U:%.*]]) #[[ATTR0]] {
+// SPIR-NEXT: [[ENTRY:.*:]]
+// SPIR-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8
+// SPIR-NEXT: store <2 x i32> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL]], align 8
+// SPIR-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[DOTCOMPOUNDLITERAL]], align 8
+// SPIR-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr [[U]], i32 0, i32 0
+// SPIR-NEXT: store <2 x i32> [[TMP0]], ptr [[X]], align 8
+// SPIR-NEXT: ret void
+//
+// AMDGCN30-GVAR-LABEL: define dso_local void @FuncOneMember(
+// AMDGCN30-GVAR-SAME: <2 x i32> [[U_COERCE:%.*]]) #[[ATTR0]] {
+// AMDGCN30-GVAR-NEXT: [[ENTRY:.*:]]
+// AMDGCN30-GVAR-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER:%.*]], align 8, addrspace(5)
+// AMDGCN30-GVAR-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8, addrspace(5)
+// AMDGCN30-GVAR-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN30-GVAR-NEXT: store <2 x i32> [[U_COERCE]], ptr addrspace(5) [[COERCE_DIVE]], align 8
+// AMDGCN30-GVAR-NEXT: store <2 x i32> zeroinitializer, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8
+// AMDGCN30-GVAR-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8
+// AMDGCN30-GVAR-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN30-GVAR-NEXT: store <2 x i32> [[TMP0]], ptr addrspace(5) [[X]], align 8
+// AMDGCN30-GVAR-NEXT: ret void
+//
+// AMDGCN30-LABEL: define dso_local void @FuncOneMember(
+// AMDGCN30-SAME: <2 x i32> [[U_COERCE:%.*]]) #[[ATTR0]] {
+// AMDGCN30-NEXT: [[ENTRY:.*:]]
+// AMDGCN30-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER:%.*]], align 8, addrspace(5)
+// AMDGCN30-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8, addrspace(5)
+// AMDGCN30-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN30-NEXT: store <2 x i32> [[U_COERCE]], ptr addrspace(5) [[COERCE_DIVE]], align 8
+// AMDGCN30-NEXT: store <2 x i32> zeroinitializer, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8
+// AMDGCN30-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8
+// AMDGCN30-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN30-NEXT: store <2 x i32> [[TMP0]], ptr addrspace(5) [[X]], align 8
+// AMDGCN30-NEXT: ret void
+//
 void FuncOneMember(struct StructOneMember u) {
   u.x = (int2)(0, 0);
 }
-// AMDGCN-LABEL: define{{.*}} void @FuncOneLargeMember(ptr addrspace(5) noundef byref(%struct.LargeStructOneMember) align 8 %{{.*}}
-// AMDGCN: %u = alloca %struct.LargeStructOneMember, align 8, addrspace(5)
-// AMDGCN: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 8 %u, ptr addrspace(5) align 8 %{{.*}}, i64 800, i1 false)
-// AMDGCN-NOT: addrspacecast
-// AMDGCN: store <2 x i32> %{{.*}}, ptr addrspace(5)
+//
+// X86-LABEL: define void @FuncOneLargeMember(
+// X86-SAME: ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER:%.*]]) align 4 [[TMP0:%.*]]) #[[ATTR0]] {
+// X86-NEXT: [[ENTRY:.*:]]
+// X86-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8
+// X86-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8
+// X86-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[U]], ptr align 4 [[TMP0]], i32 800, i1 false)
+// X86-NEXT: store <2 x i32> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL]], align 8
+// X86-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[DOTCOMPOUNDLITERAL]], align 8
+// X86-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTONEMEMBER]], ptr [[U]], i32 0, i32 0
+// X86-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [100 x <2 x i32>], ptr [[X]], i32 0, i32 0
+// X86-NEXT: store <2 x i32> [[TMP1]], ptr [[ARRAYIDX]], align 8
+// X86-NEXT: ret void
+//
+// AMDGCN-LABEL: define dso_local void @FuncOneLargeMember(
+// AMDGCN-SAME: ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR0]] {
+// AMDGCN-NEXT: [[ENTRY:.*:]]
+// AMDGCN-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5)
+// AMDGCN-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8, addrspace(5)
+// AMDGCN-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 8 [[U]], ptr addrspace(5) align 8 [[TMP0]], i64 800, i1 false)
+// AMDGCN-NEXT: store <2 x i32> zeroinitializer, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8
+// AMDGCN-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8
+// AMDGCN-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [100 x <2 x i32>], ptr addrspace(5) [[X]], i64 0, i64 0
+// AMDGCN-NEXT: store <2 x i32> [[TMP1]], ptr addrspace(5) [[ARRAYIDX]], align 8
+// AMDGCN-NEXT: ret void
+//
+// AMDGCN20-LABEL: define dso_local void @FuncOneLargeMember(
+// AMDGCN20-SAME: ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR0]] {
+// AMDGCN20-NEXT: [[ENTRY:.*:]]
+// AMDGCN20-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5)
+// AMDGCN20-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8, addrspace(5)
+// AMDGCN20-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 8 [[U]], ptr addrspace(5) align 8 [[TMP0]], i64 800, i1 false)
+// AMDGCN20-NEXT: store <2 x i32> zeroinitializer, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8
+// AMDGCN20-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8
+// AMDGCN20-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN20-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [100 x <2 x i32>], ptr addrspace(5) [[X]], i64 0, i64 0
+// AMDGCN20-NEXT: store <2 x i32> [[TMP1]], ptr addrspace(5) [[ARRAYIDX]], align 8
+// AMDGCN20-NEXT: ret void
+//
+// SPIR-LABEL: define dso_local spir_func void @FuncOneLargeMember(
+// SPIR-SAME: ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER:%.*]]) align 8 [[U:%.*]]) #[[ATTR0]] {
+// SPIR-NEXT: [[ENTRY:.*:]]
+// SPIR-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8
+// SPIR-NEXT: store <2 x i32> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL]], align 8
+// SPIR-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[DOTCOMPOUNDLITERAL]], align 8
+// SPIR-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTONEMEMBER]], ptr [[U]], i32 0, i32 0
+// SPIR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [100 x <2 x i32>], ptr [[X]], i32 0, i32 0
+// SPIR-NEXT: store <2 x i32> [[TMP0]], ptr [[ARRAYIDX]], align 8
+// SPIR-NEXT: ret void
+//
+// AMDGCN30-GVAR-LABEL: define dso_local void @FuncOneLargeMember(
+// AMDGCN30-GVAR-SAME: ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR0]] {
+// AMDGCN30-GVAR-NEXT: [[ENTRY:.*:]]
+// AMDGCN30-GVAR-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5)
+// AMDGCN30-GVAR-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8, addrspace(5)
+// AMDGCN30-GVAR-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 8 [[U]], ptr addrspace(5) align 8 [[TMP0]], i64 800, i1 false)
+// AMDGCN30-GVAR-NEXT: store <2 x i32> zeroinitializer, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8
+// AMDGCN30-GVAR-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8
+// AMDGCN30-GVAR-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN30-GVAR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [100 x <2 x i32>], ptr addrspace(5) [[X]], i64 0, i64 0
+// AMDGCN30-GVAR-NEXT: store <2 x i32> [[TMP1]], ptr addrspace(5) [[ARRAYIDX]], align 8
+// AMDGCN30-GVAR-NEXT: ret void
+//
+// AMDGCN30-LABEL: define dso_local void @FuncOneLargeMember(
+// AMDGCN30-SAME: ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR0]] {
+// AMDGCN30-NEXT: [[ENTRY:.*:]]
+// AMDGCN30-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5)
+// AMDGCN30-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8, addrspace(5)
+// AMDGCN30-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 8 [[U]], ptr addrspace(5) align 8 [[TMP0]], i64 800, i1 false)
+// AMDGCN30-NEXT: store <2 x i32> zeroinitializer, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8
+// AMDGCN30-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8
+// AMDGCN30-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN30-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [100 x <2 x i32>], ptr addrspace(5) [[X]], i64 0, i64 0
+// AMDGCN30-NEXT: store <2 x i32> [[TMP1]], ptr addrspace(5) [[ARRAYIDX]], align 8
+// AMDGCN30-NEXT: ret void
+//
 void FuncOneLargeMember(struct LargeStructOneMember u) {
   u.x[0] = (int2)(0, 0);
 }
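Unlike byval, byref leaves the copy to the callee: the AMDGCN checks show FuncOneLargeMember allocating its own private LargeStructOneMember and copying 800 bytes (100 elements of 8-byte int2) in before writing u.x[0], so the by-value semantics hold and mutations never reach the caller's buffer. In C-like pseudocode of the lowering (incoming is a stand-in for the byref pointer operand):

  struct LargeStructOneMember u_local;          /* private alloca in addrspace(5) */
  __builtin_memcpy(&u_local, incoming, 800);    /* 100 * sizeof(int2) = 800 bytes */
  u_local.x[0] = (int2)(0, 0);                  /* stays local to the callee */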
-// AMDGCN20-LABEL: define{{.*}} void @test_indirect_arg_globl()
-// AMDGCN20: %[[byval_temp:.*]] = alloca %struct.LargeStructOneMember, align 8, addrspace(5)
-// AMDGCN20: call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) align 8 %[[byval_temp]], ptr addrspace(1) align 8 @g_s, i64 800, i1 false)
-// AMDGCN20: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref(%struct.LargeStructOneMember) align 8 %[[byval_temp]])
 #if (__OPENCL_C_VERSION__ == 200) || (__OPENCL_C_VERSION__ >= 300 && defined(__opencl_c_program_scope_global_variables))
+// AMDGCN20-LABEL: define dso_local void @test_indirect_arg_globl(
+// AMDGCN20-SAME: ) #[[ATTR0]] {
+// AMDGCN20-NEXT: [[ENTRY:.*:]]
+// AMDGCN20-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER:%.*]], align 8, addrspace(5)
+// AMDGCN20-NEXT: call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) align 8 [[BYVAL_TEMP]], ptr addrspace(1) align 8 @g_s, i64 800, i1 false)
+// AMDGCN20-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[BYVAL_TEMP]]) #[[ATTR3]]
+// AMDGCN20-NEXT: ret void
+//
+// AMDGCN30-GVAR-LABEL: define dso_local void @test_indirect_arg_globl(
+// AMDGCN30-GVAR-SAME: ) #[[ATTR0]] {
+// AMDGCN30-GVAR-NEXT: [[ENTRY:.*:]]
+// AMDGCN30-GVAR-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER:%.*]], align 8, addrspace(5)
+// AMDGCN30-GVAR-NEXT: call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) align 8 [[BYVAL_TEMP]], ptr addrspace(1) align 8 @g_s, i64 800, i1 false)
+// AMDGCN30-GVAR-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[BYVAL_TEMP]]) #[[ATTR3]]
+// AMDGCN30-GVAR-NEXT: ret void
+//
 void test_indirect_arg_globl(void) {
   FuncOneLargeMember(g_s);
 }
 #endif
-// AMDGCN-LABEL: define{{.*}} amdgpu_kernel void @test_indirect_arg_local()
-// AMDGCN: %[[byval_temp:.*]] = alloca %struct.LargeStructOneMember, align 8, addrspace(5)
-// AMDGCN: call void @llvm.memcpy.p5.p3.i64(ptr addrspace(5) align 8 %[[byval_temp]], ptr addrspace(3) align 8 @test_indirect_arg_local.l_s, i64 800, i1 false)
-// AMDGCN: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref(%struct.LargeStructOneMember) align 8 %[[byval_temp]])
+//
+// X86-LABEL: define spir_kernel void @test_indirect_arg_local(
+// X86-SAME: ) #[[ATTR1]] !kernel_arg_addr_space [[META9:![0-9]+]] !kernel_arg_access_qual [[META9]] !kernel_arg_type [[META9]] !kernel_arg_base_type [[META9]] !kernel_arg_type_qual [[META9]] {
+// X86-NEXT: [[ENTRY:.*:]]
+// X86-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER:%.*]], align 4
+// X86-NEXT: call void @llvm.memcpy.p0.p3.i32(ptr align 4 [[BYVAL_TEMP]], ptr addrspace(3) align 8 @test_indirect_arg_local.l_s, i32 800, i1 false)
+// X86-NEXT: call void @FuncOneLargeMember(ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER]]) align 4 [[BYVAL_TEMP]]) #[[ATTR3]]
+// X86-NEXT: ret void
+//
+// AMDGCN-LABEL: define dso_local amdgpu_kernel void @test_indirect_arg_local(
+// AMDGCN-SAME: ) #[[ATTR1]] !kernel_arg_addr_space [[META9:![0-9]+]] !kernel_arg_access_qual [[META9]] !kernel_arg_type [[META9]] !kernel_arg_base_type [[META9]] !kernel_arg_type_qual [[META9]] {
+// AMDGCN-NEXT: [[ENTRY:.*:]]
+// AMDGCN-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER:%.*]], align 8, addrspace(5)
+// AMDGCN-NEXT: call void @llvm.memcpy.p5.p3.i64(ptr addrspace(5) align 8 [[BYVAL_TEMP]], ptr addrspace(3) align 8 @test_indirect_arg_local.l_s, i64 800, i1 false)
+// AMDGCN-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[BYVAL_TEMP]]) #[[ATTR3]]
+// AMDGCN-NEXT: ret void
+//
+// AMDGCN20-LABEL: define dso_local amdgpu_kernel void @test_indirect_arg_local(
+// AMDGCN20-SAME: ) #[[ATTR1]] !kernel_arg_addr_space [[META9:![0-9]+]] !kernel_arg_access_qual [[META9]] !kernel_arg_type [[META9]] !kernel_arg_base_type [[META9]] !kernel_arg_type_qual [[META9]] {
+// AMDGCN20-NEXT: [[ENTRY:.*:]]
+// AMDGCN20-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER:%.*]], align 8, addrspace(5)
+// AMDGCN20-NEXT: call void @llvm.memcpy.p5.p3.i64(ptr addrspace(5) align 8 [[BYVAL_TEMP]], ptr addrspace(3) align 8 @test_indirect_arg_local.l_s, i64 800, i1 false)
+// AMDGCN20-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[BYVAL_TEMP]]) #[[ATTR3]]
+// AMDGCN20-NEXT: ret void
+//
+// SPIR-LABEL: define dso_local spir_kernel void @test_indirect_arg_local(
+// SPIR-SAME: ) #[[ATTR1]] !kernel_arg_addr_space [[META8:![0-9]+]] !kernel_arg_access_qual [[META8]] !kernel_arg_type [[META8]] !kernel_arg_base_type [[META8]] !kernel_arg_type_qual [[META8]] {
+// SPIR-NEXT: [[ENTRY:.*:]]
+// SPIR-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER:%.*]], align 8
+// SPIR-NEXT: call void @llvm.memcpy.p0.p3.i32(ptr align 8 [[BYVAL_TEMP]], ptr addrspace(3) align 8 @test_indirect_arg_local.l_s, i32 800, i1 false)
+// SPIR-NEXT: call spir_func void @FuncOneLargeMember(ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[BYVAL_TEMP]]) #[[ATTR3]]
+// SPIR-NEXT: ret void
+//
+// AMDGCN30-GVAR-LABEL: define dso_local amdgpu_kernel void @test_indirect_arg_local(
+// AMDGCN30-GVAR-SAME: ) #[[ATTR1]] !kernel_arg_addr_space [[META9:![0-9]+]] !kernel_arg_access_qual [[META9]] !kernel_arg_type [[META9]] !kernel_arg_base_type [[META9]] !kernel_arg_type_qual [[META9]] {
+// AMDGCN30-GVAR-NEXT: [[ENTRY:.*:]]
+// AMDGCN30-GVAR-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER:%.*]], align 8, addrspace(5)
+// AMDGCN30-GVAR-NEXT: call void @llvm.memcpy.p5.p3.i64(ptr addrspace(5) align 8 [[BYVAL_TEMP]], ptr addrspace(3) align 8 @test_indirect_arg_local.l_s, i64 800, i1 false)
+// AMDGCN30-GVAR-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[BYVAL_TEMP]]) #[[ATTR3]]
+// AMDGCN30-GVAR-NEXT: ret void
+//
+// AMDGCN30-LABEL: define dso_local amdgpu_kernel void @test_indirect_arg_local(
+// AMDGCN30-SAME: ) #[[ATTR1]] !kernel_arg_addr_space [[META9:![0-9]+]] !kernel_arg_access_qual [[META9]] !kernel_arg_type [[META9]] !kernel_arg_base_type [[META9]] !kernel_arg_type_qual [[META9]] {
+// AMDGCN30-NEXT: [[ENTRY:.*:]]
+// AMDGCN30-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER:%.*]], align 8, addrspace(5)
+// AMDGCN30-NEXT: call void @llvm.memcpy.p5.p3.i64(ptr addrspace(5) align 8 [[BYVAL_TEMP]], ptr addrspace(3) align 8 @test_indirect_arg_local.l_s, i64 800, i1 false)
+// AMDGCN30-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[BYVAL_TEMP]]) #[[ATTR3]]
+// AMDGCN30-NEXT: ret void
+//
 kernel void test_indirect_arg_local(void) {
   local struct LargeStructOneMember l_s;
   FuncOneLargeMember(l_s);
 }
-// AMDGCN-LABEL: define{{.*}} void @test_indirect_arg_private()
-// AMDGCN: %[[p_s:.*]] = alloca %struct.LargeStructOneMember, align 8, addrspace(5)
-// AMDGCN-NOT: @llvm.memcpy
-// AMDGCN-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref(%struct.LargeStructOneMember) align 8 %[[p_s]])
+//
+// X86-LABEL: define void @test_indirect_arg_private(
+// X86-SAME: ) #[[ATTR0]] {
+// X86-NEXT: [[ENTRY:.*:]]
+// X86-NEXT: [[P_S:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER:%.*]], align 8
+// X86-NEXT: call void @FuncOneLargeMember(ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER]]) align 4 [[P_S]]) #[[ATTR3]]
+// X86-NEXT: ret void
+//
+// AMDGCN-LABEL: define dso_local void @test_indirect_arg_private(
+// AMDGCN-SAME: ) #[[ATTR0]] {
+// AMDGCN-NEXT: [[ENTRY:.*:]]
+// AMDGCN-NEXT: [[P_S:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER:%.*]], align 8, addrspace(5)
+// AMDGCN-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[P_S]]) #[[ATTR3]]
+// AMDGCN-NEXT: ret void
+//
+// AMDGCN20-LABEL: define dso_local void @test_indirect_arg_private(
+// AMDGCN20-SAME: ) #[[ATTR0]] {
+// AMDGCN20-NEXT: [[ENTRY:.*:]]
+// AMDGCN20-NEXT: [[P_S:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER:%.*]], align 8, addrspace(5)
+// AMDGCN20-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[P_S]]) #[[ATTR3]]
+// AMDGCN20-NEXT: ret void
+//
+// SPIR-LABEL: define dso_local spir_func void @test_indirect_arg_private(
+// SPIR-SAME: ) #[[ATTR0]] {
+// SPIR-NEXT: [[ENTRY:.*:]]
+// SPIR-NEXT: [[P_S:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER:%.*]], align 8
+// SPIR-NEXT: call spir_func void @FuncOneLargeMember(ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[P_S]]) #[[ATTR3]]
+// SPIR-NEXT: ret void
+//
+// AMDGCN30-GVAR-LABEL: define dso_local void @test_indirect_arg_private(
+// AMDGCN30-GVAR-SAME: ) #[[ATTR0]] {
+// AMDGCN30-GVAR-NEXT: [[ENTRY:.*:]]
+// AMDGCN30-GVAR-NEXT: [[P_S:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER:%.*]], align 8, addrspace(5)
+// AMDGCN30-GVAR-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[P_S]]) #[[ATTR3]]
+// AMDGCN30-GVAR-NEXT: ret void
+//
+// AMDGCN30-LABEL: define dso_local void @test_indirect_arg_private(
+// AMDGCN30-SAME: ) #[[ATTR0]] {
+// AMDGCN30-NEXT: [[ENTRY:.*:]]
+// AMDGCN30-NEXT: [[P_S:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER:%.*]], align 8, addrspace(5)
+// AMDGCN30-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[P_S]]) #[[ATTR3]]
+// AMDGCN30-NEXT: ret void
+//
 void test_indirect_arg_private(void) {
   struct LargeStructOneMember p_s;
   FuncOneLargeMember(p_s);
}
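The three indirect-argument tests differ only in where the source object lives. A standalone reproducer of the same pattern might look like this (assuming the 100-element int2 layout used by LargeStructOneMember; the global case needs program-scope variables, hence the #if guard in the test itself):

  typedef struct { int2 x[100]; } S;
  void takes_s(S s);          /* lowered to a byref pointer on AMDGCN, as above */
  global S g;                 /* requires __opencl_c_program_scope_global_variables */
  kernel void drive(void) {
    local S l;
    S p;
    takes_s(g);               /* memcpy.p5.p1: global -> private temporary, then call */
    takes_s(l);               /* memcpy.p5.p3: local  -> private temporary, then call */
    takes_s(p);               /* already addrspace(5): passed directly, no memcpy */
  }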
-// AMDGCN-LABEL: define{{.*}} amdgpu_kernel void @KernelOneMember
-// AMDGCN-SAME: (<2 x i32> %[[u_coerce:.*]])
-// AMDGCN: %[[u:.*]] = alloca %struct.StructOneMember, align 8, addrspace(5)
-// AMDGCN: %[[coerce_dive:.*]] = getelementptr inbounds nuw %struct.StructOneMember, ptr addrspace(5) %[[u]], i32 0, i32 0
-// AMDGCN: store <2 x i32> %[[u_coerce]], ptr addrspace(5) %[[coerce_dive]]
-// AMDGCN: call void @FuncOneMember(<2 x i32>
+//
+// X86-LABEL: define spir_kernel void @KernelOneMember(
+// X86-SAME: ptr noundef byval([[STRUCT_STRUCTONEMEMBER:%.*]]) align 8 [[U:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10:![0-9]+]] !kernel_arg_access_qual [[META11:![0-9]+]] !kernel_arg_type [[META12:![0-9]+]] !kernel_arg_base_type [[META12]] !kernel_arg_type_qual [[META13:![0-9]+]] {
+// X86-NEXT: [[ENTRY:.*:]]
+// X86-NEXT: call void @FuncOneMember(ptr noundef byval([[STRUCT_STRUCTONEMEMBER]]) align 4 [[U]]) #[[ATTR3]]
+// X86-NEXT: ret void
+//
+// AMDGCN-LABEL: define dso_local amdgpu_kernel void @KernelOneMember(
+// AMDGCN-SAME: <2 x i32> [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10:![0-9]+]] !kernel_arg_access_qual [[META11:![0-9]+]] !kernel_arg_type [[META12:![0-9]+]] !kernel_arg_base_type [[META12]] !kernel_arg_type_qual [[META13:![0-9]+]] {
+// AMDGCN-NEXT: [[ENTRY:.*:]]
+// AMDGCN-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER:%.*]], align 8, addrspace(5)
+// AMDGCN-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN-NEXT: store <2 x i32> [[U_COERCE]], ptr addrspace(5) [[COERCE_DIVE]], align 8
+// AMDGCN-NEXT: [[COERCE_DIVE1:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr addrspace(5) [[COERCE_DIVE1]], align 8
+// AMDGCN-NEXT: call void @FuncOneMember(<2 x i32> [[TMP0]]) #[[ATTR3]]
+// AMDGCN-NEXT: ret void
+//
+// AMDGCN20-LABEL: define dso_local amdgpu_kernel void @KernelOneMember(
+// AMDGCN20-SAME: <2 x i32> [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10:![0-9]+]] !kernel_arg_access_qual [[META11:![0-9]+]] !kernel_arg_type [[META12:![0-9]+]] !kernel_arg_base_type [[META12]] !kernel_arg_type_qual [[META13:![0-9]+]] {
+// AMDGCN20-NEXT: [[ENTRY:.*:]]
+// AMDGCN20-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER:%.*]], align 8, addrspace(5)
+// AMDGCN20-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN20-NEXT: store <2 x i32> [[U_COERCE]], ptr addrspace(5) [[COERCE_DIVE]], align 8
+// AMDGCN20-NEXT: [[COERCE_DIVE1:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN20-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr addrspace(5) [[COERCE_DIVE1]], align 8
+// AMDGCN20-NEXT: call void @FuncOneMember(<2 x i32> [[TMP0]]) #[[ATTR3]]
+// AMDGCN20-NEXT: ret void
+//
+// SPIR-LABEL: define dso_local spir_kernel void @KernelOneMember(
+// SPIR-SAME: ptr noundef byval([[STRUCT_STRUCTONEMEMBER:%.*]]) align 8 [[U:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META9:![0-9]+]] !kernel_arg_access_qual [[META10:![0-9]+]] !kernel_arg_type [[META11:![0-9]+]] !kernel_arg_base_type [[META11]] !kernel_arg_type_qual [[META12:![0-9]+]] {
+// SPIR-NEXT: [[ENTRY:.*:]]
+// SPIR-NEXT: call spir_func void @FuncOneMember(ptr noundef byval([[STRUCT_STRUCTONEMEMBER]]) align 8 [[U]]) #[[ATTR3]]
+// SPIR-NEXT: ret void
+//
+// AMDGCN30-GVAR-LABEL: define dso_local amdgpu_kernel void @KernelOneMember(
+// AMDGCN30-GVAR-SAME: <2 x i32> [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10:![0-9]+]] !kernel_arg_access_qual [[META11:![0-9]+]] !kernel_arg_type [[META12:![0-9]+]] !kernel_arg_base_type [[META12]] !kernel_arg_type_qual [[META13:![0-9]+]] {
+// AMDGCN30-GVAR-NEXT: [[ENTRY:.*:]]
+// AMDGCN30-GVAR-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER:%.*]], align 8, addrspace(5)
+// AMDGCN30-GVAR-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN30-GVAR-NEXT: store <2 x i32> [[U_COERCE]], ptr addrspace(5) [[COERCE_DIVE]], align 8
+// AMDGCN30-GVAR-NEXT: [[COERCE_DIVE1:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN30-GVAR-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr addrspace(5) [[COERCE_DIVE1]], align 8
+// AMDGCN30-GVAR-NEXT: call void @FuncOneMember(<2 x i32> [[TMP0]]) #[[ATTR3]]
+// AMDGCN30-GVAR-NEXT: ret void
+//
+// AMDGCN30-LABEL: define dso_local amdgpu_kernel void @KernelOneMember(
+// AMDGCN30-SAME: <2 x i32> [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10:![0-9]+]] !kernel_arg_access_qual [[META11:![0-9]+]] !kernel_arg_type [[META12:![0-9]+]] !kernel_arg_base_type [[META12]] !kernel_arg_type_qual [[META13:![0-9]+]] {
+// AMDGCN30-NEXT: [[ENTRY:.*:]]
+// AMDGCN30-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER:%.*]], align 8, addrspace(5)
+// AMDGCN30-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN30-NEXT: store <2 x i32> [[U_COERCE]], ptr addrspace(5) [[COERCE_DIVE]], align 8
+// AMDGCN30-NEXT: [[COERCE_DIVE1:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN30-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr addrspace(5) [[COERCE_DIVE1]], align 8
+// AMDGCN30-NEXT: call void @FuncOneMember(<2 x i32> [[TMP0]]) #[[ATTR3]]
+// AMDGCN30-NEXT: ret void
+//
 kernel void KernelOneMember(struct StructOneMember u) {
   FuncOneMember(u);
 }
-// SPIR: call void @llvm.memcpy.p0.p1.i32
-// SPIR-NOT: addrspacecast
+//
+// X86-LABEL: define spir_kernel void @KernelOneMemberSpir(
+// X86-SAME: ptr addrspace(1) noundef align 8 [[U:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META14:![0-9]+]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META15:![0-9]+]] !kernel_arg_base_type [[META15]] !kernel_arg_type_qual [[META13]] {
+// X86-NEXT: [[ENTRY:.*:]]
+// X86-NEXT: [[U_ADDR:%.*]] = alloca ptr addrspace(1), align 4
+// X86-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER:%.*]], align 4
+// X86-NEXT: store ptr addrspace(1) [[U]], ptr [[U_ADDR]], align 4
+// X86-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[U_ADDR]], align 4
+// X86-NEXT: call void @llvm.memcpy.p0.p1.i32(ptr align 4 [[BYVAL_TEMP]], ptr addrspace(1) align 8 [[TMP0]], i32 8, i1 false)
+// X86-NEXT: call void @FuncOneMember(ptr noundef byval([[STRUCT_STRUCTONEMEMBER]]) align 4 [[BYVAL_TEMP]]) #[[ATTR3]]
+// X86-NEXT: ret void
+//
+// AMDGCN-LABEL: define dso_local amdgpu_kernel void @KernelOneMemberSpir(
+// AMDGCN-SAME: ptr addrspace(1) noundef align 8 [[U:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META14:![0-9]+]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META15:![0-9]+]] !kernel_arg_base_type [[META15]] !kernel_arg_type_qual [[META13]] {
+// AMDGCN-NEXT: [[ENTRY:.*:]]
+// AMDGCN-NEXT: [[U_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// AMDGCN-NEXT: store ptr addrspace(1) [[U]], ptr addrspace(5) [[U_ADDR]], align 8
+// AMDGCN-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[U_ADDR]], align 8
+// AMDGCN-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER:%.*]], ptr addrspace(1) [[TMP0]], i32 0, i32 0
+// AMDGCN-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr addrspace(1) [[COERCE_DIVE]], align 8
+// AMDGCN-NEXT: call void @FuncOneMember(<2 x i32> [[TMP1]]) #[[ATTR3]]
+// AMDGCN-NEXT: ret void
+//
+// AMDGCN20-LABEL: define dso_local amdgpu_kernel void @KernelOneMemberSpir(
+// AMDGCN20-SAME: ptr addrspace(1) noundef align 8 [[U:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META14:![0-9]+]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META15:![0-9]+]] !kernel_arg_base_type [[META15]] !kernel_arg_type_qual [[META13]] {
+// AMDGCN20-NEXT: [[ENTRY:.*:]]
+// AMDGCN20-NEXT: [[U_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// AMDGCN20-NEXT: store ptr addrspace(1) [[U]], ptr addrspace(5) [[U_ADDR]], align 8
+// AMDGCN20-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[U_ADDR]], align 8
+// AMDGCN20-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER:%.*]], ptr addrspace(1) [[TMP0]], i32 0, i32 0
+// AMDGCN20-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr addrspace(1) [[COERCE_DIVE]], align 8
+// AMDGCN20-NEXT: call void @FuncOneMember(<2 x i32> [[TMP1]]) #[[ATTR3]]
+// AMDGCN20-NEXT: ret void
+//
+// SPIR-LABEL: define dso_local spir_kernel void @KernelOneMemberSpir(
+// SPIR-SAME: ptr addrspace(1) noundef align 8 [[U:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META13:![0-9]+]] !kernel_arg_access_qual [[META10]] !kernel_arg_type [[META14:![0-9]+]] !kernel_arg_base_type [[META14]] !kernel_arg_type_qual [[META12]] {
+// SPIR-NEXT: [[ENTRY:.*:]]
+// SPIR-NEXT: [[U_ADDR:%.*]] = alloca ptr addrspace(1), align 4
+// SPIR-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER:%.*]], align 8
+// SPIR-NEXT: store ptr addrspace(1) [[U]], ptr [[U_ADDR]], align 4
+// SPIR-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[U_ADDR]], align 4
+// SPIR-NEXT: call void @llvm.memcpy.p0.p1.i32(ptr align 8 [[BYVAL_TEMP]], ptr addrspace(1) align 8 [[TMP0]], i32 8, i1 false)
+// SPIR-NEXT: call spir_func void @FuncOneMember(ptr noundef byval([[STRUCT_STRUCTONEMEMBER]]) align 8 [[BYVAL_TEMP]]) #[[ATTR3]]
+// SPIR-NEXT: ret void
+//
+// AMDGCN30-GVAR-LABEL: define dso_local amdgpu_kernel void @KernelOneMemberSpir(
+// AMDGCN30-GVAR-SAME: ptr addrspace(1) noundef align 8 [[U:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META14:![0-9]+]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META15:![0-9]+]] !kernel_arg_base_type [[META15]] !kernel_arg_type_qual [[META13]] {
+// AMDGCN30-GVAR-NEXT: [[ENTRY:.*:]]
+// AMDGCN30-GVAR-NEXT: [[U_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// AMDGCN30-GVAR-NEXT: store ptr addrspace(1) [[U]], ptr addrspace(5) [[U_ADDR]], align 8
+// AMDGCN30-GVAR-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[U_ADDR]], align 8
+// AMDGCN30-GVAR-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER:%.*]], ptr addrspace(1) [[TMP0]], i32 0, i32 0
+// AMDGCN30-GVAR-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr addrspace(1) [[COERCE_DIVE]], align 8
+// AMDGCN30-GVAR-NEXT: call void @FuncOneMember(<2 x i32> [[TMP1]]) #[[ATTR3]]
+// AMDGCN30-GVAR-NEXT: ret void
+//
+// AMDGCN30-LABEL: define dso_local amdgpu_kernel void @KernelOneMemberSpir(
+// AMDGCN30-SAME: ptr addrspace(1) noundef align 8 [[U:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META14:![0-9]+]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META15:![0-9]+]] !kernel_arg_base_type [[META15]] !kernel_arg_type_qual [[META13]] {
+// AMDGCN30-NEXT: [[ENTRY:.*:]]
+// AMDGCN30-NEXT: [[U_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// AMDGCN30-NEXT: store ptr addrspace(1) [[U]], ptr addrspace(5) [[U_ADDR]], align 8
+// AMDGCN30-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[U_ADDR]], align 8
+// AMDGCN30-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER:%.*]], ptr addrspace(1) [[TMP0]], i32 0, i32 0
+// AMDGCN30-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr addrspace(1) [[COERCE_DIVE]], align 8
+// AMDGCN30-NEXT: call void @FuncOneMember(<2 x i32> [[TMP1]]) #[[ATTR3]]
+// AMDGCN30-NEXT: ret void
+//
 kernel void KernelOneMemberSpir(global struct StructOneMember* u) {
   FuncOneMember(*u);
 }
-// AMDGCN-LABEL: define{{.*}} amdgpu_kernel void @KernelLargeOneMember(
-// AMDGCN: %[[U:.*]] = alloca %struct.LargeStructOneMember, align 8, addrspace(5)
-// AMDGCN: %[[U_ELEM:.*]] = getelementptr inbounds nuw %struct.LargeStructOneMember, ptr addrspace(5) %[[U]], i32 0, i32 0
-// AMDGCN: %[[EXTRACT:.*]] = extractvalue %struct.LargeStructOneMember %u.coerce, 0
-// AMDGCN: store [100 x <2 x i32>] %[[EXTRACT]], ptr addrspace(5) %[[U_ELEM]], align 8
-// AMDGCN: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref(%struct.LargeStructOneMember) align 8 %[[U]])
+//
+// X86-LABEL: define spir_kernel void @KernelLargeOneMember(
+// X86-SAME: ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER:%.*]]) align 8 [[U:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META16:![0-9]+]] !kernel_arg_base_type [[META16]] !kernel_arg_type_qual [[META13]] {
+// X86-NEXT: [[ENTRY:.*:]]
+// X86-NEXT: call void @FuncOneLargeMember(ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER]]) align 4 [[U]]) #[[ATTR3]]
+// X86-NEXT: ret void
+//
+// AMDGCN-LABEL: define dso_local amdgpu_kernel void @KernelLargeOneMember(
+// AMDGCN-SAME: [[STRUCT_LARGESTRUCTONEMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META16:![0-9]+]] !kernel_arg_base_type [[META16]] !kernel_arg_type_qual [[META13]] {
+// AMDGCN-NEXT: [[ENTRY:.*:]]
+// AMDGCN-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5)
+// AMDGCN-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN-NEXT: [[TMP1:%.*]] = extractvalue [[STRUCT_LARGESTRUCTONEMEMBER]] [[U_COERCE]], 0
+// AMDGCN-NEXT: store [100 x <2 x i32>] [[TMP1]], ptr addrspace(5) [[TMP0]], align 8
+// AMDGCN-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[U]]) #[[ATTR3]]
+// AMDGCN-NEXT: ret void
+//
+// AMDGCN20-LABEL: define dso_local amdgpu_kernel void @KernelLargeOneMember(
+// AMDGCN20-SAME: [[STRUCT_LARGESTRUCTONEMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META16:![0-9]+]] !kernel_arg_base_type [[META16]] !kernel_arg_type_qual [[META13]] {
+// AMDGCN20-NEXT: [[ENTRY:.*:]]
+// AMDGCN20-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5)
+// AMDGCN20-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN20-NEXT: [[TMP1:%.*]] = extractvalue [[STRUCT_LARGESTRUCTONEMEMBER]] [[U_COERCE]], 0
+// AMDGCN20-NEXT: store [100 x <2 x i32>] [[TMP1]], ptr addrspace(5) [[TMP0]], align 8
+// AMDGCN20-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[U]]) #[[ATTR3]]
+// AMDGCN20-NEXT: ret void
+//
+// SPIR-LABEL: define dso_local spir_kernel void @KernelLargeOneMember(
+// SPIR-SAME: ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER:%.*]]) align 8 [[U:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META9]] !kernel_arg_access_qual [[META10]] !kernel_arg_type [[META15:![0-9]+]] !kernel_arg_base_type [[META15]] !kernel_arg_type_qual [[META12]] {
+// SPIR-NEXT: [[ENTRY:.*:]]
+// SPIR-NEXT: call spir_func void @FuncOneLargeMember(ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[U]]) #[[ATTR3]]
+// SPIR-NEXT: ret void
+//
+// AMDGCN30-GVAR-LABEL: define dso_local amdgpu_kernel void @KernelLargeOneMember(
+// AMDGCN30-GVAR-SAME: [[STRUCT_LARGESTRUCTONEMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META16:![0-9]+]] !kernel_arg_base_type [[META16]] !kernel_arg_type_qual [[META13]] {
+// AMDGCN30-GVAR-NEXT: [[ENTRY:.*:]]
+// AMDGCN30-GVAR-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5)
+// AMDGCN30-GVAR-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN30-GVAR-NEXT: [[TMP1:%.*]] = extractvalue [[STRUCT_LARGESTRUCTONEMEMBER]] [[U_COERCE]], 0
+// AMDGCN30-GVAR-NEXT: store [100 x <2 x i32>] [[TMP1]], ptr addrspace(5) [[TMP0]], align 8
+// AMDGCN30-GVAR-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[U]]) #[[ATTR3]]
+// AMDGCN30-GVAR-NEXT: ret void
+//
+// AMDGCN30-LABEL: define dso_local amdgpu_kernel void @KernelLargeOneMember(
+// AMDGCN30-SAME: [[STRUCT_LARGESTRUCTONEMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META16:![0-9]+]] !kernel_arg_base_type [[META16]] !kernel_arg_type_qual [[META13]] {
+// AMDGCN30-NEXT: [[ENTRY:.*:]]
+// AMDGCN30-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5)
+// AMDGCN30-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN30-NEXT: [[TMP1:%.*]] = extractvalue [[STRUCT_LARGESTRUCTONEMEMBER]] [[U_COERCE]], 0
+// AMDGCN30-NEXT: store [100 x <2 x i32>] [[TMP1]], ptr addrspace(5) [[TMP0]], align 8
+// AMDGCN30-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[U]]) #[[ATTR3]]
+// AMDGCN30-NEXT: ret void
+//
 kernel void KernelLargeOneMember(struct LargeStructOneMember u) {
   FuncOneLargeMember(u);
 }
addrspace(5) [[TMP1]], align 8 +// AMDGCN-NEXT: store <2 x i32> zeroinitializer, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8 +// AMDGCN-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8 +// AMDGCN-NEXT: [[Y:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1 +// AMDGCN-NEXT: store <2 x i32> [[TMP2]], ptr addrspace(5) [[Y]], align 8 +// AMDGCN-NEXT: ret void +// +// AMDGCN20-LABEL: define dso_local void @FuncTwoMember( +// AMDGCN20-SAME: <2 x i32> [[U_COERCE0:%.*]], <2 x i32> [[U_COERCE1:%.*]]) #[[ATTR0]] { +// AMDGCN20-NEXT: [[ENTRY:.*:]] +// AMDGCN20-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER:%.*]], align 8, addrspace(5) +// AMDGCN20-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8, addrspace(5) +// AMDGCN20-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 +// AMDGCN20-NEXT: store <2 x i32> [[U_COERCE0]], ptr addrspace(5) [[TMP0]], align 8 +// AMDGCN20-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1 +// AMDGCN20-NEXT: store <2 x i32> [[U_COERCE1]], ptr addrspace(5) [[TMP1]], align 8 +// AMDGCN20-NEXT: store <2 x i32> zeroinitializer, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8 +// AMDGCN20-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8 +// AMDGCN20-NEXT: [[Y:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1 +// AMDGCN20-NEXT: store <2 x i32> [[TMP2]], ptr addrspace(5) [[Y]], align 8 +// AMDGCN20-NEXT: ret void +// +// SPIR-LABEL: define dso_local spir_func void @FuncTwoMember( +// SPIR-SAME: ptr noundef byval([[STRUCT_STRUCTTWOMEMBER:%.*]]) align 8 [[U:%.*]]) #[[ATTR0]] { +// SPIR-NEXT: [[ENTRY:.*:]] +// SPIR-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8 +// SPIR-NEXT: store <2 x i32> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL]], align 8 +// SPIR-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[DOTCOMPOUNDLITERAL]], align 8 +// SPIR-NEXT: [[Y:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr [[U]], i32 0, i32 1 +// SPIR-NEXT: store <2 x i32> [[TMP0]], ptr [[Y]], align 8 +// SPIR-NEXT: ret void +// +// AMDGCN30-GVAR-LABEL: define dso_local void @FuncTwoMember( +// AMDGCN30-GVAR-SAME: <2 x i32> [[U_COERCE0:%.*]], <2 x i32> [[U_COERCE1:%.*]]) #[[ATTR0]] { +// AMDGCN30-GVAR-NEXT: [[ENTRY:.*:]] +// AMDGCN30-GVAR-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER:%.*]], align 8, addrspace(5) +// AMDGCN30-GVAR-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8, addrspace(5) +// AMDGCN30-GVAR-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 +// AMDGCN30-GVAR-NEXT: store <2 x i32> [[U_COERCE0]], ptr addrspace(5) [[TMP0]], align 8 +// AMDGCN30-GVAR-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1 +// AMDGCN30-GVAR-NEXT: store <2 x i32> [[U_COERCE1]], ptr addrspace(5) [[TMP1]], align 8 +// AMDGCN30-GVAR-NEXT: store <2 x i32> zeroinitializer, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8 +// AMDGCN30-GVAR-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8 +// AMDGCN30-GVAR-NEXT: [[Y:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1 +// AMDGCN30-GVAR-NEXT: store <2 x i32> [[TMP2]], ptr addrspace(5) [[Y]], align 8 +// 
AMDGCN30-GVAR-NEXT: ret void +// +// AMDGCN30-LABEL: define dso_local void @FuncTwoMember( +// AMDGCN30-SAME: <2 x i32> [[U_COERCE0:%.*]], <2 x i32> [[U_COERCE1:%.*]]) #[[ATTR0]] { +// AMDGCN30-NEXT: [[ENTRY:.*:]] +// AMDGCN30-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER:%.*]], align 8, addrspace(5) +// AMDGCN30-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8, addrspace(5) +// AMDGCN30-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 +// AMDGCN30-NEXT: store <2 x i32> [[U_COERCE0]], ptr addrspace(5) [[TMP0]], align 8 +// AMDGCN30-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1 +// AMDGCN30-NEXT: store <2 x i32> [[U_COERCE1]], ptr addrspace(5) [[TMP1]], align 8 +// AMDGCN30-NEXT: store <2 x i32> zeroinitializer, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8 +// AMDGCN30-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8 +// AMDGCN30-NEXT: [[Y:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1 +// AMDGCN30-NEXT: store <2 x i32> [[TMP2]], ptr addrspace(5) [[Y]], align 8 +// AMDGCN30-NEXT: ret void +// void FuncTwoMember(struct StructTwoMember u) { u.y = (int2)(0, 0); } -// AMDGCN-LABEL: define dso_local void @FuncLargeTwoMember -// AMDGCN-SAME: (ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER:%.*]]) align 8 [[TMP0:%.*]]) -// AMDGCN: %[[U:.*]] = alloca %struct.LargeStructTwoMember, align 8, addrspace(5) -// AMDGCN: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 8 %[[U]], ptr addrspace(5) align 8 [[TMP0]], i64 480, i1 false) +// +// X86-LABEL: define void @FuncLargeTwoMember( +// X86-SAME: ptr noundef byval([[STRUCT_LARGESTRUCTTWOMEMBER:%.*]]) align 4 [[TMP0:%.*]]) #[[ATTR0]] { +// X86-NEXT: [[ENTRY:.*:]] +// X86-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8 +// X86-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8 +// X86-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[U]], ptr align 4 [[TMP0]], i32 480, i1 false) +// X86-NEXT: store <2 x i32> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL]], align 8 +// X86-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[DOTCOMPOUNDLITERAL]], align 8 +// X86-NEXT: [[Y:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr [[U]], i32 0, i32 1 +// X86-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [20 x <2 x i32>], ptr [[Y]], i32 0, i32 0 +// X86-NEXT: store <2 x i32> [[TMP1]], ptr [[ARRAYIDX]], align 8 +// X86-NEXT: ret void +// +// AMDGCN-LABEL: define dso_local void @FuncLargeTwoMember( +// AMDGCN-SAME: ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR0]] { +// AMDGCN-NEXT: [[ENTRY:.*:]] +// AMDGCN-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5) +// AMDGCN-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8, addrspace(5) +// AMDGCN-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 8 [[U]], ptr addrspace(5) align 8 [[TMP0]], i64 480, i1 false) +// AMDGCN-NEXT: store <2 x i32> zeroinitializer, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8 +// AMDGCN-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8 +// AMDGCN-NEXT: [[Y:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1 +// AMDGCN-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [20 x <2 x i32>], ptr addrspace(5) [[Y]], i64 0, 
i64 0 +// AMDGCN-NEXT: store <2 x i32> [[TMP1]], ptr addrspace(5) [[ARRAYIDX]], align 8 +// AMDGCN-NEXT: ret void +// +// AMDGCN20-LABEL: define dso_local void @FuncLargeTwoMember( +// AMDGCN20-SAME: ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR0]] { +// AMDGCN20-NEXT: [[ENTRY:.*:]] +// AMDGCN20-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5) +// AMDGCN20-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8, addrspace(5) +// AMDGCN20-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 8 [[U]], ptr addrspace(5) align 8 [[TMP0]], i64 480, i1 false) +// AMDGCN20-NEXT: store <2 x i32> zeroinitializer, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8 +// AMDGCN20-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8 +// AMDGCN20-NEXT: [[Y:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1 +// AMDGCN20-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [20 x <2 x i32>], ptr addrspace(5) [[Y]], i64 0, i64 0 +// AMDGCN20-NEXT: store <2 x i32> [[TMP1]], ptr addrspace(5) [[ARRAYIDX]], align 8 +// AMDGCN20-NEXT: ret void +// +// SPIR-LABEL: define dso_local spir_func void @FuncLargeTwoMember( +// SPIR-SAME: ptr noundef byval([[STRUCT_LARGESTRUCTTWOMEMBER:%.*]]) align 8 [[U:%.*]]) #[[ATTR0]] { +// SPIR-NEXT: [[ENTRY:.*:]] +// SPIR-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8 +// SPIR-NEXT: store <2 x i32> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL]], align 8 +// SPIR-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[DOTCOMPOUNDLITERAL]], align 8 +// SPIR-NEXT: [[Y:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr [[U]], i32 0, i32 1 +// SPIR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [20 x <2 x i32>], ptr [[Y]], i32 0, i32 0 +// SPIR-NEXT: store <2 x i32> [[TMP0]], ptr [[ARRAYIDX]], align 8 +// SPIR-NEXT: ret void +// +// AMDGCN30-GVAR-LABEL: define dso_local void @FuncLargeTwoMember( +// AMDGCN30-GVAR-SAME: ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR0]] { +// AMDGCN30-GVAR-NEXT: [[ENTRY:.*:]] +// AMDGCN30-GVAR-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5) +// AMDGCN30-GVAR-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8, addrspace(5) +// AMDGCN30-GVAR-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 8 [[U]], ptr addrspace(5) align 8 [[TMP0]], i64 480, i1 false) +// AMDGCN30-GVAR-NEXT: store <2 x i32> zeroinitializer, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8 +// AMDGCN30-GVAR-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8 +// AMDGCN30-GVAR-NEXT: [[Y:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1 +// AMDGCN30-GVAR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [20 x <2 x i32>], ptr addrspace(5) [[Y]], i64 0, i64 0 +// AMDGCN30-GVAR-NEXT: store <2 x i32> [[TMP1]], ptr addrspace(5) [[ARRAYIDX]], align 8 +// AMDGCN30-GVAR-NEXT: ret void +// +// AMDGCN30-LABEL: define dso_local void @FuncLargeTwoMember( +// AMDGCN30-SAME: ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR0]] { +// AMDGCN30-NEXT: [[ENTRY:.*:]] +// AMDGCN30-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5) +// AMDGCN30-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8, addrspace(5) +// AMDGCN30-NEXT: call void 
@llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 8 [[U]], ptr addrspace(5) align 8 [[TMP0]], i64 480, i1 false) +// AMDGCN30-NEXT: store <2 x i32> zeroinitializer, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8 +// AMDGCN30-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8 +// AMDGCN30-NEXT: [[Y:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1 +// AMDGCN30-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [20 x <2 x i32>], ptr addrspace(5) [[Y]], i64 0, i64 0 +// AMDGCN30-NEXT: store <2 x i32> [[TMP1]], ptr addrspace(5) [[ARRAYIDX]], align 8 +// AMDGCN30-NEXT: ret void +// void FuncLargeTwoMember(struct LargeStructTwoMember u) { u.y[0] = (int2)(0, 0); } -// AMDGCN-LABEL: define{{.*}} amdgpu_kernel void @KernelTwoMember -// AMDGCN-SAME: (%struct.StructTwoMember %[[u_coerce:.*]]) -// AMDGCN: %[[u:.*]] = alloca %struct.StructTwoMember, align 8, addrspace(5) -// AMDGCN: %[[LD0:.*]] = load <2 x i32>, ptr addrspace(5) -// AMDGCN: %[[LD1:.*]] = load <2 x i32>, ptr addrspace(5) -// AMDGCN: call void @FuncTwoMember(<2 x i32> %[[LD0]], <2 x i32> %[[LD1]]) +// +// X86-LABEL: define spir_kernel void @KernelTwoMember( +// X86-SAME: ptr noundef byval([[STRUCT_STRUCTTWOMEMBER:%.*]]) align 8 [[U:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META17:![0-9]+]] !kernel_arg_base_type [[META17]] !kernel_arg_type_qual [[META13]] { +// X86-NEXT: [[ENTRY:.*:]] +// X86-NEXT: call void @FuncTwoMember(ptr noundef byval([[STRUCT_STRUCTTWOMEMBER]]) align 4 [[U]]) #[[ATTR3]] +// X86-NEXT: ret void +// +// AMDGCN-LABEL: define dso_local amdgpu_kernel void @KernelTwoMember( +// AMDGCN-SAME: [[STRUCT_STRUCTTWOMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META17:![0-9]+]] !kernel_arg_base_type [[META17]] !kernel_arg_type_qual [[META13]] { +// AMDGCN-NEXT: [[ENTRY:.*:]] +// AMDGCN-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER]], align 8, addrspace(5) +// AMDGCN-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 +// AMDGCN-NEXT: [[TMP1:%.*]] = extractvalue [[STRUCT_STRUCTTWOMEMBER]] [[U_COERCE]], 0 +// AMDGCN-NEXT: store <2 x i32> [[TMP1]], ptr addrspace(5) [[TMP0]], align 8 +// AMDGCN-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1 +// AMDGCN-NEXT: [[TMP3:%.*]] = extractvalue [[STRUCT_STRUCTTWOMEMBER]] [[U_COERCE]], 1 +// AMDGCN-NEXT: store <2 x i32> [[TMP3]], ptr addrspace(5) [[TMP2]], align 8 +// AMDGCN-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 +// AMDGCN-NEXT: [[TMP5:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP4]], align 8 +// AMDGCN-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1 +// AMDGCN-NEXT: [[TMP7:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP6]], align 8 +// AMDGCN-NEXT: call void @FuncTwoMember(<2 x i32> [[TMP5]], <2 x i32> [[TMP7]]) #[[ATTR3]] +// AMDGCN-NEXT: ret void +// +// AMDGCN20-LABEL: define dso_local amdgpu_kernel void @KernelTwoMember( +// AMDGCN20-SAME: [[STRUCT_STRUCTTWOMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META17:![0-9]+]] !kernel_arg_base_type [[META17]] !kernel_arg_type_qual [[META13]] { +// AMDGCN20-NEXT: 
[[ENTRY:.*:]] +// AMDGCN20-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER]], align 8, addrspace(5) +// AMDGCN20-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 +// AMDGCN20-NEXT: [[TMP1:%.*]] = extractvalue [[STRUCT_STRUCTTWOMEMBER]] [[U_COERCE]], 0 +// AMDGCN20-NEXT: store <2 x i32> [[TMP1]], ptr addrspace(5) [[TMP0]], align 8 +// AMDGCN20-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1 +// AMDGCN20-NEXT: [[TMP3:%.*]] = extractvalue [[STRUCT_STRUCTTWOMEMBER]] [[U_COERCE]], 1 +// AMDGCN20-NEXT: store <2 x i32> [[TMP3]], ptr addrspace(5) [[TMP2]], align 8 +// AMDGCN20-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 +// AMDGCN20-NEXT: [[TMP5:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP4]], align 8 +// AMDGCN20-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1 +// AMDGCN20-NEXT: [[TMP7:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP6]], align 8 +// AMDGCN20-NEXT: call void @FuncTwoMember(<2 x i32> [[TMP5]], <2 x i32> [[TMP7]]) #[[ATTR3]] +// AMDGCN20-NEXT: ret void +// +// SPIR-LABEL: define dso_local spir_kernel void @KernelTwoMember( +// SPIR-SAME: ptr noundef byval([[STRUCT_STRUCTTWOMEMBER:%.*]]) align 8 [[U:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META9]] !kernel_arg_access_qual [[META10]] !kernel_arg_type [[META16:![0-9]+]] !kernel_arg_base_type [[META16]] !kernel_arg_type_qual [[META12]] { +// SPIR-NEXT: [[ENTRY:.*:]] +// SPIR-NEXT: call spir_func void @FuncTwoMember(ptr noundef byval([[STRUCT_STRUCTTWOMEMBER]]) align 8 [[U]]) #[[ATTR3]] +// SPIR-NEXT: ret void +// +// AMDGCN30-GVAR-LABEL: define dso_local amdgpu_kernel void @KernelTwoMember( +// AMDGCN30-GVAR-SAME: [[STRUCT_STRUCTTWOMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META17:![0-9]+]] !kernel_arg_base_type [[META17]] !kernel_arg_type_qual [[META13]] { +// AMDGCN30-GVAR-NEXT: [[ENTRY:.*:]] +// AMDGCN30-GVAR-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER]], align 8, addrspace(5) +// AMDGCN30-GVAR-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 +// AMDGCN30-GVAR-NEXT: [[TMP1:%.*]] = extractvalue [[STRUCT_STRUCTTWOMEMBER]] [[U_COERCE]], 0 +// AMDGCN30-GVAR-NEXT: store <2 x i32> [[TMP1]], ptr addrspace(5) [[TMP0]], align 8 +// AMDGCN30-GVAR-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1 +// AMDGCN30-GVAR-NEXT: [[TMP3:%.*]] = extractvalue [[STRUCT_STRUCTTWOMEMBER]] [[U_COERCE]], 1 +// AMDGCN30-GVAR-NEXT: store <2 x i32> [[TMP3]], ptr addrspace(5) [[TMP2]], align 8 +// AMDGCN30-GVAR-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 +// AMDGCN30-GVAR-NEXT: [[TMP5:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP4]], align 8 +// AMDGCN30-GVAR-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1 +// AMDGCN30-GVAR-NEXT: [[TMP7:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP6]], align 8 +// AMDGCN30-GVAR-NEXT: call void @FuncTwoMember(<2 x i32> [[TMP5]], <2 x i32> [[TMP7]]) #[[ATTR3]] +// AMDGCN30-GVAR-NEXT: ret void +// +// AMDGCN30-LABEL: define dso_local amdgpu_kernel void @KernelTwoMember( +// AMDGCN30-SAME: [[STRUCT_STRUCTTWOMEMBER:%.*]] 
[[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META17:![0-9]+]] !kernel_arg_base_type [[META17]] !kernel_arg_type_qual [[META13]] { +// AMDGCN30-NEXT: [[ENTRY:.*:]] +// AMDGCN30-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER]], align 8, addrspace(5) +// AMDGCN30-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 +// AMDGCN30-NEXT: [[TMP1:%.*]] = extractvalue [[STRUCT_STRUCTTWOMEMBER]] [[U_COERCE]], 0 +// AMDGCN30-NEXT: store <2 x i32> [[TMP1]], ptr addrspace(5) [[TMP0]], align 8 +// AMDGCN30-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1 +// AMDGCN30-NEXT: [[TMP3:%.*]] = extractvalue [[STRUCT_STRUCTTWOMEMBER]] [[U_COERCE]], 1 +// AMDGCN30-NEXT: store <2 x i32> [[TMP3]], ptr addrspace(5) [[TMP2]], align 8 +// AMDGCN30-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 +// AMDGCN30-NEXT: [[TMP5:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP4]], align 8 +// AMDGCN30-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1 +// AMDGCN30-NEXT: [[TMP7:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP6]], align 8 +// AMDGCN30-NEXT: call void @FuncTwoMember(<2 x i32> [[TMP5]], <2 x i32> [[TMP7]]) #[[ATTR3]] +// AMDGCN30-NEXT: ret void +// kernel void KernelTwoMember(struct StructTwoMember u) { FuncTwoMember(u); } -// AMDGCN-LABEL: define{{.*}} amdgpu_kernel void @KernelLargeTwoMember -// AMDGCN-SAME: (%struct.LargeStructTwoMember %[[u_coerce:.*]]) -// AMDGCN: %[[u:.*]] = alloca %struct.LargeStructTwoMember, align 8, addrspace(5) -// AMDGCN: %[[U_PTR0:.*]] = getelementptr inbounds nuw %struct.LargeStructTwoMember, ptr addrspace(5) %[[u]], i32 0, i32 0 -// AMDGCN: %[[EXTRACT0:.*]] = extractvalue %struct.LargeStructTwoMember %u.coerce, 0 -// AMDGCN: store [40 x <2 x i32>] %[[EXTRACT0]], ptr addrspace(5) %[[U_PTR0]] -// AMDGCN: %[[U_PTR1:.*]] = getelementptr inbounds nuw %struct.LargeStructTwoMember, ptr addrspace(5) %[[u]], i32 0, i32 1 -// AMDGCN: %[[EXTRACT1:.*]] = extractvalue %struct.LargeStructTwoMember %u.coerce, 1 -// AMDGCN: store [20 x <2 x i32>] %[[EXTRACT1]], ptr addrspace(5) %[[U_PTR1]] -// AMDGCN: call void @FuncLargeTwoMember(ptr addrspace(5) noundef byref(%struct.LargeStructTwoMember) align 8 %[[u]]) +// +// X86-LABEL: define spir_kernel void @KernelLargeTwoMember( +// X86-SAME: ptr noundef byval([[STRUCT_LARGESTRUCTTWOMEMBER:%.*]]) align 8 [[U:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META18:![0-9]+]] !kernel_arg_base_type [[META18]] !kernel_arg_type_qual [[META13]] { +// X86-NEXT: [[ENTRY:.*:]] +// X86-NEXT: call void @FuncLargeTwoMember(ptr noundef byval([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 4 [[U]]) #[[ATTR3]] +// X86-NEXT: ret void +// +// AMDGCN-LABEL: define dso_local amdgpu_kernel void @KernelLargeTwoMember( +// AMDGCN-SAME: [[STRUCT_LARGESTRUCTTWOMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META18:![0-9]+]] !kernel_arg_base_type [[META18]] !kernel_arg_type_qual [[META13]] { +// AMDGCN-NEXT: [[ENTRY:.*:]] +// AMDGCN-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5) +// AMDGCN-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 
+// AMDGCN-NEXT: [[TMP1:%.*]] = extractvalue [[STRUCT_LARGESTRUCTTWOMEMBER]] [[U_COERCE]], 0 +// AMDGCN-NEXT: store [40 x <2 x i32>] [[TMP1]], ptr addrspace(5) [[TMP0]], align 8 +// AMDGCN-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1 +// AMDGCN-NEXT: [[TMP3:%.*]] = extractvalue [[STRUCT_LARGESTRUCTTWOMEMBER]] [[U_COERCE]], 1 +// AMDGCN-NEXT: store [20 x <2 x i32>] [[TMP3]], ptr addrspace(5) [[TMP2]], align 8 +// AMDGCN-NEXT: call void @FuncLargeTwoMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 8 [[U]]) #[[ATTR3]] +// AMDGCN-NEXT: ret void +// +// AMDGCN20-LABEL: define dso_local amdgpu_kernel void @KernelLargeTwoMember( +// AMDGCN20-SAME: [[STRUCT_LARGESTRUCTTWOMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META18:![0-9]+]] !kernel_arg_base_type [[META18]] !kernel_arg_type_qual [[META13]] { +// AMDGCN20-NEXT: [[ENTRY:.*:]] +// AMDGCN20-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5) +// AMDGCN20-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 +// AMDGCN20-NEXT: [[TMP1:%.*]] = extractvalue [[STRUCT_LARGESTRUCTTWOMEMBER]] [[U_COERCE]], 0 +// AMDGCN20-NEXT: store [40 x <2 x i32>] [[TMP1]], ptr addrspace(5) [[TMP0]], align 8 +// AMDGCN20-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1 +// AMDGCN20-NEXT: [[TMP3:%.*]] = extractvalue [[STRUCT_LARGESTRUCTTWOMEMBER]] [[U_COERCE]], 1 +// AMDGCN20-NEXT: store [20 x <2 x i32>] [[TMP3]], ptr addrspace(5) [[TMP2]], align 8 +// AMDGCN20-NEXT: call void @FuncLargeTwoMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 8 [[U]]) #[[ATTR3]] +// AMDGCN20-NEXT: ret void +// +// SPIR-LABEL: define dso_local spir_kernel void @KernelLargeTwoMember( +// SPIR-SAME: ptr noundef byval([[STRUCT_LARGESTRUCTTWOMEMBER:%.*]]) align 8 [[U:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META9]] !kernel_arg_access_qual [[META10]] !kernel_arg_type [[META17:![0-9]+]] !kernel_arg_base_type [[META17]] !kernel_arg_type_qual [[META12]] { +// SPIR-NEXT: [[ENTRY:.*:]] +// SPIR-NEXT: call spir_func void @FuncLargeTwoMember(ptr noundef byval([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 8 [[U]]) #[[ATTR3]] +// SPIR-NEXT: ret void +// +// AMDGCN30-GVAR-LABEL: define dso_local amdgpu_kernel void @KernelLargeTwoMember( +// AMDGCN30-GVAR-SAME: [[STRUCT_LARGESTRUCTTWOMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META18:![0-9]+]] !kernel_arg_base_type [[META18]] !kernel_arg_type_qual [[META13]] { +// AMDGCN30-GVAR-NEXT: [[ENTRY:.*:]] +// AMDGCN30-GVAR-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5) +// AMDGCN30-GVAR-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 +// AMDGCN30-GVAR-NEXT: [[TMP1:%.*]] = extractvalue [[STRUCT_LARGESTRUCTTWOMEMBER]] [[U_COERCE]], 0 +// AMDGCN30-GVAR-NEXT: store [40 x <2 x i32>] [[TMP1]], ptr addrspace(5) [[TMP0]], align 8 +// AMDGCN30-GVAR-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1 +// AMDGCN30-GVAR-NEXT: [[TMP3:%.*]] = extractvalue [[STRUCT_LARGESTRUCTTWOMEMBER]] [[U_COERCE]], 1 +// AMDGCN30-GVAR-NEXT: store [20 x <2 x i32>] [[TMP3]], ptr 
addrspace(5) [[TMP2]], align 8
+// AMDGCN30-GVAR-NEXT: call void @FuncLargeTwoMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 8 [[U]]) #[[ATTR3]]
+// AMDGCN30-GVAR-NEXT: ret void
+//
+// AMDGCN30-LABEL: define dso_local amdgpu_kernel void @KernelLargeTwoMember(
+// AMDGCN30-SAME: [[STRUCT_LARGESTRUCTTWOMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META18:![0-9]+]] !kernel_arg_base_type [[META18]] !kernel_arg_type_qual [[META13]] {
+// AMDGCN30-NEXT: [[ENTRY:.*:]]
+// AMDGCN30-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5)
+// AMDGCN30-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN30-NEXT: [[TMP1:%.*]] = extractvalue [[STRUCT_LARGESTRUCTTWOMEMBER]] [[U_COERCE]], 0
+// AMDGCN30-NEXT: store [40 x <2 x i32>] [[TMP1]], ptr addrspace(5) [[TMP0]], align 8
+// AMDGCN30-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1
+// AMDGCN30-NEXT: [[TMP3:%.*]] = extractvalue [[STRUCT_LARGESTRUCTTWOMEMBER]] [[U_COERCE]], 1
+// AMDGCN30-NEXT: store [20 x <2 x i32>] [[TMP3]], ptr addrspace(5) [[TMP2]], align 8
+// AMDGCN30-NEXT: call void @FuncLargeTwoMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 8 [[U]]) #[[ATTR3]]
+// AMDGCN30-NEXT: ret void
+//
 kernel void KernelLargeTwoMember(struct LargeStructTwoMember u) {
   FuncLargeTwoMember(u);
 }
+//.
+// X86: [[META4]] = !{i32 1, i32 1}
+// X86: [[META5]] = !{!"none", !"none"}
+// X86: [[META6]] = !{!"Mat3X3*", !"Mat4X4*"}
+// X86: [[META7]] = !{!"", !""}
+// X86: [[META8]] = !{!"Mat32X32*", !"Mat64X64*"}
+// X86: [[META9]] = !{}
+// X86: [[META10]] = !{i32 0}
+// X86: [[META11]] = !{!"none"}
+// X86: [[META12]] = !{!"struct StructOneMember"}
+// X86: [[META13]] = !{!""}
+// X86: [[META14]] = !{i32 1}
+// X86: [[META15]] = !{!"struct StructOneMember*"}
+// X86: [[META16]] = !{!"struct LargeStructOneMember"}
+// X86: [[META17]] = !{!"struct StructTwoMember"}
+// X86: [[META18]] = !{!"struct LargeStructTwoMember"}
+//.
+// AMDGCN: [[META4]] = !{i32 1, i32 1}
+// AMDGCN: [[META5]] = !{!"none", !"none"}
+// AMDGCN: [[META6]] = !{!"Mat3X3*", !"Mat4X4*"}
+// AMDGCN: [[META7]] = !{!"", !""}
+// AMDGCN: [[META8]] = !{!"Mat32X32*", !"Mat64X64*"}
+// AMDGCN: [[META9]] = !{}
+// AMDGCN: [[META10]] = !{i32 0}
+// AMDGCN: [[META11]] = !{!"none"}
+// AMDGCN: [[META12]] = !{!"struct StructOneMember"}
+// AMDGCN: [[META13]] = !{!""}
+// AMDGCN: [[META14]] = !{i32 1}
+// AMDGCN: [[META15]] = !{!"struct StructOneMember*"}
+// AMDGCN: [[META16]] = !{!"struct LargeStructOneMember"}
+// AMDGCN: [[META17]] = !{!"struct StructTwoMember"}
+// AMDGCN: [[META18]] = !{!"struct LargeStructTwoMember"}
+//.
+// AMDGCN20: [[META4]] = !{i32 1, i32 1}
+// AMDGCN20: [[META5]] = !{!"none", !"none"}
+// AMDGCN20: [[META6]] = !{!"Mat3X3*", !"Mat4X4*"}
+// AMDGCN20: [[META7]] = !{!"", !""}
+// AMDGCN20: [[META8]] = !{!"Mat32X32*", !"Mat64X64*"}
+// AMDGCN20: [[META9]] = !{}
+// AMDGCN20: [[META10]] = !{i32 0}
+// AMDGCN20: [[META11]] = !{!"none"}
+// AMDGCN20: [[META12]] = !{!"struct StructOneMember"}
+// AMDGCN20: [[META13]] = !{!""}
+// AMDGCN20: [[META14]] = !{i32 1}
+// AMDGCN20: [[META15]] = !{!"struct StructOneMember*"}
+// AMDGCN20: [[META16]] = !{!"struct LargeStructOneMember"}
+// AMDGCN20: [[META17]] = !{!"struct StructTwoMember"}
+// AMDGCN20: [[META18]] = !{!"struct LargeStructTwoMember"}
+//.
+// SPIR: [[META3]] = !{i32 1, i32 1}
+// SPIR: [[META4]] = !{!"none", !"none"}
+// SPIR: [[META5]] = !{!"Mat3X3*", !"Mat4X4*"}
+// SPIR: [[META6]] = !{!"", !""}
+// SPIR: [[META7]] = !{!"Mat32X32*", !"Mat64X64*"}
+// SPIR: [[META8]] = !{}
+// SPIR: [[META9]] = !{i32 0}
+// SPIR: [[META10]] = !{!"none"}
+// SPIR: [[META11]] = !{!"struct StructOneMember"}
+// SPIR: [[META12]] = !{!""}
+// SPIR: [[META13]] = !{i32 1}
+// SPIR: [[META14]] = !{!"struct StructOneMember*"}
+// SPIR: [[META15]] = !{!"struct LargeStructOneMember"}
+// SPIR: [[META16]] = !{!"struct StructTwoMember"}
+// SPIR: [[META17]] = !{!"struct LargeStructTwoMember"}
+//.
+// AMDGCN30-GVAR: [[META4]] = !{i32 1, i32 1}
+// AMDGCN30-GVAR: [[META5]] = !{!"none", !"none"}
+// AMDGCN30-GVAR: [[META6]] = !{!"Mat3X3*", !"Mat4X4*"}
+// AMDGCN30-GVAR: [[META7]] = !{!"", !""}
+// AMDGCN30-GVAR: [[META8]] = !{!"Mat32X32*", !"Mat64X64*"}
+// AMDGCN30-GVAR: [[META9]] = !{}
+// AMDGCN30-GVAR: [[META10]] = !{i32 0}
+// AMDGCN30-GVAR: [[META11]] = !{!"none"}
+// AMDGCN30-GVAR: [[META12]] = !{!"struct StructOneMember"}
+// AMDGCN30-GVAR: [[META13]] = !{!""}
+// AMDGCN30-GVAR: [[META14]] = !{i32 1}
+// AMDGCN30-GVAR: [[META15]] = !{!"struct StructOneMember*"}
+// AMDGCN30-GVAR: [[META16]] = !{!"struct LargeStructOneMember"}
+// AMDGCN30-GVAR: [[META17]] = !{!"struct StructTwoMember"}
+// AMDGCN30-GVAR: [[META18]] = !{!"struct LargeStructTwoMember"}
+//.
+// AMDGCN30: [[META4]] = !{i32 1, i32 1}
+// AMDGCN30: [[META5]] = !{!"none", !"none"}
+// AMDGCN30: [[META6]] = !{!"Mat3X3*", !"Mat4X4*"}
+// AMDGCN30: [[META7]] = !{!"", !""}
+// AMDGCN30: [[META8]] = !{!"Mat32X32*", !"Mat64X64*"}
+// AMDGCN30: [[META9]] = !{}
+// AMDGCN30: [[META10]] = !{i32 0}
+// AMDGCN30: [[META11]] = !{!"none"}
+// AMDGCN30: [[META12]] = !{!"struct StructOneMember"}
+// AMDGCN30: [[META13]] = !{!""}
+// AMDGCN30: [[META14]] = !{i32 1}
+// AMDGCN30: [[META15]] = !{!"struct StructOneMember*"}
+// AMDGCN30: [[META16]] = !{!"struct LargeStructOneMember"}
+// AMDGCN30: [[META17]] = !{!"struct StructTwoMember"}
+// AMDGCN30: [[META18]] = !{!"struct LargeStructTwoMember"}
+//.
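// The regenerated check blocks above all pin down the same by-value struct
// ABI on amdgcn: an aggregate small enough to coerce travels as plain vector
// values (<2 x i32> pieces), while a large aggregate is first copied into a
// private (addrspace(5)) alloca and then handed to the callee as a byref
// pointer. A minimal OpenCL sketch of that split, using hypothetical type and
// function names (the test's real definitions appear earlier in the file):
//
//   typedef struct { int2 x; } Small;        // coerced: passed as <2 x i32>
//   typedef struct { int2 x[100]; } Large;   // indirect: alloca + byref copy
//   void TakesSmall(Small s);
//   void TakesLarge(Large s);
//   kernel void sketch(Small a, Large b) {
//     TakesSmall(a); // emits: call void @TakesSmall(<2 x i32> ...)
//     TakesLarge(b); // emits: call void @TakesLarge(ptr addrspace(5) byref(...) ...)
//   }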
diff --git a/clang/test/CodeGenOpenCL/amdgcn-automatic-variable.cl b/clang/test/CodeGenOpenCL/amdgcn-automatic-variable.cl index f26495bc44aa3a731e1b918c5b34ba1fe1347e9a..c847f5850b2231aa3487539f64b0f61a9513a61f 100644 --- a/clang/test/CodeGenOpenCL/amdgcn-automatic-variable.cl +++ b/clang/test/CodeGenOpenCL/amdgcn-automatic-variable.cl @@ -1,67 +1,107 @@ -// RUN: %clang_cc1 -O0 -cl-std=CL1.2 -triple amdgcn---amdgizcl -emit-llvm %s -o - | FileCheck -check-prefixes=CHECK,CL12 %s -// RUN: %clang_cc1 -O0 -cl-std=CL2.0 -triple amdgcn---amdgizcl -emit-llvm %s -o - | FileCheck -check-prefixes=CHECK,CL20 %s +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// RUN: %clang_cc1 -O0 -cl-std=CL1.2 -triple amdgcn-amd-amdhsa -emit-llvm %s -o - | FileCheck -check-prefixes=CL12 %s +// RUN: %clang_cc1 -O0 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -emit-llvm %s -o - | FileCheck -check-prefixes=CL20 %s -// CL12-LABEL: define{{.*}} void @func1(ptr addrspace(5) noundef %x) -// CL20-LABEL: define{{.*}} void @func1(ptr noundef %x) +// CL12-LABEL: define dso_local void @func1( +// CL12-SAME: ptr addrspace(5) noundef [[X:%.*]]) #[[ATTR0:[0-9]+]] { +// CL12-NEXT: [[ENTRY:.*:]] +// CL12-NEXT: [[X_ADDR:%.*]] = alloca ptr addrspace(5), align 4, addrspace(5) +// CL12-NEXT: store ptr addrspace(5) [[X]], ptr addrspace(5) [[X_ADDR]], align 4 +// CL12-NEXT: [[TMP0:%.*]] = load ptr addrspace(5), ptr addrspace(5) [[X_ADDR]], align 4 +// CL12-NEXT: store i32 1, ptr addrspace(5) [[TMP0]], align 4 +// CL12-NEXT: ret void +// +// CL20-LABEL: define dso_local void @func1( +// CL20-SAME: ptr noundef [[X:%.*]]) #[[ATTR0:[0-9]+]] { +// CL20-NEXT: [[ENTRY:.*:]] +// CL20-NEXT: [[X_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CL20-NEXT: store ptr [[X]], ptr addrspace(5) [[X_ADDR]], align 8 +// CL20-NEXT: [[TMP0:%.*]] = load ptr, ptr addrspace(5) [[X_ADDR]], align 8 +// CL20-NEXT: store i32 1, ptr [[TMP0]], align 4 +// CL20-NEXT: ret void +// void func1(int *x) { - // CL12: %[[x_addr:.*]] = alloca ptr addrspace(5){{.*}}addrspace(5) - // CL12: store ptr addrspace(5) %x, ptr addrspace(5) %[[x_addr]] - // CL12: %[[r0:.*]] = load ptr addrspace(5), ptr addrspace(5) %[[x_addr]] - // CL12: store i32 1, ptr addrspace(5) %[[r0]] - // CL20: %[[x_addr:.*]] = alloca ptr{{.*}}addrspace(5) - // CL20: store ptr %x, ptr addrspace(5) %[[x_addr]] - // CL20: %[[r0:.*]] = load ptr, ptr addrspace(5) %[[x_addr]] - // CL20: store i32 1, ptr %[[r0]] *x = 1; } -// CHECK-LABEL: define{{.*}} void @func2() +// CL12-LABEL: define dso_local void @func2( +// CL12-SAME: ) #[[ATTR0]] { +// CL12-NEXT: [[ENTRY:.*:]] +// CL12-NEXT: [[LV1:%.*]] = alloca i32, align 4, addrspace(5) +// CL12-NEXT: [[LV2:%.*]] = alloca i32, align 4, addrspace(5) +// CL12-NEXT: [[LA:%.*]] = alloca [100 x i32], align 4, addrspace(5) +// CL12-NEXT: [[LP1:%.*]] = alloca ptr addrspace(5), align 4, addrspace(5) +// CL12-NEXT: [[LP2:%.*]] = alloca ptr addrspace(5), align 4, addrspace(5) +// CL12-NEXT: [[LVC:%.*]] = alloca i32, align 4, addrspace(5) +// CL12-NEXT: store i32 1, ptr addrspace(5) [[LV1]], align 4 +// CL12-NEXT: store i32 2, ptr addrspace(5) [[LV2]], align 4 +// CL12-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [100 x i32], ptr addrspace(5) [[LA]], i64 0, i64 0 +// CL12-NEXT: store i32 3, ptr addrspace(5) [[ARRAYIDX]], align 4 +// CL12-NEXT: store ptr addrspace(5) [[LV1]], ptr addrspace(5) [[LP1]], align 4 +// CL12-NEXT: [[ARRAYDECAY:%.*]] = getelementptr inbounds [100 x i32], ptr addrspace(5) [[LA]], i64 0, i64 0 +// CL12-NEXT: 
store ptr addrspace(5) [[ARRAYDECAY]], ptr addrspace(5) [[LP2]], align 4 +// CL12-NEXT: call void @func1(ptr addrspace(5) noundef [[LV1]]) #[[ATTR2:[0-9]+]] +// CL12-NEXT: store i32 4, ptr addrspace(5) [[LVC]], align 4 +// CL12-NEXT: store i32 4, ptr addrspace(5) [[LV1]], align 4 +// CL12-NEXT: ret void +// +// CL20-LABEL: define dso_local void @func2( +// CL20-SAME: ) #[[ATTR0]] { +// CL20-NEXT: [[ENTRY:.*:]] +// CL20-NEXT: [[LV1:%.*]] = alloca i32, align 4, addrspace(5) +// CL20-NEXT: [[LV2:%.*]] = alloca i32, align 4, addrspace(5) +// CL20-NEXT: [[LA:%.*]] = alloca [100 x i32], align 4, addrspace(5) +// CL20-NEXT: [[LP1:%.*]] = alloca ptr, align 8, addrspace(5) +// CL20-NEXT: [[LP2:%.*]] = alloca ptr, align 8, addrspace(5) +// CL20-NEXT: [[LVC:%.*]] = alloca i32, align 4, addrspace(5) +// CL20-NEXT: store i32 1, ptr addrspace(5) [[LV1]], align 4 +// CL20-NEXT: store i32 2, ptr addrspace(5) [[LV2]], align 4 +// CL20-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [100 x i32], ptr addrspace(5) [[LA]], i64 0, i64 0 +// CL20-NEXT: store i32 3, ptr addrspace(5) [[ARRAYIDX]], align 4 +// CL20-NEXT: [[LV1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[LV1]] to ptr +// CL20-NEXT: store ptr [[LV1_ASCAST]], ptr addrspace(5) [[LP1]], align 8 +// CL20-NEXT: [[ARRAYDECAY:%.*]] = getelementptr inbounds [100 x i32], ptr addrspace(5) [[LA]], i64 0, i64 0 +// CL20-NEXT: [[ARRAYDECAY_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ARRAYDECAY]] to ptr +// CL20-NEXT: store ptr [[ARRAYDECAY_ASCAST]], ptr addrspace(5) [[LP2]], align 8 +// CL20-NEXT: [[LV1_ASCAST1:%.*]] = addrspacecast ptr addrspace(5) [[LV1]] to ptr +// CL20-NEXT: call void @func1(ptr noundef [[LV1_ASCAST1]]) #[[ATTR2:[0-9]+]] +// CL20-NEXT: store i32 4, ptr addrspace(5) [[LVC]], align 4 +// CL20-NEXT: store i32 4, ptr addrspace(5) [[LV1]], align 4 +// CL20-NEXT: ret void +// void func2(void) { - // CHECK: %lv1 = alloca i32, align 4, addrspace(5) - // CHECK: %lv2 = alloca i32, align 4, addrspace(5) - // CHECK: %la = alloca [100 x i32], align 4, addrspace(5) - // CL12: %lp1 = alloca ptr addrspace(5), align 4, addrspace(5) - // CL12: %lp2 = alloca ptr addrspace(5), align 4, addrspace(5) - // CL20: %lp1 = alloca ptr, align 8, addrspace(5) - // CL20: %lp2 = alloca ptr, align 8, addrspace(5) - // CHECK: %lvc = alloca i32, align 4, addrspace(5) - - // CHECK: store i32 1, ptr addrspace(5) %lv1 int lv1; lv1 = 1; - // CHECK: store i32 2, ptr addrspace(5) %lv2 + int lv2 = 2; - // CHECK: %[[arrayidx:.*]] = getelementptr inbounds [100 x i32], ptr addrspace(5) %la, i64 0, i64 0 - // CHECK: store i32 3, ptr addrspace(5) %[[arrayidx]], align 4 int la[100]; la[0] = 3; - // CL12: store ptr addrspace(5) %lv1, ptr addrspace(5) %lp1, align 4 - // CL20: %[[r0:.*]] = addrspacecast ptr addrspace(5) %lv1 to ptr - // CL20: store ptr %[[r0]], ptr addrspace(5) %lp1, align 8 int *lp1 = &lv1; - // CHECK: %[[arraydecay:.*]] = getelementptr inbounds [100 x i32], ptr addrspace(5) %la, i64 0, i64 0 - // CL12: store ptr addrspace(5) %[[arraydecay]], ptr addrspace(5) %lp2, align 4 - // CL20: %[[r1:.*]] = addrspacecast ptr addrspace(5) %[[arraydecay]] to ptr - // CL20: store ptr %[[r1]], ptr addrspace(5) %lp2, align 8 int *lp2 = la; - // CL12: call void @func1(ptr addrspace(5) noundef %lv1) - // CL20: %[[r2:.*]] = addrspacecast ptr addrspace(5) %lv1 to ptr - // CL20: call void @func1(ptr noundef %[[r2]]) func1(&lv1); - // CHECK: store i32 4, ptr addrspace(5) %lvc - // CHECK: store i32 4, ptr addrspace(5) %lv1 const int lvc = 4; lv1 = lvc; } -// CHECK-LABEL: define{{.*}} void 
@func3() -// CHECK: %a = alloca [16 x [1 x float]], align 4, addrspace(5) -// CHECK: call void @llvm.memset.p5.i64(ptr addrspace(5) align 4 %a, i8 0, i64 64, i1 false) +// CL12-LABEL: define dso_local void @func3( +// CL12-SAME: ) #[[ATTR0]] { +// CL12-NEXT: [[ENTRY:.*:]] +// CL12-NEXT: [[A:%.*]] = alloca [16 x [1 x float]], align 4, addrspace(5) +// CL12-NEXT: call void @llvm.memset.p5.i64(ptr addrspace(5) align 4 [[A]], i8 0, i64 64, i1 false) +// CL12-NEXT: ret void +// +// CL20-LABEL: define dso_local void @func3( +// CL20-SAME: ) #[[ATTR0]] { +// CL20-NEXT: [[ENTRY:.*:]] +// CL20-NEXT: [[A:%.*]] = alloca [16 x [1 x float]], align 4, addrspace(5) +// CL20-NEXT: call void @llvm.memset.p5.i64(ptr addrspace(5) align 4 [[A]], i8 0, i64 64, i1 false) +// CL20-NEXT: ret void +// void func3(void) { float a[16][1] = {{0.}}; } diff --git a/clang/test/CodeGenOpenCL/atomic-ops.cl b/clang/test/CodeGenOpenCL/atomic-ops.cl index 5e2de38ac3d3e3efe5cb54dad7297668e544fbef..9c1775cc04303cb1ba3c77ac87783f8a12a486ee 100644 --- a/clang/test/CodeGenOpenCL/atomic-ops.cl +++ b/clang/test/CodeGenOpenCL/atomic-ops.cl @@ -37,58 +37,58 @@ atomic_int j; void fi1(atomic_int *i) { // CHECK-LABEL: @fi1 - // CHECK: load atomic i32, ptr %{{[.0-9A-Z_a-z]+}} syncscope("workgroup") seq_cst, align 4 + // CHECK: load atomic i32, ptr %{{[.0-9A-Z_a-z]+}} syncscope("workgroup") seq_cst, align 4{{$}} int x = __opencl_atomic_load(i, memory_order_seq_cst, memory_scope_work_group); - // CHECK: load atomic i32, ptr %{{[.0-9A-Z_a-z]+}} syncscope("agent") seq_cst, align 4 + // CHECK: load atomic i32, ptr %{{[.0-9A-Z_a-z]+}} syncscope("agent") seq_cst, align 4{{$}} x = __opencl_atomic_load(i, memory_order_seq_cst, memory_scope_device); - // CHECK: load atomic i32, ptr %{{[.0-9A-Z_a-z]+}} seq_cst, align 4 + // CHECK: load atomic i32, ptr %{{[.0-9A-Z_a-z]+}} seq_cst, align 4{{$}} x = __opencl_atomic_load(i, memory_order_seq_cst, memory_scope_all_svm_devices); - // CHECK: load atomic i32, ptr %{{[.0-9A-Z_a-z]+}} syncscope("wavefront") seq_cst, align 4 + // CHECK: load atomic i32, ptr %{{[.0-9A-Z_a-z]+}} syncscope("wavefront") seq_cst, align 4{{$}} x = __opencl_atomic_load(i, memory_order_seq_cst, memory_scope_sub_group); } void fi2(atomic_int *i) { // CHECK-LABEL: @fi2 - // CHECK: store atomic i32 %{{[.0-9A-Z_a-z]+}}, ptr %{{[.0-9A-Z_a-z]+}} syncscope("workgroup") seq_cst, align 4 + // CHECK: store atomic i32 %{{[.0-9A-Z_a-z]+}}, ptr %{{[.0-9A-Z_a-z]+}} syncscope("workgroup") seq_cst, align 4{{$}} __opencl_atomic_store(i, 1, memory_order_seq_cst, memory_scope_work_group); } void test_addr(global atomic_int *ig, private atomic_int *ip, local atomic_int *il) { // CHECK-LABEL: @test_addr - // CHECK: store atomic i32 %{{[.0-9A-Z_a-z]+}}, ptr addrspace(1) %{{[.0-9A-Z_a-z]+}} syncscope("workgroup") seq_cst, align 4 + // CHECK: store atomic i32 %{{[.0-9A-Z_a-z]+}}, ptr addrspace(1) %{{[.0-9A-Z_a-z]+}} syncscope("workgroup") seq_cst, align 4{{$}} __opencl_atomic_store(ig, 1, memory_order_seq_cst, memory_scope_work_group); - // CHECK: store atomic i32 %{{[.0-9A-Z_a-z]+}}, ptr addrspace(5) %{{[.0-9A-Z_a-z]+}} syncscope("workgroup") seq_cst, align 4 + // CHECK: store atomic i32 %{{[.0-9A-Z_a-z]+}}, ptr addrspace(5) %{{[.0-9A-Z_a-z]+}} syncscope("workgroup") seq_cst, align 4{{$}} __opencl_atomic_store(ip, 1, memory_order_seq_cst, memory_scope_work_group); - // CHECK: store atomic i32 %{{[.0-9A-Z_a-z]+}}, ptr addrspace(3) %{{[.0-9A-Z_a-z]+}} syncscope("workgroup") seq_cst, align 4 + // CHECK: store atomic i32 %{{[.0-9A-Z_a-z]+}}, ptr addrspace(3) 
%{{[.0-9A-Z_a-z]+}} syncscope("workgroup") seq_cst, align 4{{$}} __opencl_atomic_store(il, 1, memory_order_seq_cst, memory_scope_work_group); } void fi3(atomic_int *i, atomic_uint *ui) { // CHECK-LABEL: @fi3 - // CHECK: atomicrmw and ptr %{{[.0-9A-Z_a-z]+}}, i32 %{{[.0-9A-Z_a-z]+}} syncscope("workgroup") seq_cst, align 4 + // CHECK: atomicrmw and ptr %{{[.0-9A-Z_a-z]+}}, i32 %{{[.0-9A-Z_a-z]+}} syncscope("workgroup") seq_cst, align 4, !noalias.addrspace [[$NOPRIVATE:![0-9]+]]{{$}} int x = __opencl_atomic_fetch_and(i, 1, memory_order_seq_cst, memory_scope_work_group); - // CHECK: atomicrmw min ptr %{{[.0-9A-Z_a-z]+}}, i32 %{{[.0-9A-Z_a-z]+}} syncscope("workgroup") seq_cst, align 4 + // CHECK: atomicrmw min ptr %{{[.0-9A-Z_a-z]+}}, i32 %{{[.0-9A-Z_a-z]+}} syncscope("workgroup") seq_cst, align 4, !noalias.addrspace [[$NOPRIVATE]]{{$}} x = __opencl_atomic_fetch_min(i, 1, memory_order_seq_cst, memory_scope_work_group); - // CHECK: atomicrmw max ptr %{{[.0-9A-Z_a-z]+}}, i32 %{{[.0-9A-Z_a-z]+}} syncscope("workgroup") seq_cst, align 4 + // CHECK: atomicrmw max ptr %{{[.0-9A-Z_a-z]+}}, i32 %{{[.0-9A-Z_a-z]+}} syncscope("workgroup") seq_cst, align 4, !noalias.addrspace [[$NOPRIVATE]]{{$}} x = __opencl_atomic_fetch_max(i, 1, memory_order_seq_cst, memory_scope_work_group); - // CHECK: atomicrmw umin ptr %{{[.0-9A-Z_a-z]+}}, i32 %{{[.0-9A-Z_a-z]+}} syncscope("workgroup") seq_cst, align 4 + // CHECK: atomicrmw umin ptr %{{[.0-9A-Z_a-z]+}}, i32 %{{[.0-9A-Z_a-z]+}} syncscope("workgroup") seq_cst, align 4, !noalias.addrspace [[$NOPRIVATE]]{{$}} x = __opencl_atomic_fetch_min(ui, 1, memory_order_seq_cst, memory_scope_work_group); - // CHECK: atomicrmw umax ptr %{{[.0-9A-Z_a-z]+}}, i32 %{{[.0-9A-Z_a-z]+}} syncscope("workgroup") seq_cst, align 4 + // CHECK: atomicrmw umax ptr %{{[.0-9A-Z_a-z]+}}, i32 %{{[.0-9A-Z_a-z]+}} syncscope("workgroup") seq_cst, align 4, !noalias.addrspace [[$NOPRIVATE]]{{$}} x = __opencl_atomic_fetch_max(ui, 1, memory_order_seq_cst, memory_scope_work_group); } bool fi4(atomic_int *i) { // CHECK-LABEL: @fi4( - // CHECK: [[PAIR:%[.0-9A-Z_a-z]+]] = cmpxchg ptr [[PTR:%[.0-9A-Z_a-z]+]], i32 [[EXPECTED:%[.0-9A-Z_a-z]+]], i32 [[DESIRED:%[.0-9A-Z_a-z]+]] syncscope("workgroup-one-as") acquire acquire, align 4 + // CHECK: [[PAIR:%[.0-9A-Z_a-z]+]] = cmpxchg ptr [[PTR:%[.0-9A-Z_a-z]+]], i32 [[EXPECTED:%[.0-9A-Z_a-z]+]], i32 [[DESIRED:%[.0-9A-Z_a-z]+]] syncscope("workgroup-one-as") acquire acquire, align 4, !noalias.addrspace [[$NOPRIVATE]]{{$}} // CHECK: [[OLD:%[.0-9A-Z_a-z]+]] = extractvalue { i32, i1 } [[PAIR]], 0 // CHECK: [[CMP:%[.0-9A-Z_a-z]+]] = extractvalue { i32, i1 } [[PAIR]], 1 // CHECK: br i1 [[CMP]], label %[[STORE_EXPECTED:[.0-9A-Z_a-z]+]], label %[[CONTINUE:[.0-9A-Z_a-z]+]] @@ -105,16 +105,16 @@ void fi5(atomic_int *i, int scope) { // CHECK-NEXT: i32 4, label %[[opencl_subgroup:.*]] // CHECK-NEXT: ] // CHECK: [[opencl_workgroup]]: - // CHECK: load atomic i32, ptr %{{.*}} syncscope("workgroup") seq_cst, align 4 + // CHECK: load atomic i32, ptr %{{.*}} syncscope("workgroup") seq_cst, align 4{{$}} // CHECK: br label %[[continue:.*]] // CHECK: [[opencl_device]]: - // CHECK: load atomic i32, ptr %{{.*}} syncscope("agent") seq_cst, align 4 + // CHECK: load atomic i32, ptr %{{.*}} syncscope("agent") seq_cst, align 4{{$}} // CHECK: br label %[[continue]] // CHECK: [[opencl_allsvmdevices]]: // CHECK: load atomic i32, ptr %{{.*}} seq_cst, align 4 // CHECK: br label %[[continue]] // CHECK: [[opencl_subgroup]]: - // CHECK: load atomic i32, ptr %{{.*}} syncscope("wavefront") seq_cst, align 4 + // 
CHECK: load atomic i32, ptr %{{.*}} syncscope("wavefront") seq_cst, align 4{{$}} // CHECK: br label %[[continue]] // CHECK: [[continue]]: int x = __opencl_atomic_load(i, memory_order_seq_cst, scope); @@ -146,35 +146,35 @@ void fi6(atomic_int *i, int order, int scope) { // CHECK-NEXT: i32 4, label %[[SEQ_SUB:.*]] // CHECK-NEXT: ] // CHECK: [[MON_WG]]: - // CHECK: load atomic i32, ptr %{{.*}} syncscope("workgroup-one-as") monotonic, align 4 + // CHECK: load atomic i32, ptr %{{.*}} syncscope("workgroup-one-as") monotonic, align 4{{$}} // CHECK: [[MON_DEV]]: - // CHECK: load atomic i32, ptr %{{.*}} syncscope("agent-one-as") monotonic, align 4 + // CHECK: load atomic i32, ptr %{{.*}} syncscope("agent-one-as") monotonic, align 4{{$}} // CHECK: [[MON_ALL]]: - // CHECK: load atomic i32, ptr %{{.*}} monotonic, align 4 + // CHECK: load atomic i32, ptr %{{.*}} monotonic, align 4{{$}} // CHECK: [[MON_SUB]]: - // CHECK: load atomic i32, ptr %{{.*}} syncscope("wavefront-one-as") monotonic, align 4 + // CHECK: load atomic i32, ptr %{{.*}} syncscope("wavefront-one-as") monotonic, align 4{{$}} // CHECK: [[ACQ_WG]]: - // CHECK: load atomic i32, ptr %{{.*}} syncscope("workgroup-one-as") acquire, align 4 + // CHECK: load atomic i32, ptr %{{.*}} syncscope("workgroup-one-as") acquire, align 4{{$}} // CHECK: [[ACQ_DEV]]: - // CHECK: load atomic i32, ptr %{{.*}} syncscope("agent-one-as") acquire, align 4 + // CHECK: load atomic i32, ptr %{{.*}} syncscope("agent-one-as") acquire, align 4{{$}} // CHECK: [[ACQ_ALL]]: - // CHECK: load atomic i32, ptr %{{.*}} acquire, align 4 + // CHECK: load atomic i32, ptr %{{.*}} acquire, align 4{{$}} // CHECK: [[ACQ_SUB]]: - // CHECK: load atomic i32, ptr %{{.*}} syncscope("wavefront-one-as") acquire, align 4 + // CHECK: load atomic i32, ptr %{{.*}} syncscope("wavefront-one-as") acquire, align 4{{$}} // CHECK: [[SEQ_WG]]: - // CHECK: load atomic i32, ptr %{{.*}} syncscope("workgroup") seq_cst, align 4 + // CHECK: load atomic i32, ptr %{{.*}} syncscope("workgroup") seq_cst, align 4{{$}} // CHECK: [[SEQ_DEV]]: - // CHECK: load atomic i32, ptr %{{.*}} syncscope("agent") seq_cst, align 4 + // CHECK: load atomic i32, ptr %{{.*}} syncscope("agent") seq_cst, align 4{{$}} // CHECK: [[SEQ_ALL]]: - // CHECK: load atomic i32, ptr %{{.*}} seq_cst, align 4 + // CHECK: load atomic i32, ptr %{{.*}} seq_cst, align 4{{$}} // CHECK: [[SEQ_SUB]]: - // CHECK: load atomic i32, ptr %{{.*}} syncscope("wavefront") seq_cst, align 4 + // CHECK: load atomic i32, ptr %{{.*}} syncscope("wavefront") seq_cst, align 4{{$}} int x = __opencl_atomic_load(i, order, scope); } float ff1(global atomic_float *d) { // CHECK-LABEL: @ff1 - // CHECK: load atomic i32, ptr addrspace(1) {{.*}} syncscope("workgroup-one-as") monotonic, align 4 + // CHECK: load atomic i32, ptr addrspace(1) {{.*}} syncscope("workgroup-one-as") monotonic, align 4{{$}} return __opencl_atomic_load(d, memory_order_relaxed, memory_scope_work_group); } @@ -186,19 +186,31 @@ void ff2(atomic_float *d) { float ff3(atomic_float *d) { // CHECK-LABEL: @ff3 - // CHECK: atomicrmw xchg ptr {{.*}} syncscope("workgroup") seq_cst, align 4 + // CHECK: atomicrmw xchg ptr {{.*}} syncscope("workgroup") seq_cst, align 4, !noalias.addrspace [[$NOPRIVATE]]{{$}} return __opencl_atomic_exchange(d, 2, memory_order_seq_cst, memory_scope_work_group); } float ff4(global atomic_float *d, float a) { // CHECK-LABEL: @ff4 - // CHECK: atomicrmw fadd ptr addrspace(1) {{.*}} syncscope("workgroup-one-as") monotonic + // CHECK: atomicrmw fadd ptr addrspace(1) {{.*}} 
syncscope("workgroup-one-as") monotonic, align 4{{$}} return __opencl_atomic_fetch_add(d, a, memory_order_relaxed, memory_scope_work_group); } float ff5(global atomic_double *d, double a) { // CHECK-LABEL: @ff5 - // CHECK: atomicrmw fadd ptr addrspace(1) {{.*}} syncscope("workgroup-one-as") monotonic + // CHECK: atomicrmw fadd ptr addrspace(1) {{.*}} syncscope("workgroup-one-as") monotonic, align 8{{$}} + return __opencl_atomic_fetch_add(d, a, memory_order_relaxed, memory_scope_work_group); +} + +float ff4_generic(atomic_float *d, float a) { + // CHECK-LABEL: @ff4_generic + // CHECK: atomicrmw fadd ptr {{.*}} syncscope("workgroup-one-as") monotonic, align 4, !noalias.addrspace [[$NOPRIVATE]]{{$}} + return __opencl_atomic_fetch_add(d, a, memory_order_relaxed, memory_scope_work_group); +} + +float ff5_generic(atomic_double *d, double a) { + // CHECK-LABEL: @ff5_generic + // CHECK: atomicrmw fadd ptr {{.*}} syncscope("workgroup-one-as") monotonic, align 8, !noalias.addrspace [[$NOPRIVATE]]{{$}} return __opencl_atomic_fetch_add(d, a, memory_order_relaxed, memory_scope_work_group); } @@ -215,10 +227,10 @@ void atomic_init_foo() // CHECK-LABEL: @failureOrder void failureOrder(atomic_int *ptr, int *ptr2) { - // CHECK: cmpxchg ptr {{%[0-9A-Za-z._]+}}, i32 {{%[0-9A-Za-z._]+}}, i32 {{%[0-9A-Za-z_.]+}} syncscope("workgroup-one-as") acquire monotonic, align 4 + // CHECK: cmpxchg ptr {{%[0-9A-Za-z._]+}}, i32 {{%[0-9A-Za-z._]+}}, i32 {{%[0-9A-Za-z_.]+}} syncscope("workgroup-one-as") acquire monotonic, align 4, !noalias.addrspace [[$NOPRIVATE]]{{$}} __opencl_atomic_compare_exchange_strong(ptr, ptr2, 43, memory_order_acquire, memory_order_relaxed, memory_scope_work_group); - // CHECK: cmpxchg weak ptr {{%[0-9A-Za-z._]+}}, i32 {{%[0-9A-Za-z._]+}}, i32 {{%[0-9A-Za-z_.]+}} syncscope("workgroup") seq_cst acquire, align 4 + // CHECK: cmpxchg weak ptr {{%[0-9A-Za-z._]+}}, i32 {{%[0-9A-Za-z._]+}}, i32 {{%[0-9A-Za-z_.]+}} syncscope("workgroup") seq_cst acquire, align 4, !noalias.addrspace [[$NOPRIVATE]]{{$}} __opencl_atomic_compare_exchange_weak(ptr, ptr2, 43, memory_order_seq_cst, memory_order_acquire, memory_scope_work_group); } @@ -268,63 +280,63 @@ void generalFailureOrder(atomic_int *ptr, int *ptr2, int success, int fail) { // CHECK-NEXT: ] // CHECK: [[MONOTONIC_MONOTONIC]] - // CHECK: cmpxchg {{.*}} monotonic monotonic, align 4 + // CHECK: cmpxchg {{.*}} monotonic monotonic, align 4, !noalias.addrspace [[$NOPRIVATE]]{{$}} // CHECK: br // CHECK: [[MONOTONIC_ACQUIRE]] - // CHECK: cmpxchg {{.*}} monotonic acquire, align 4 + // CHECK: cmpxchg {{.*}} monotonic acquire, align 4, !noalias.addrspace [[$NOPRIVATE]]{{$}} // CHECK: br // CHECK: [[MONOTONIC_SEQCST]] - // CHECK: cmpxchg {{.*}} monotonic seq_cst, align 4 + // CHECK: cmpxchg {{.*}} monotonic seq_cst, align 4, !noalias.addrspace [[$NOPRIVATE]]{{$}} // CHECK: br // CHECK: [[ACQUIRE_MONOTONIC]] - // CHECK: cmpxchg {{.*}} acquire monotonic, align 4 + // CHECK: cmpxchg {{.*}} acquire monotonic, align 4, !noalias.addrspace [[$NOPRIVATE]]{{$}} // CHECK: br // CHECK: [[ACQUIRE_ACQUIRE]] - // CHECK: cmpxchg {{.*}} acquire acquire, align 4 + // CHECK: cmpxchg {{.*}} acquire acquire, align 4, !noalias.addrspace [[$NOPRIVATE]]{{$}} // CHECK: br // CHECK: [[ACQUIRE_SEQCST]] - // CHECK: cmpxchg {{.*}} acquire seq_cst, align 4 + // CHECK: cmpxchg {{.*}} acquire seq_cst, align 4, !noalias.addrspace [[$NOPRIVATE]]{{$}} // CHECK: br // CHECK: [[RELEASE_MONOTONIC]] - // CHECK: cmpxchg {{.*}} release monotonic, align 4 + // CHECK: cmpxchg {{.*}} release monotonic, align 4, 
!noalias.addrspace [[$NOPRIVATE]]{{$}} // CHECK: br // CHECK: [[RELEASE_ACQUIRE]] - // CHECK: cmpxchg {{.*}} release acquire, align 4 + // CHECK: cmpxchg {{.*}} release acquire, align 4, !noalias.addrspace [[$NOPRIVATE]]{{$}} // CHECK: br // CHECK: [[RELEASE_SEQCST]] - // CHECK: cmpxchg {{.*}} release seq_cst, align 4 + // CHECK: cmpxchg {{.*}} release seq_cst, align 4, !noalias.addrspace [[$NOPRIVATE]]{{$}} // CHECK: br // CHECK: [[ACQREL_MONOTONIC]] - // CHECK: cmpxchg {{.*}} acq_rel monotonic, align 4 + // CHECK: cmpxchg {{.*}} acq_rel monotonic, align 4, !noalias.addrspace [[$NOPRIVATE]]{{$}} // CHECK: br // CHECK: [[ACQREL_ACQUIRE]] - // CHECK: cmpxchg {{.*}} acq_rel acquire, align 4 + // CHECK: cmpxchg {{.*}} acq_rel acquire, align 4, !noalias.addrspace [[$NOPRIVATE]]{{$}} // CHECK: br // CHECK: [[ACQREL_SEQCST]] - // CHECK: cmpxchg {{.*}} acq_rel seq_cst, align 4 + // CHECK: cmpxchg {{.*}} acq_rel seq_cst, align 4, !noalias.addrspace [[$NOPRIVATE]]{{$}} // CHECK: br // CHECK: [[SEQCST_MONOTONIC]] - // CHECK: cmpxchg {{.*}} seq_cst monotonic, align 4 + // CHECK: cmpxchg {{.*}} seq_cst monotonic, align 4, !noalias.addrspace [[$NOPRIVATE]]{{$}} // CHECK: br // CHECK: [[SEQCST_ACQUIRE]] - // CHECK: cmpxchg {{.*}} seq_cst acquire, align 4 + // CHECK: cmpxchg {{.*}} seq_cst acquire, align 4, !noalias.addrspace [[$NOPRIVATE]]{{$}} // CHECK: br // CHECK: [[SEQCST_SEQCST]] - // CHECK: cmpxchg {{.*}} seq_cst seq_cst, align 4 + // CHECK: cmpxchg {{.*}} seq_cst seq_cst, align 4, !noalias.addrspace [[$NOPRIVATE]]{{$}} // CHECK: br } @@ -334,7 +346,7 @@ int test_volatile(volatile atomic_int *i) { // CHECK-NEXT: %[[atomicdst:.*]] = alloca i32 // CHECK-NEXT: store ptr %i, ptr addrspace(5) %[[i_addr]] // CHECK-NEXT: %[[addr:.*]] = load ptr, ptr addrspace(5) %[[i_addr]] - // CHECK-NEXT: %[[res:.*]] = load atomic volatile i32, ptr %[[addr]] syncscope("workgroup") seq_cst, align 4 + // CHECK-NEXT: %[[res:.*]] = load atomic volatile i32, ptr %[[addr]] syncscope("workgroup") seq_cst, align 4{{$}} // CHECK-NEXT: store i32 %[[res]], ptr addrspace(5) %[[atomicdst]] // CHECK-NEXT: %[[retval:.*]] = load i32, ptr addrspace(5) %[[atomicdst]] // CHECK-NEXT: ret i32 %[[retval]] @@ -342,3 +354,5 @@ int test_volatile(volatile atomic_int *i) { } #endif + +// CHECK: [[$NOPRIVATE]] = !{i32 5, i32 6} diff --git a/clang/test/CodeGenOpenCL/pipe_builtin.cl b/clang/test/CodeGenOpenCL/pipe_builtin.cl index c59f63bab6a4589132eea1ff892006db66e892df..ec9d7cb04506410ad2f917fa8850c88a796da644 100644 --- a/clang/test/CodeGenOpenCL/pipe_builtin.cl +++ b/clang/test/CodeGenOpenCL/pipe_builtin.cl @@ -1,3 +1,4 @@ +// RUN: %clang_cc1 -triple spir-unknown-unknown -emit-llvm -cl-ext=+cl_khr_subgroups -O0 -cl-std=clc++ -o - %s | FileCheck --check-prefix=CHECK-SPIR %s // RUN: %clang_cc1 -triple %itanium_abi_triple -emit-llvm -cl-ext=+cl_khr_subgroups -O0 -cl-std=clc++ -o - %s | FileCheck %s // FIXME: Add MS ABI manglings of OpenCL things and remove %itanium_abi_triple // above to support OpenCL in the MS C++ ABI. 
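// The CHECK-SPIR lines added in the hunk below pin down the SPIR-V
// target-extension-type lowering of the pipe builtins: pipe handles become
// target("spirv.Pipe", AQ), where AQ appears to encode the access qualifier
// (0 for read_only, 1 for write_only), reservation ids become
// target("spirv.ReserveId"), and the element pointers travel through the
// generic address space (ptr addrspace(4)). A minimal sketch of the source
// pattern being checked, with a hypothetical function name:
//
//   void sketch(read_only pipe int rp, write_only pipe int wp, global int *p) {
//     read_pipe(rp, p);   // __read_pipe_2(target("spirv.Pipe", 0), ...)
//     write_pipe(wp, p);  // __write_pipe_2(target("spirv.Pipe", 1), ...)
//   }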
@@ -5,65 +6,85 @@ #pragma OPENCL EXTENSION cl_khr_subgroups : enable void test1(read_only pipe int p, global int *ptr) { + // CHECK-SPIR: call spir_func i32 @__read_pipe_2(target("spirv.Pipe", 0) %{{.*}}, ptr addrspace(4) %{{.*}}, i32 4, i32 4) // CHECK: call i32 @__read_pipe_2(ptr %{{.*}}, ptr %{{.*}}, i32 4, i32 4) read_pipe(p, ptr); + // CHECK-SPIR: call spir_func target("spirv.ReserveId") @__reserve_read_pipe(target("spirv.Pipe", 0) %{{.*}}, i32 {{.*}}, i32 4, i32 4) // CHECK: call ptr @__reserve_read_pipe(ptr %{{.*}}, i32 {{.*}}, i32 4, i32 4) reserve_id_t rid = reserve_read_pipe(p, 2); + // CHECK-SPIR: call spir_func i32 @__read_pipe_4(target("spirv.Pipe", 0) %{{.*}}, ptr addrspace(4) %{{.*}}, i32 4, i32 4) // CHECK: call i32 @__read_pipe_4(ptr %{{.*}}, ptr %{{.*}}, i32 {{.*}}, ptr %{{.*}}, i32 4, i32 4) read_pipe(p, rid, 2, ptr); + // CHECK-SPIR: call spir_func void @__commit_read_pipe(target("spirv.Pipe", 0) %{{.*}}, target("spirv.ReserveId") %{{.*}}, i32 4, i32 4) // CHECK: call void @__commit_read_pipe(ptr %{{.*}}, ptr %{{.*}}, i32 4, i32 4) commit_read_pipe(p, rid); } void test2(write_only pipe int p, global int *ptr) { + // CHECK-SPIR: call spir_func i32 @__write_pipe_2(target("spirv.Pipe", 1) %{{.*}}, ptr addrspace(4) %{{.*}}, i32 4, i32 4) // CHECK: call i32 @__write_pipe_2(ptr %{{.*}}, ptr %{{.*}}, i32 4, i32 4) write_pipe(p, ptr); + // CHECK-SPIR: call spir_func target("spirv.ReserveId") @__reserve_write_pipe(target("spirv.Pipe", 1) %{{.*}}, i32 {{.*}}, i32 4, i32 4) // CHECK: call ptr @__reserve_write_pipe(ptr %{{.*}}, i32 {{.*}}, i32 4, i32 4) reserve_id_t rid = reserve_write_pipe(p, 2); + // CHECK-SPIR: call spir_func i32 @__write_pipe_4(target("spirv.Pipe", 1) %{{.*}}, ptr addrspace(4) %{{.*}}, i32 4, i32 4) // CHECK: call i32 @__write_pipe_4(ptr %{{.*}}, ptr %{{.*}}, i32 {{.*}}, ptr %{{.*}}, i32 4, i32 4) write_pipe(p, rid, 2, ptr); + // CHECK-SPIR: call spir_func void @__commit_write_pipe(target("spirv.Pipe", 1) %{{.*}}, target("spirv.ReserveId") %{{.*}}, i32 4, i32 4) // CHECK: call void @__commit_write_pipe(ptr %{{.*}}, ptr %{{.*}}, i32 4, i32 4) commit_write_pipe(p, rid); } void test3(read_only pipe int p, global int *ptr) { + // CHECK-SPIR: call spir_func target("spirv.ReserveId") @__work_group_reserve_read_pipe(target("spirv.Pipe", 0) %{{.*}}, i32 {{.*}}, i32 4, i32 4) // CHECK: call ptr @__work_group_reserve_read_pipe(ptr %{{.*}}, i32 {{.*}}, i32 4, i32 4) reserve_id_t rid = work_group_reserve_read_pipe(p, 2); + // CHECK-SPIR: call spir_func void @__work_group_commit_read_pipe(target("spirv.Pipe", 0) %{{.*}}, target("spirv.ReserveId") %{{.*}}, i32 4, i32 4) // CHECK: call void @__work_group_commit_read_pipe(ptr %{{.*}}, ptr %{{.*}}, i32 4, i32 4) work_group_commit_read_pipe(p, rid); } void test4(write_only pipe int p, global int *ptr) { + // CHECK-SPIR: call spir_func target("spirv.ReserveId") @__work_group_reserve_write_pipe(target("spirv.Pipe", 1) %{{.*}}, i32 {{.*}}, i32 4, i32 4) // CHECK: call ptr @__work_group_reserve_write_pipe(ptr %{{.*}}, i32 {{.*}}, i32 4, i32 4) reserve_id_t rid = work_group_reserve_write_pipe(p, 2); + // CHECK-SPIR: call spir_func void @__work_group_commit_write_pipe(target("spirv.Pipe", 1) %{{.*}}, target("spirv.ReserveId") %{{.*}}, i32 4, i32 4) // CHECK: call void @__work_group_commit_write_pipe(ptr %{{.*}}, ptr %{{.*}}, i32 4, i32 4) work_group_commit_write_pipe(p, rid); } void test5(read_only pipe int p, global int *ptr) { + // CHECK-SPIR: call spir_func target("spirv.ReserveId") 
@__sub_group_reserve_read_pipe(target("spirv.Pipe", 0) %{{.*}}, i32 {{.*}}, i32 4, i32 4) // CHECK: call ptr @__sub_group_reserve_read_pipe(ptr %{{.*}}, i32 {{.*}}, i32 4, i32 4) reserve_id_t rid = sub_group_reserve_read_pipe(p, 2); + // CHECK-SPIR: call spir_func void @__sub_group_commit_read_pipe(target("spirv.Pipe", 0) %{{.*}}, target("spirv.ReserveId") %{{.*}}, i32 4, i32 4) // CHECK: call void @__sub_group_commit_read_pipe(ptr %{{.*}}, ptr %{{.*}}, i32 4, i32 4) sub_group_commit_read_pipe(p, rid); } void test6(write_only pipe int p, global int *ptr) { + // CHECK-SPIR: call spir_func target("spirv.ReserveId") @__sub_group_reserve_write_pipe(target("spirv.Pipe", 1) %{{.*}}, i32 {{.*}}, i32 4, i32 4) // CHECK: call ptr @__sub_group_reserve_write_pipe(ptr %{{.*}}, i32 {{.*}}, i32 4, i32 4) reserve_id_t rid = sub_group_reserve_write_pipe(p, 2); + // CHECK-SPIR: call spir_func void @__sub_group_commit_write_pipe(target("spirv.Pipe", 1) %{{.*}}, target("spirv.ReserveId") %{{.*}}, i32 4, i32 4) // CHECK: call void @__sub_group_commit_write_pipe(ptr %{{.*}}, ptr %{{.*}}, i32 4, i32 4) sub_group_commit_write_pipe(p, rid); } void test7(read_only pipe int p, global int *ptr) { + // CHECK-SPIR: call spir_func i32 @__get_pipe_num_packets_ro(target("spirv.Pipe", 0) %{{.*}}, i32 4, i32 4) // CHECK: call i32 @__get_pipe_num_packets_ro(ptr %{{.*}}, i32 4, i32 4) *ptr = get_pipe_num_packets(p); + // CHECK-SPIR: call spir_func i32 @__get_pipe_max_packets_ro(target("spirv.Pipe", 0) %{{.*}}, i32 4, i32 4) // CHECK: call i32 @__get_pipe_max_packets_ro(ptr %{{.*}}, i32 4, i32 4) *ptr = get_pipe_max_packets(p); } void test8(write_only pipe int p, global int *ptr) { + // CHECK-SPIR: call spir_func i32 @__get_pipe_num_packets_wo(target("spirv.Pipe", 1) %{{.*}}, i32 4, i32 4) // CHECK: call i32 @__get_pipe_num_packets_wo(ptr %{{.*}}, i32 4, i32 4) *ptr = get_pipe_num_packets(p); + // CHECK-SPIR: call spir_func i32 @__get_pipe_max_packets_wo(target("spirv.Pipe", 1) %{{.*}}, i32 4, i32 4) // CHECK: call i32 @__get_pipe_max_packets_wo(ptr %{{.*}}, i32 4, i32 4) *ptr = get_pipe_max_packets(p); } diff --git a/clang/test/Driver/aarch64-v96a.c b/clang/test/Driver/aarch64-v96a.c index 0aaadddb2842f84f89f8e85e30776e22c5acbde8..80c99be934334e684afd7b95bbd667e8eca26ead 100644 --- a/clang/test/Driver/aarch64-v96a.c +++ b/clang/test/Driver/aarch64-v96a.c @@ -6,7 +6,7 @@ // RUN: %clang -target aarch64 -mlittle-endian -march=armv9.6-a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV96A %s // RUN: %clang -target aarch64_be -mlittle-endian -march=armv9.6a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV96A %s // RUN: %clang -target aarch64_be -mlittle-endian -march=armv9.6-a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV96A %s -// GENERICV96A: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-cpu" "generic" "-target-feature" "+v9.6a" +// GENERICV96A: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-cpu" "generic" "-target-feature" "+v9.6a"{{.*}} "-target-feature" "+cmpbr"{{.*}} "-target-feature" "+fprcvt"{{.*}} "-target-feature" "+sve2p2" // RUN: %clang -target aarch64_be -march=armv9.6a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV96A-BE %s // RUN: %clang -target aarch64_be -march=armv9.6-a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV96A-BE %s @@ -14,6 +14,42 @@ // RUN: %clang -target aarch64 -mbig-endian -march=armv9.6-a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV96A-BE %s // RUN: %clang -target aarch64_be -mbig-endian -march=armv9.6a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV96A-BE %s // RUN: 
%clang -target aarch64_be -mbig-endian -march=armv9.6-a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV96A-BE %s -// GENERICV96A-BE: "-cc1"{{.*}} "-triple" "aarch64_be{{.*}}" "-target-cpu" "generic" "-target-feature" "+v9.6a" -// +// GENERICV96A-BE: "-cc1"{{.*}} "-triple" "aarch64_be{{.*}}" "-target-cpu" "generic" "-target-feature" "+v9.6a"{{.*}} "-target-feature" "+cmpbr"{{.*}} "-target-feature" "+fprcvt"{{.*}} "-target-feature" "+sve2p2" + // ===== Features supported on aarch64 ===== + +// RUN: %clang -target aarch64 -march=armv9.6a+f8f16mm -### -c %s 2>&1 | FileCheck -check-prefix=V96A-F8F16MM %s +// RUN: %clang -target aarch64 -march=armv9.6-a+f8f16mm -### -c %s 2>&1 | FileCheck -check-prefix=V96A-F8F16MM %s +// V96A-F8F16MM: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-cpu" "generic" "-target-feature" "+v9.6a"{{.*}} "-target-feature" "+f8f16mm" + +// RUN: %clang -target aarch64 -march=armv9.6a+f8f32mm -### -c %s 2>&1 | FileCheck -check-prefix=V96A-F8F32MM %s +// RUN: %clang -target aarch64 -march=armv9.6-a+f8f32mm -### -c %s 2>&1 | FileCheck -check-prefix=V96A-F8F32MM %s +// V96A-F8F32MM: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-cpu" "generic" "-target-feature" "+v9.6a"{{.*}} "-target-feature" "+f8f32mm" + +// RUN: %clang -target aarch64 -march=armv9.6a+lsfe -### -c %s 2>&1 | FileCheck -check-prefix=V96A-LSFE %s +// RUN: %clang -target aarch64 -march=armv9.6-a+lsfe -### -c %s 2>&1 | FileCheck -check-prefix=V96A-LSFE %s +// V96A-LSFE: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-cpu" "generic" "-target-feature" "+v9.6a"{{.*}} "-target-feature" "+lsfe" + +// RUN: %clang -target aarch64 -march=armv9.6a+sme2p2 -### -c %s 2>&1 | FileCheck -check-prefix=V96A-SME2p2 %s +// RUN: %clang -target aarch64 -march=armv9.6-a+sme2p2 -### -c %s 2>&1 | FileCheck -check-prefix=V96A-SME2p2 %s +// V96A-SME2p2: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-cpu" "generic" "-target-feature" "+v9.6a"{{.*}} "-target-feature" "+sme2p2" + +// RUN: %clang -target aarch64 -march=armv9.6a+ssve-aes -### -c %s 2>&1 | FileCheck -check-prefix=V96A-SSVE-AES %s +// RUN: %clang -target aarch64 -march=armv9.6-a+ssve-aes -### -c %s 2>&1 | FileCheck -check-prefix=V96A-SSVE-AES %s +// V96A-SSVE-AES: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-cpu" "generic" "-target-feature" "+v9.6a"{{.*}} "-target-feature" "+ssve-aes" + +// RUN: %clang -target aarch64 -march=armv9.6a+sve2p2 -### -c %s 2>&1 | FileCheck -check-prefix=V96A-SVE2p2 %s +// RUN: %clang -target aarch64 -march=armv9.6-a+sve2p2 -### -c %s 2>&1 | FileCheck -check-prefix=V96A-SVE2p2 %s +// V96A-SVE2p2: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-cpu" "generic" "-target-feature" "+v9.6a"{{.*}} "-target-feature" "+sve2p2" + +// RUN: %clang -target aarch64 -march=armv9.6a+sve-aes2 -### -c %s 2>&1 | FileCheck -check-prefix=V96A-SVE-AES2 %s +// RUN: %clang -target aarch64 -march=armv9.6-a+sve-aes2 -### -c %s 2>&1 | FileCheck -check-prefix=V96A-SVE-AES2 %s +// V96A-SVE-AES2: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-cpu" "generic" "-target-feature" "+v9.6a"{{.*}} "-target-feature" "+sve-aes2" + +// RUN: %clang -target aarch64 -march=armv9.6a+sve-bfscale -### -c %s 2>&1 | FileCheck -check-prefix=V96A-SVE-BFSCALE %s +// RUN: %clang -target aarch64 -march=armv9.6-a+sve-bfscale -### -c %s 2>&1 | FileCheck -check-prefix=V96A-SVE-BFSCALE %s +// V96A-SVE-BFSCALE: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-cpu" "generic" "-target-feature" "+v9.6a"{{.*}} "-target-feature" "+sve-bfscale" + +// RUN: %clang -target aarch64 -march=armv9.6a+sve-f16f32mm 
-### -c %s 2>&1 | FileCheck -check-prefix=V96A-SVE-F16F32MM %s +// RUN: %clang -target aarch64 -march=armv9.6-a+sve-f16f32mm -### -c %s 2>&1 | FileCheck -check-prefix=V96A-SVE-F16F32MM %s +// V96A-SVE-F16F32MM: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-cpu" "generic" "-target-feature" "+v9.6a"{{.*}} "-target-feature" "+sve-f16f32mm" diff --git a/clang/test/Driver/cl-options.c b/clang/test/Driver/cl-options.c index 48d281bcd447e76b209249e78a2a28d4eced3b93..8191fda97788c1aff8a05c3952f1ab07cbc1ce3c 100644 --- a/clang/test/Driver/cl-options.c +++ b/clang/test/Driver/cl-options.c @@ -605,6 +605,9 @@ // RUN: %clang_cl -fmsc-version=1900 -TP -std:c++20 -### -- %s 2>&1 | FileCheck -check-prefix=STDCXX20 %s // STDCXX20: -std=c++20 +// RUN: %clang_cl -fmsc-version=1900 -TP -std:c++23preview -### -- %s 2>&1 | FileCheck -check-prefix=STDCXX23PREVIEW %s +// STDCXX23PREVIEW: -std=c++23 + // RUN: %clang_cl -fmsc-version=1900 -TP -std:c++latest -### -- %s 2>&1 | FileCheck -check-prefix=STDCXXLATEST %s // STDCXXLATEST: -std=c++26 diff --git a/clang/test/Driver/flang/msvc-link.f90 b/clang/test/Driver/flang/msvc-link.f90 index 463749510eb5f88a38bb4538e40b4775b2b7b0e7..3f7e162a9a6116ae39c643d84ab5e5a9e8ea518f 100644 --- a/clang/test/Driver/flang/msvc-link.f90 +++ b/clang/test/Driver/flang/msvc-link.f90 @@ -1,5 +1,5 @@ -! RUN: %clang --driver-mode=flang --target=x86_64-pc-windows-msvc -### %s -Ltest 2>&1 | FileCheck %s -! -! Test that user provided paths come before the Flang runtimes -! CHECK: "-libpath:test" -! CHECK: "-libpath:{{.*(\\|/)}}lib" +! RUN: %clang --driver-mode=flang --target=x86_64-pc-windows-msvc -### %s -Ltest 2>&1 | FileCheck %s +! +! Test that user provided paths come before the Flang runtimes +! CHECK: "-libpath:test" +! CHECK: "-libpath:{{.*(\\|/)}}lib" diff --git a/clang/test/Driver/hip-include-path.hip b/clang/test/Driver/hip-include-path.hip index 1b4179e65c0b97c74bff96b38cc3089562e8f061..5eeee2f5ce0d5bc1a8beae20f3722d0434b9fe14 100644 --- a/clang/test/Driver/hip-include-path.hip +++ b/clang/test/Driver/hip-include-path.hip @@ -1,4 +1,3 @@ -// REQUIRES: libgcc // UNSUPPORTED: system-windows // RUN: %clang -c -### --target=x86_64-unknown-linux-gnu --cuda-gpu-arch=gfx900 \ diff --git a/clang/test/Driver/hip-runtime-libs-msvc.hip b/clang/test/Driver/hip-runtime-libs-msvc.hip index 8085e77d457e566d025f3a55b346ff8afd19dcd5..943cd0569f4fd1ab364c166c4eeb387854c6bd6b 100644 --- a/clang/test/Driver/hip-runtime-libs-msvc.hip +++ b/clang/test/Driver/hip-runtime-libs-msvc.hip @@ -1,5 +1,3 @@ -// REQUIRES: system-windows - // RUN: touch %t.o // Test HIP runtime lib args specified by --rocm-path. diff --git a/clang/test/Driver/loongarch-mannotate-tablejump.c b/clang/test/Driver/loongarch-mannotate-tablejump.c new file mode 100644 index 0000000000000000000000000000000000000000..586e62c35f4f76c6ff800be8be63e6fcc297f8bc --- /dev/null +++ b/clang/test/Driver/loongarch-mannotate-tablejump.c @@ -0,0 +1,13 @@ +/// Test -m[no-]annotate-tablejump options. 
+ +// RUN: %clang --target=loongarch64 -mannotate-tablejump %s -### 2>&1 | \ +// RUN: FileCheck %s --check-prefix=CC1-ANNOTATE +// RUN: %clang --target=loongarch64 -mno-annotate-tablejump %s -### 2>&1 | \ +// RUN: FileCheck %s --check-prefix=CC1-NO-ANNOTATE +// RUN: %clang --target=loongarch64 -mannotate-tablejump -mno-annotate-tablejump %s -### 2>&1 | \ +// RUN: FileCheck %s --check-prefix=CC1-NO-ANNOTATE +// RUN: %clang --target=loongarch64 -mno-annotate-tablejump -mannotate-tablejump %s -### 2>&1 | \ +// RUN: FileCheck %s --check-prefix=CC1-ANNOTATE + +// CC1-ANNOTATE: "-loongarch-annotate-tablejump" +// CC1-NO-ANNOTATE-NOT: "-loongarch-annotate-tablejump" diff --git a/clang/test/Driver/loongarch-mfrecipe.c b/clang/test/Driver/loongarch-mfrecipe.c new file mode 100644 index 0000000000000000000000000000000000000000..14afd54af0b9df3955999d5f4817e01fb6a970bb --- /dev/null +++ b/clang/test/Driver/loongarch-mfrecipe.c @@ -0,0 +1,30 @@ +/// Test -m[no-]frecipe options. + +// RUN: %clang --target=loongarch64 -mfrecipe -fsyntax-only %s -### 2>&1 | \ +// RUN: FileCheck %s --check-prefix=CC1-FRECIPE +// RUN: %clang --target=loongarch64 -mno-frecipe -fsyntax-only %s -### 2>&1 | \ +// RUN: FileCheck %s --check-prefix=CC1-NO-FRECIPE +// RUN: %clang --target=loongarch64 -mno-frecipe -mfrecipe -fsyntax-only %s -### 2>&1 | \ +// RUN: FileCheck %s --check-prefix=CC1-FRECIPE +// RUN: %clang --target=loongarch64 -mfrecipe -mno-frecipe -fsyntax-only %s -### 2>&1 | \ +// RUN: FileCheck %s --check-prefix=CC1-NO-FRECIPE + +// RUN: %clang --target=loongarch64 -mfrecipe -S -emit-llvm %s -o - | \ +// RUN: FileCheck %s --check-prefix=IR-FRECIPE +// RUN: %clang --target=loongarch64 -mno-frecipe -S -emit-llvm %s -o - | \ +// RUN: FileCheck %s --check-prefix=IR-NO-FRECIPE +// RUN: %clang --target=loongarch64 -mno-frecipe -mfrecipe -S -emit-llvm %s -o - | \ +// RUN: FileCheck %s --check-prefix=IR-FRECIPE +// RUN: %clang --target=loongarch64 -mfrecipe -mno-frecipe -S -emit-llvm %s -o - | \ +// RUN: FileCheck %s --check-prefix=IR-NO-FRECIPE + + +// CC1-FRECIPE: "-target-feature" "+frecipe" +// CC1-NO-FRECIPE: "-target-feature" "-frecipe" + +// IR-FRECIPE: attributes #[[#]] ={{.*}}"target-features"="{{(.*,)?}}+frecipe{{(,.*)?}}" +// IR-NO-FRECIPE: attributes #[[#]] ={{.*}}"target-features"="{{(.*,)?}}-frecipe{{(,.*)?}}" + +int foo(void) { + return 42; +} \ No newline at end of file diff --git a/clang/test/Driver/print-supported-extensions-aarch64.c b/clang/test/Driver/print-supported-extensions-aarch64.c index e6247307c7219f26963d264e8746c63c0ad96e37..fbc0d70c4901c9d98acd04b80247c874b377d7fb 100644 --- a/clang/test/Driver/print-supported-extensions-aarch64.c +++ b/clang/test/Driver/print-supported-extensions-aarch64.c @@ -8,6 +8,7 @@ // CHECK-NEXT: bf16 FEAT_BF16 Enable BFloat16 Extension // CHECK-NEXT: brbe FEAT_BRBE Enable Branch Record Buffer Extension // CHECK-NEXT: bti FEAT_BTI Enable Branch Target Identification +// CHECK-NEXT: cmpbr FEAT_CMPBR Enable Armv9.6-A base compare and branch instructions // CHECK-NEXT: fcma FEAT_FCMA Enable Armv8.3-A Floating-point complex number support // CHECK-NEXT: cpa FEAT_CPA Enable Armv9.5-A Checked Pointer Arithmetic // CHECK-NEXT: crc FEAT_CRC32 Enable Armv8.0-A CRC-32 checksum instructions @@ -18,6 +19,8 @@ // CHECK-NEXT: dotprod FEAT_DotProd Enable dot product support // CHECK-NEXT: f32mm FEAT_F32MM Enable Matrix Multiply FP32 Extension // CHECK-NEXT: f64mm FEAT_F64MM Enable Matrix Multiply FP64 Extension +// CHECK-NEXT: f8f16mm FEAT_F8F16MM Enable Armv9.6-A FP8 to Half-Precision 
Matrix Multiplication +// CHECK-NEXT: f8f32mm FEAT_F8F32MM Enable Armv9.6-A FP8 to Single-Precision Matrix Multiplication // CHECK-NEXT: faminmax FEAT_FAMINMAX Enable FAMIN and FAMAX instructions // CHECK-NEXT: flagm FEAT_FlagM Enable Armv8.4-A Flag Manipulation instructions // CHECK-NEXT: fp FEAT_FP Enable Armv8.0-A Floating Point Extensions @@ -26,6 +29,7 @@ // CHECK-NEXT: fp8dot2 FEAT_FP8DOT2 Enable FP8 2-way dot instructions // CHECK-NEXT: fp8dot4 FEAT_FP8DOT4 Enable FP8 4-way dot instructions // CHECK-NEXT: fp8fma FEAT_FP8FMA Enable Armv9.5-A FP8 multiply-add instructions +// CHECK-NEXT: fprcvt FEAT_FPRCVT Enable Armv9.6-A base convert instructions for SIMD&FP scalar register operands of different input and output sizes // CHECK-NEXT: fp16 FEAT_FP16 Enable half-precision floating-point data processing // CHECK-NEXT: gcs FEAT_GCS Enable Armv9.4-A Guarded Call Stack Extension // CHECK-NEXT: hbc FEAT_HBC Enable Armv8.8-A Hinted Conditional Branches Extension @@ -35,6 +39,7 @@ // CHECK-NEXT: ls64 FEAT_LS64, FEAT_LS64_V, FEAT_LS64_ACCDATA Enable Armv8.7-A LD64B/ST64B Accelerator Extension // CHECK-NEXT: lse FEAT_LSE Enable Armv8.1-A Large System Extension (LSE) atomic instructions // CHECK-NEXT: lse128 FEAT_LSE128 Enable Armv9.4-A 128-bit Atomic instructions +// CHECK-NEXT: lsfe FEAT_LSFE Enable Armv9.6-A base Atomic floating-point in-memory instructions // CHECK-NEXT: lut FEAT_LUT Enable Lookup Table instructions // CHECK-NEXT: mops FEAT_MOPS Enable Armv8.8-A memcpy and memset acceleration instructions // CHECK-NEXT: memtag FEAT_MTE, FEAT_MTE2 Enable Memory Tagging Extension @@ -64,20 +69,26 @@ // CHECK-NEXT: sme-lutv2 FEAT_SME_LUTv2 Enable Scalable Matrix Extension (SME) LUTv2 instructions // CHECK-NEXT: sme2 FEAT_SME2 Enable Scalable Matrix Extension 2 (SME2) instructions // CHECK-NEXT: sme2p1 FEAT_SME2p1 Enable Scalable Matrix Extension 2.1 instructions +// CHECK-NEXT: sme2p2 FEAT_SME2p2 Enable Armv9.6-A Scalable Matrix Extension 2.2 instructions // CHECK-NEXT: profile FEAT_SPE Enable Statistical Profiling extension // CHECK-NEXT: predres2 FEAT_SPECRES2 Enable Speculation Restriction Instruction // CHECK-NEXT: ssbs FEAT_SSBS, FEAT_SSBS2 Enable Speculative Store Bypass Safe bit +// CHECK-NEXT: ssve-aes FEAT_SSVE_AES Enable Armv9.6-A SVE2 AES support in streaming SVE mode // CHECK-NEXT: ssve-fp8dot2 FEAT_SSVE_FP8DOT2 Enable SVE2 FP8 2-way dot product instructions // CHECK-NEXT: ssve-fp8dot4 FEAT_SSVE_FP8DOT4 Enable SVE2 FP8 4-way dot product instructions // CHECK-NEXT: ssve-fp8fma FEAT_SSVE_FP8FMA Enable SVE2 FP8 multiply-add instructions // CHECK-NEXT: sve FEAT_SVE Enable Scalable Vector Extension (SVE) instructions +// CHECK-NEXT: sve-aes2 FEAT_SVE_AES2 Enable Armv9.6-A SVE multi-vector AES and 128-bit PMULL instructions // CHECK-NEXT: sve-b16b16 FEAT_SVE_B16B16 Enable SVE2 non-widening and SME2 Z-targeting non-widening BFloat16 instructions +// CHECK-NEXT: sve-bfscale FEAT_SVE_BFSCALE Enable Armv9.6-A SVE BFloat16 scaling instructions +// CHECK-NEXT: sve-f16f32mm FEAT_SVE_F16F32MM Enable Armv9.6-A FP16 to FP32 Matrix Multiply // CHECK-NEXT: sve2 FEAT_SVE2 Enable Scalable Vector Extension 2 (SVE2) instructions // CHECK-NEXT: sve2-aes FEAT_SVE_AES, FEAT_SVE_PMULL128 Enable AES SVE2 instructions // CHECK-NEXT: sve2-bitperm FEAT_SVE_BitPerm Enable bit permutation SVE2 instructions // CHECK-NEXT: sve2-sha3 FEAT_SVE_SHA3 Enable SHA3 SVE2 instructions // CHECK-NEXT: sve2-sm4 FEAT_SVE_SM4 Enable SM4 SVE2 instructions // CHECK-NEXT: sve2p1 FEAT_SVE2p1 Enable Scalable Vector Extension 2.1 
instructions +// CHECK-NEXT: sve2p2 FEAT_SVE2p2 Enable Armv9.6-A Scalable Vector Extension 2.2 instructions // CHECK-NEXT: the FEAT_THE Enable Armv8.9-A Translation Hardening Extension // CHECK-NEXT: tlbiw FEAT_TLBIW Enable Armv9.5-A TLBI VMALL for Dirty State // CHECK-NEXT: tme FEAT_TME Enable Transactional Memory Extension diff --git a/clang/test/Driver/stack-protector-guard.c b/clang/test/Driver/stack-protector-guard.c index d8475a70e3709fbc4e3c9c7cf8459e4b775e4c92..666c83079e51916bf2cde7613d46f8393f077d14 100644 --- a/clang/test/Driver/stack-protector-guard.c +++ b/clang/test/Driver/stack-protector-guard.c @@ -17,15 +17,15 @@ // RUN: FileCheck -check-prefix=CHECK-SYM %s // Invalid arch -// RUN: not %clang -target powerpc64le-linux-gnu -mstack-protector-guard=tls %s 2>&1 | \ +// RUN: not %clang -target mipsel-linux-gnu -mstack-protector-guard=tls %s 2>&1 | \ // RUN: FileCheck -check-prefix=INVALID-ARCH %s // INVALID-ARCH: unsupported option '-mstack-protector-guard=tls' for target -// RUN: not %clang -target powerpc64le-linux-gnu -mstack-protector-guard-reg=fs %s 2>&1 | \ +// RUN: not %clang -target mipsel-linux-gnu -mstack-protector-guard-reg=fs %s 2>&1 | \ // RUN: FileCheck -check-prefix=INVALID-ARCH2 %s // INVALID-ARCH2: unsupported option '-mstack-protector-guard-reg=fs' for target -// RUN: not %clang -target powerpc64le-linux-gnu -mstack-protector-guard-offset=10 %s 2>&1 | \ +// RUN: not %clang -target mipsel-linux-gnu -mstack-protector-guard-offset=10 %s 2>&1 | \ // RUN: FileCheck -check-prefix=INVALID-ARCH3 %s // INVALID-ARCH3: unsupported option '-mstack-protector-guard-offset=10' for target @@ -104,3 +104,54 @@ // RUN: FileCheck -check-prefix=INVALID-REG-RISCV %s // INVALID-REG-RISCV: error: invalid value 'sp' in 'mstack-protector-guard-reg=', expected one of: tp + +// RUN: %clang -### -target powerpc64-unknown-elf -mstack-protector-guard=tls -mstack-protector-guard-offset=24 -mstack-protector-guard-reg=r13 %s 2>&1 | \ +// RUN: FileCheck -v -check-prefix=CHECK-TLS-POWERPC64 %s +// RUN: %clang -### -target powerpc64-unknown-linux-gnu -mstack-protector-guard=global %s 2>&1 | \ +// RUN: FileCheck -check-prefix=CHECK-GLOBAL %s + +// RUN: not %clang -target powerpc64-unknown-linux-gnu -mstack-protector-guard=tls %s 2>&1 | \ +// RUN: FileCheck -check-prefix=MISSING-OFFSET %s + +// RUN: not %clang -target powerpc64-unknown-elf -mstack-protector-guard=sysreg %s 2>&1 | \ +// RUN: FileCheck -check-prefix=INVALID-VALUE2 %s + +// RUN: not %clang -target powerpc64-unknown-elf -mstack-protector-guard=tls \ +// RUN: -mstack-protector-guard-offset=20 -mstack-protector-guard-reg=r12 %s 2>&1 | \ +// RUN: FileCheck -check-prefix=INVALID-REG-POWERPC64 %s + +// CHECK-TLS-POWERPC64: "-cc1" {{.*}}"-mstack-protector-guard=tls" "-mstack-protector-guard-offset=24" "-mstack-protector-guard-reg=r13" +// INVALID-REG-POWERPC64: error: invalid value 'r12' in 'mstack-protector-guard-reg=', expected one of: r13 + +// RUN: %clang -### -target powerpc64le-unknown-elf -mstack-protector-guard=tls -mstack-protector-guard-offset=24 -mstack-protector-guard-reg=r13 %s 2>&1 | \ +// RUN: FileCheck -v -check-prefix=CHECK-TLS-POWERPC64 %s +// RUN: %clang -### -target powerpc64le-unknown-elf -mstack-protector-guard=global %s 2>&1 | \ +// RUN: FileCheck -check-prefix=CHECK-GLOBAL %s + +// RUN: not %clang -target powerpc64le-unknown-elf -mstack-protector-guard=tls %s 2>&1 | \ +// RUN: FileCheck -check-prefix=MISSING-OFFSET %s + +// RUN: not %clang -target powerpc64le-unknown-elf -mstack-protector-guard=sysreg %s 2>&1 | \ +// 
RUN: FileCheck -check-prefix=INVALID-VALUE2 %s + +// RUN: not %clang -target powerpc64le-unknown-elf -mstack-protector-guard=tls \ +// RUN: -mstack-protector-guard-offset=20 -mstack-protector-guard-reg=r12 %s 2>&1 | \ +// RUN: FileCheck -check-prefix=INVALID-REG-POWERPC64 %s + +// RUN: %clang -### -target ppc32-unknown-elf -mstack-protector-guard=tls -mstack-protector-guard-offset=24 -mstack-protector-guard-reg=r2 %s 2>&1 | \ +// RUN: FileCheck -v -check-prefix=CHECK-TLS-POWERPC32 %s +// RUN: %clang -### -target ppc32-unknown-elf -mstack-protector-guard=global %s 2>&1 | \ +// RUN: FileCheck -check-prefix=CHECK-GLOBAL %s + +// RUN: not %clang -target ppc32-unknown-elf -mstack-protector-guard=tls %s 2>&1 | \ +// RUN: FileCheck -check-prefix=MISSING-OFFSET %s + +// RUN: not %clang -target ppc32-unknown-elf -mstack-protector-guard=sysreg %s 2>&1 | \ +// RUN: FileCheck -check-prefix=INVALID-VALUE2 %s + +// RUN: not %clang -target ppc32-unknown-elf -mstack-protector-guard=tls \ +// RUN: -mstack-protector-guard-offset=20 -mstack-protector-guard-reg=r3 %s 2>&1 | \ +// RUN: FileCheck -check-prefix=INVALID-REG-POWERPC32 %s + +// CHECK-TLS-POWERPC32: "-cc1" {{.*}}"-mstack-protector-guard=tls" "-mstack-protector-guard-offset=24" "-mstack-protector-guard-reg=r2" +// INVALID-REG-POWERPC32: error: invalid value 'r3' in 'mstack-protector-guard-reg=', expected one of: r2 diff --git a/clang/test/ExtractAPI/anonymous_record_no_typedef.c b/clang/test/ExtractAPI/anonymous_record_no_typedef.c index 064c223ad56e73fc10d5d2bc2f48c63607d800c3..c0c76ef1f06b57f23131c9636bf19f76d0140a7f 100644 --- a/clang/test/ExtractAPI/anonymous_record_no_typedef.c +++ b/clang/test/ExtractAPI/anonymous_record_no_typedef.c @@ -1,11 +1,18 @@ // RUN: rm -rf %t // RUN: %clang_cc1 -extract-api --pretty-sgf --emit-sgf-symbol-labels-for-testing \ // RUN: -triple arm64-apple-macosx -isystem %S -fretain-comments-from-system-headers \ -// RUN: -x c-header %s -o %t/output.symbols.json -verify +// RUN: -x c-header %s -o %t/output-c.symbols.json -verify +// +// RUN: %clang_cc1 -extract-api --pretty-sgf --emit-sgf-symbol-labels-for-testing \ +// RUN: -triple arm64-apple-macosx -isystem %S -fretain-comments-from-system-headers \ +// RUN: -x c++-header %s -o %t/output-cxx.symbols.json -verify -// RUN: FileCheck %s --input-file %t/output.symbols.json --check-prefix GLOBAL -// RUN: FileCheck %s --input-file %t/output.symbols.json --check-prefix PREFIX -// RUN: FileCheck %s --input-file %t/output.symbols.json --check-prefix CONTENT +// RUN: FileCheck %s --input-file %t/output-c.symbols.json --check-prefix GLOBAL +// RUN: FileCheck %s --input-file %t/output-c.symbols.json --check-prefix PREFIX +// RUN: FileCheck %s --input-file %t/output-c.symbols.json --check-prefix CONTENT +// RUN: FileCheck %s --input-file %t/output-cxx.symbols.json --check-prefix GLOBAL +// RUN: FileCheck %s --input-file %t/output-cxx.symbols.json --check-prefix PREFIX +// RUN: FileCheck %s --input-file %t/output-cxx.symbols.json --check-prefix CONTENT /// A global variable with an anonymous struct type. struct { char *prefix; char *content; } global; // GLOBAL-LABEL: "!testLabel": "c:@global" @@ -30,7 +37,7 @@ struct { char *prefix; char *content; } global; // GLOBAL: "text": "A global variable with an anonymous struct type." 
// GLOBAL: "kind": { // GLOBAL-NEXT: "displayName": "Global Variable", -// GLOBAL-NEXT: "identifier": "c.var" +// GLOBAL-NEXT: "identifier": "c{{(\+\+)?}}.var" // GLOBAL: "title": "global" // GLOBAL: "pathComponents": [ // GLOBAL-NEXT: "global" @@ -54,9 +61,12 @@ struct { char *prefix; char *content; } global; /// A Vehicle struct Vehicle { - // RUN: FileCheck %s --input-file %t/output.symbols.json --check-prefix TYPE - // RUN: FileCheck %s --input-file %t/output.symbols.json --check-prefix BICYCLE - // RUN: FileCheck %s --input-file %t/output.symbols.json --check-prefix CAR + // RUN: FileCheck %s --input-file %t/output-c.symbols.json --check-prefix TYPE + // RUN: FileCheck %s --input-file %t/output-c.symbols.json --check-prefix BICYCLE + // RUN: FileCheck %s --input-file %t/output-c.symbols.json --check-prefix CAR + // RUN: FileCheck %s --input-file %t/output-cxx.symbols.json --check-prefix TYPE + // RUN: FileCheck %s --input-file %t/output-cxx.symbols.json --check-prefix BICYCLE + // RUN: FileCheck %s --input-file %t/output-cxx.symbols.json --check-prefix CAR /// The type of vehicle. enum { Bicycle, @@ -96,9 +106,12 @@ struct Vehicle { // CAR-NEXT: "Car" // CAR-NEXT: ] - // RUN: FileCheck %s --input-file %t/output.symbols.json --check-prefix INFORMATION - // RUN: FileCheck %s --input-file %t/output.symbols.json --check-prefix WHEELS - // RUN: FileCheck %s --input-file %t/output.symbols.json --check-prefix NAME + // RUN: FileCheck %s --input-file %t/output-c.symbols.json --check-prefix INFORMATION + // RUN: FileCheck %s --input-file %t/output-c.symbols.json --check-prefix WHEELS + // RUN: FileCheck %s --input-file %t/output-c.symbols.json --check-prefix NAME + // RUN: FileCheck %s --input-file %t/output-cxx.symbols.json --check-prefix INFORMATION + // RUN: FileCheck %s --input-file %t/output-cxx.symbols.json --check-prefix WHEELS + // RUN: FileCheck %s --input-file %t/output-cxx.symbols.json --check-prefix NAME /// The information about the vehicle. 
union { int wheels; @@ -145,8 +158,10 @@ struct Vehicle { // NAME-NEXT: ] }; -// RUN: FileCheck %s --input-file %t/output.symbols.json --check-prefix GLOBALCASE -// RUN: FileCheck %s --input-file %t/output.symbols.json --check-prefix GLOBALOTHERCASE +// RUN: FileCheck %s --input-file %t/output-c.symbols.json --check-prefix GLOBALCASE +// RUN: FileCheck %s --input-file %t/output-c.symbols.json --check-prefix GLOBALOTHERCASE +// RUN: FileCheck %s --input-file %t/output-cxx.symbols.json --check-prefix GLOBALCASE +// RUN: FileCheck %s --input-file %t/output-cxx.symbols.json --check-prefix GLOBALOTHERCASE enum { GlobalCase, GlobalOtherCase @@ -163,7 +178,8 @@ enum { // GLOBALOTHERCASE-NEXT: "GlobalOtherCase" // GLOBALOTHERCASE-NEXT: ] -// RUN: FileCheck %s --input-file %t/output.symbols.json --check-prefix VEC +// RUN: FileCheck %s --input-file %t/output-c.symbols.json --check-prefix VEC +// RUN: FileCheck %s --input-file %t/output-cxx.symbols.json --check-prefix VEC union Vector { struct { float X; diff --git a/clang/test/ExtractAPI/typedef_anonymous_record.c b/clang/test/ExtractAPI/typedef_anonymous_record.c index 8e298f8d9ce829d44fe917d76d2241da96197c69..c100e30803e4c83843e2d2d73140005e15ff10c0 100644 --- a/clang/test/ExtractAPI/typedef_anonymous_record.c +++ b/clang/test/ExtractAPI/typedef_anonymous_record.c @@ -1,8 +1,11 @@ // RUN: rm -rf %t // RUN: %clang_cc1 -extract-api --pretty-sgf --emit-sgf-symbol-labels-for-testing \ -// RUN: --product-name=TypedefChain -triple arm64-apple-macosx -x c-header %s -o %t/typedefchain.symbols.json -verify +// RUN: --product-name=TypedefChain -triple arm64-apple-macosx -x c-header %s -o %t/typedefchain-c.symbols.json -verify +// RUN: %clang_cc1 -extract-api --pretty-sgf --emit-sgf-symbol-labels-for-testing \ +// RUN: --product-name=TypedefChain -triple arm64-apple-macosx -x c++-header %s -o %t/typedefchain-cxx.symbols.json -verify -// RUN: FileCheck %s --input-file %t/typedefchain.symbols.json --check-prefix MYSTRUCT +// RUN: FileCheck %s --input-file %t/typedefchain-c.symbols.json --check-prefix MYSTRUCT +// RUN: FileCheck %s --input-file %t/typedefchain-cxx.symbols.json --check-prefix MYSTRUCT typedef struct { } MyStruct; // MYSTRUCT-LABEL: "!testLabel": "c:@SA@MyStruct" // MYSTRUCT: "accessLevel": "public", @@ -34,7 +37,7 @@ typedef struct { } MyStruct; // MYSTRUCT-NEXT: ] // MYSTRUCT: "kind": { // MYSTRUCT-NEXT: "displayName": "Structure", -// MYSTRUCT-NEXT: "identifier": "c.struct" +// MYSTRUCT-NEXT: "identifier": "c{{(\+\+)?}}.struct" // MYSTRUCT: "names": { // MYSTRUCT-NEXT: "navigator": [ // MYSTRUCT-NEXT: { @@ -54,7 +57,8 @@ typedef struct { } MyStruct; // MYSTRUCT-NEXT: "MyStruct" // MYSTRUCT-NEXT: ] -// RUN: FileCheck %s --input-file %t/typedefchain.symbols.json --check-prefix MYSTRUCTSTRUCT +// RUN: FileCheck %s --input-file %t/typedefchain-c.symbols.json --check-prefix MYSTRUCTSTRUCT +// RUN: FileCheck %s --input-file %t/typedefchain-cxx.symbols.json --check-prefix MYSTRUCTSTRUCT typedef MyStruct MyStructStruct; // MYSTRUCTSTRUCT-LABEL: "!testLabel": "c:typedef_anonymous_record.c@T@MyStructStruct" // MYSTRUCTSTRUCT: "accessLevel": "public", @@ -87,10 +91,12 @@ typedef MyStruct MyStructStruct; // MYSTRUCTSTRUCT-NEXT:], // MYSTRUCTSTRUCT: "kind": { // MYSTRUCTSTRUCT-NEXT: "displayName": "Type Alias", -// MYSTRUCTSTRUCT-NEXT: "identifier": "c.typealias" +// MYSTRUCTSTRUCT-NEXT: "identifier": "c{{(\+\+)?}}.typealias" -// RUN: FileCheck %s --input-file %t/typedefchain.symbols.json --check-prefix MYENUM -// RUN: FileCheck %s --input-file 
%t/typedefchain.symbols.json --check-prefix CASE +// RUN: FileCheck %s --input-file %t/typedefchain-c.symbols.json --check-prefix MYENUM +// RUN: FileCheck %s --input-file %t/typedefchain-c.symbols.json --check-prefix CASE +// RUN: FileCheck %s --input-file %t/typedefchain-cxx.symbols.json --check-prefix MYENUM +// RUN: FileCheck %s --input-file %t/typedefchain-cxx.symbols.json --check-prefix CASE typedef enum { Case } MyEnum; // MYENUM: "source": "c:@EA@MyEnum@Case", // MYENUM-NEXT: "target": "c:@EA@MyEnum", @@ -124,7 +130,7 @@ typedef enum { Case } MyEnum; // MYENUM-NEXT:], // MYENUM: "kind": { // MYENUM-NEXT: "displayName": "Enumeration", -// MYENUM-NEXT: "identifier": "c.enum" +// MYENUM-NEXT: "identifier": "c{{(\+\+)?}}.enum" // MYENUM: "names": { // MYENUM-NEXT: "navigator": [ // MYENUM-NEXT: { @@ -147,7 +153,8 @@ typedef enum { Case } MyEnum; // CASE-NEXT: "Case" // CASE-NEXT: ] -// RUN: FileCheck %s --input-file %t/typedefchain.symbols.json --check-prefix MYENUMENUM +// RUN: FileCheck %s --input-file %t/typedefchain-c.symbols.json --check-prefix MYENUMENUM +// RUN: FileCheck %s --input-file %t/typedefchain-cxx.symbols.json --check-prefix MYENUMENUM typedef MyEnum MyEnumEnum; // MYENUMENUM-LABEL: "!testLabel": "c:typedef_anonymous_record.c@T@MyEnumEnum" // MYENUMENUM: "declarationFragments": [ @@ -179,7 +186,7 @@ typedef MyEnum MyEnumEnum; // MYENUMENUM-NEXT: ], // MYENUMENUM: "kind": { // MYENUMENUM-NEXT: "displayName": "Type Alias", -// MYENUMENUM-NEXT: "identifier": "c.typealias" +// MYENUMENUM-NEXT: "identifier": "c{{(\+\+)?}}.typealias" // MYENUMENUM-NEXT: }, // MYENUMENUM: "title": "MyEnumEnum" diff --git a/clang/test/FixIt/fixit-newline-style.c b/clang/test/FixIt/fixit-newline-style.c index 61e4df67e85bacb88182e0e88bf85a4fa8852ccb..2aac143d4d753efa229bd94801f4811515e534df 100644 --- a/clang/test/FixIt/fixit-newline-style.c +++ b/clang/test/FixIt/fixit-newline-style.c @@ -1,11 +1,11 @@ -// RUN: %clang_cc1 -pedantic -Wunused-label -fno-diagnostics-show-line-numbers -x c %s 2>&1 | FileCheck %s -strict-whitespace - -// This file intentionally uses a CRLF newline style -// CHECK: warning: unused label 'ddd' -// CHECK-NEXT: {{^ ddd:}} -// CHECK-NEXT: {{^ \^~~~$}} -// CHECK-NOT: {{^ ;}} -void f(void) { - ddd: - ; -} +// RUN: %clang_cc1 -pedantic -Wunused-label -fno-diagnostics-show-line-numbers -x c %s 2>&1 | FileCheck %s -strict-whitespace + +// This file intentionally uses a CRLF newline style +// CHECK: warning: unused label 'ddd' +// CHECK-NEXT: {{^ ddd:}} +// CHECK-NEXT: {{^ \^~~~$}} +// CHECK-NOT: {{^ ;}} +void f(void) { + ddd: + ; +} diff --git a/clang/test/Frontend/rewrite-includes-mixed-eol-crlf.c b/clang/test/Frontend/rewrite-includes-mixed-eol-crlf.c index d6724444c0667667a03c7ab061453ed564237a1b..2faeaba322921844a642d7c4a0a1c0082b53d1d6 100644 --- a/clang/test/Frontend/rewrite-includes-mixed-eol-crlf.c +++ b/clang/test/Frontend/rewrite-includes-mixed-eol-crlf.c @@ -1,8 +1,8 @@ -// RUN: %clang_cc1 -E -frewrite-includes %s | %clang_cc1 - -// expected-no-diagnostics -// Note: This source file has CRLF line endings. -// This test validates that -frewrite-includes translates the end of line (EOL) -// form used in header files to the EOL form used in the primary source -// file when the files use different EOL forms. -#include "rewrite-includes-mixed-eol-crlf.h" -#include "rewrite-includes-mixed-eol-lf.h" +// RUN: %clang_cc1 -E -frewrite-includes %s | %clang_cc1 - +// expected-no-diagnostics +// Note: This source file has CRLF line endings. 
+// This test validates that -frewrite-includes translates the end of line (EOL) +// form used in header files to the EOL form used in the primary source +// file when the files use different EOL forms. +#include "rewrite-includes-mixed-eol-crlf.h" +#include "rewrite-includes-mixed-eol-lf.h" diff --git a/clang/test/Frontend/rewrite-includes-mixed-eol-crlf.h b/clang/test/Frontend/rewrite-includes-mixed-eol-crlf.h index 0439b88b75e2cfb99be8b2d456d9b022db2eabe5..baedc282296bd7878da8b48f38bd6d0a37e65b56 100644 --- a/clang/test/Frontend/rewrite-includes-mixed-eol-crlf.h +++ b/clang/test/Frontend/rewrite-includes-mixed-eol-crlf.h @@ -1,11 +1,11 @@ -// Note: This header file has CRLF line endings. -// The indentation in some of the conditional inclusion directives below is -// intentional and is required for this test to function as a regression test -// for GH59736. -_Static_assert(__LINE__ == 5, ""); -#if 1 -_Static_assert(__LINE__ == 7, ""); - #if 1 - _Static_assert(__LINE__ == 9, ""); - #endif -#endif +// Note: This header file has CRLF line endings. +// The indentation in some of the conditional inclusion directives below is +// intentional and is required for this test to function as a regression test +// for GH59736. +_Static_assert(__LINE__ == 5, ""); +#if 1 +_Static_assert(__LINE__ == 7, ""); + #if 1 + _Static_assert(__LINE__ == 9, ""); + #endif +#endif diff --git a/clang/test/Frontend/system-header-line-directive-ms-lineendings.c b/clang/test/Frontend/system-header-line-directive-ms-lineendings.c index 92fc07f65e0d4d85bc857c0dd5144e7b1f213d0a..dffdd5cf1959ae7d93e4470201c82be158948223 100644 --- a/clang/test/Frontend/system-header-line-directive-ms-lineendings.c +++ b/clang/test/Frontend/system-header-line-directive-ms-lineendings.c @@ -1,21 +1,21 @@ -// RUN: %clang_cc1 %s -E -o - -I %S/Inputs -isystem %S/Inputs/SystemHeaderPrefix | FileCheck %s -#include <noline.h> -#include <line-directive-in-system.h> - -#include "line-directive.h" - -// This tests that the line numbers for the current file are correctly outputted -// for the include-file-completed test case. This file should be CRLF. - -// CHECK: # 1 "{{.*}}system-header-line-directive-ms-lineendings.c" 2 -// CHECK: # 1 "{{.*}}noline.h" 1 3 -// CHECK: foo(void); -// CHECK: # 3 "{{.*}}system-header-line-directive-ms-lineendings.c" 2 -// CHECK: # 1 "{{.*}}line-directive-in-system.h" 1 3 -// The "3" below indicates that "foo.h" is considered a system header. -// CHECK: # 1 "foo.h" 3 -// CHECK: foo(void); -// CHECK: # 4 "{{.*}}system-header-line-directive-ms-lineendings.c" 2 -// CHECK: # 1 "{{.*}}line-directive.h" 1 -// CHECK: # 10 "foo.h"{{$}} -// CHECK: # 6 "{{.*}}system-header-line-directive-ms-lineendings.c" 2 +// RUN: %clang_cc1 %s -E -o - -I %S/Inputs -isystem %S/Inputs/SystemHeaderPrefix | FileCheck %s +#include <noline.h> +#include <line-directive-in-system.h> + +#include "line-directive.h" + +// This tests that the line numbers for the current file are correctly outputted +// for the include-file-completed test case. This file should be CRLF. + +// CHECK: # 1 "{{.*}}system-header-line-directive-ms-lineendings.c" 2 +// CHECK: # 1 "{{.*}}noline.h" 1 3 +// CHECK: foo(void); +// CHECK: # 3 "{{.*}}system-header-line-directive-ms-lineendings.c" 2 +// CHECK: # 1 "{{.*}}line-directive-in-system.h" 1 3 +// The "3" below indicates that "foo.h" is considered a system header. 
+// CHECK: # 1 "foo.h" 3 +// CHECK: foo(void); +// CHECK: # 4 "{{.*}}system-header-line-directive-ms-lineendings.c" 2 +// CHECK: # 1 "{{.*}}line-directive.h" 1 +// CHECK: # 10 "foo.h"{{$}} +// CHECK: # 6 "{{.*}}system-header-line-directive-ms-lineendings.c" 2 diff --git a/clang/test/Modules/friend-definition-2.cpp b/clang/test/Modules/friend-definition-2.cpp index 41c2141f4013925dbf9a2ee7b14818b9ccd9c97f..d91ce14722b45fc4c5dabe5723e8576de7882994 100644 --- a/clang/test/Modules/friend-definition-2.cpp +++ b/clang/test/Modules/friend-definition-2.cpp @@ -1,32 +1,53 @@ -// RUN: %clang_cc1 -std=c++14 -fmodules %s -verify -// RUN: %clang_cc1 -std=c++14 -fmodules %s -verify -triple i686-windows -// expected-no-diagnostics -#pragma clang module build A -module A {} -#pragma clang module contents -#pragma clang module begin A +// RUN: split-file %s %t + +// RUN: %clang_cc1 -std=c++14 -x c++ -fmodules -fmodule-name=A -emit-module %t/a.modulemap -o %t/a.pcm +// RUN: %clang_cc1 -std=c++14 -x c++ -fmodules -fmodule-name=B -emit-module %t/b.modulemap -o %t/b.pcm +// RUN: %clang_cc1 -std=c++14 -x c++ -fmodules -fmodule-map-file=%t/a.modulemap -fmodule-map-file=%t/b.modulemap \ +// RUN: -fmodule-file=%t/a.pcm -fmodule-file=%t/b.pcm \ +// RUN: %t/use.cc -verify + +// RUN: rm -f %t/*.pcm + +// RUN: %clang_cc1 -std=c++14 -x c++ -fmodules -fmodule-name=A -emit-module %t/a.modulemap -o %t/a.pcm -triple i686-windows +// RUN: %clang_cc1 -std=c++14 -x c++ -fmodules -fmodule-name=B -emit-module %t/b.modulemap -o %t/b.pcm -triple i686-windows +// RUN: %clang_cc1 -std=c++14 -x c++ -fmodules -fmodule-map-file=%t/a.modulemap -fmodule-map-file=%t/b.modulemap \ +// RUN: -fmodule-file=%t/a.pcm -fmodule-file=%t/b.pcm \ +// RUN: %t/use.cc -verify -triple i686-windows + +//--- a.modulemap +module A { + header "a.h" +} + +//--- a.h +#ifndef A_H +#define A_H +template<typename T> struct ct { friend auto operator-(ct, ct) { struct X {}; return X(); } void x(); }; +#endif + +//--- b.modulemap +module B { + header "b.h" +} + +//--- b.h +#ifndef B_H +#define B_H template<typename T> struct ct { friend auto operator-(ct, ct) { struct X {}; return X(); } void x(); }; -#pragma clang module end -#pragma clang module endbuild - -#pragma clang module build B -module B {} -#pragma clang module contents -#pragma clang module begin B -template<typename T> struct ct { friend auto operator-(ct, ct) { struct X{}; return X(); } void x(); }; inline auto f() { return ct<int>() - ct<int>(); } -#pragma clang module end -#pragma clang module endbuild +#endif +//--- use.cc +// expected-no-diagnostics // Force the definition of ct in module A to be the primary definition. -#pragma clang module import A +#include "a.h" template<typename T> void ct<T>::x() {} // Attempt to cause the definition of operator- in the ct primary template in // module B to be the primary definition of that function. If that happens, // we'll be left with a class template ct that appears to not contain a // definition of the inline friend function. 
-#pragma clang module import B +#include "b.h" auto v = f(); ct<int> make(); diff --git a/clang/test/Modules/no-external-type-id.cppm b/clang/test/Modules/no-external-type-id.cppm index 068e52646dcc1c10717399838383b4a9193bae62..a4ca389739fbb5c309949fe207d11b1d62611629 100644 --- a/clang/test/Modules/no-external-type-id.cppm +++ b/clang/test/Modules/no-external-type-id.cppm @@ -23,7 +23,7 @@ export module b; import a; export int b(); -// CHECK: (); // CHECK: @_ZTVW3Mod11NonTemplate = {{.*}}external // CHECK: @_ZTVW3Mod8TemplateIcE = {{.*}}external // CHECK: @_ZTVW3Mod8TemplateIjE = {{.*}}weak_odr -// CHECK: @_ZTSW3Mod8TemplateIjE = {{.*}}weak_odr // CHECK: @_ZTIW3Mod8TemplateIjE = {{.*}}weak_odr +// CHECK: @_ZTSW3Mod8TemplateIjE = {{.*}}weak_odr // CHECK: @_ZTVW3Mod8TemplateIdE = {{.*}}external // CHECK: @_ZTVW3Mod8TemplateIiE = {{.*}}linkonce_odr -// CHECK: @_ZTSW3Mod8TemplateIiE = {{.*}}linkonce_odr // CHECK: @_ZTIW3Mod8TemplateIiE = {{.*}}linkonce_odr +// CHECK: @_ZTSW3Mod8TemplateIiE = {{.*}}linkonce_odr // CHECK: @_ZTVW3Mod8TemplateIS_11NonTemplateE = {{.*}}linkonce_odr -// CHECK: @_ZTSW3Mod8TemplateIS_11NonTemplateE = {{.*}}linkonce_odr // CHECK: @_ZTIW3Mod8TemplateIS_11NonTemplateE = {{.*}}linkonce_odr +// CHECK: @_ZTSW3Mod8TemplateIS_11NonTemplateE = {{.*}}linkonce_odr diff --git a/clang/test/OpenMP/bug57757.cpp b/clang/test/OpenMP/bug57757.cpp index 240b22a30671437cace156ffbcef4008f80dc8a6..eabf233dde24745188fca9d829e0436ac9558339 100644 --- a/clang/test/OpenMP/bug57757.cpp +++ b/clang/test/OpenMP/bug57757.cpp @@ -39,7 +39,7 @@ void foo() { // CHECK-NEXT: ] // CHECK: .untied.jmp..i: // CHECK-NEXT: store i32 1, ptr [[TMP2]], align 4, !tbaa [[TBAA16]], !alias.scope [[META13]], !noalias [[META17]] -// CHECK-NEXT: [[TMP4:%.*]] = tail call i32 @__kmpc_omp_task(ptr nonnull @[[GLOB1]], i32 [[TMP0]], ptr [[TMP1]]), !noalias [[META13]] +// CHECK-NEXT: [[TMP4:%.*]] = tail call i32 @__kmpc_omp_task(ptr nonnull @[[GLOB1]], i32 [[TMP0]], ptr nonnull [[TMP1]]), !noalias [[META13]] // CHECK-NEXT: br label [[DOTOMP_OUTLINED__EXIT]] // CHECK: .untied.next..i: // CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 40 diff --git a/clang/test/ParserHLSL/bitfields.hlsl b/clang/test/ParserHLSL/bitfields.hlsl index 307d1143a068e2358cba56c181ea8f48766f958a..57b6705babdc12db2cdaf1cdbd428fd308b7711d 100644 --- a/clang/test/ParserHLSL/bitfields.hlsl +++ b/clang/test/ParserHLSL/bitfields.hlsl @@ -1,31 +1,31 @@ -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -ast-dump -x hlsl -o - %s | FileCheck %s - - -struct MyBitFields { - // CHECK: FieldDecl 0x{{[0-9a-f]+}} col:16 referenced field1 'unsigned int' - // CHECK:-ConstantExpr 0x{{[0-9a-f]+}} 'int' - // CHECK:-value: Int 3 - // CHECK:-IntegerLiteral 0x{{[0-9a-f]+}} 'int' 3 - unsigned int field1 : 3; // 3 bits for field1 - - // CHECK:FieldDecl 0x{{[0-9a-f]+}} col:16 referenced field2 'unsigned int' - // CHECK:-ConstantExpr 0x{{[0-9a-f]+}} 'int' - // CHECK:-value: Int 4 - // CHECK:-IntegerLiteral 0x{{[0-9a-f]+}} 'int' 4 - unsigned int field2 : 4; // 4 bits for field2 - - // CHECK:FieldDecl 0x{{[0-9a-f]+}} col:7 field3 'int' - // CHECK:-ConstantExpr 0x{{[0-9a-f]+}} 'int' - // CHECK:-value: Int 5 - // CHECK:-IntegerLiteral 0x{{[0-9a-f]+}} 'int' 5 - int field3 : 5; // 5 bits for field3 (signed) -}; - - - -[numthreads(1,1,1)] -void main() { - MyBitFields m; - m.field1 = 4; - m.field2 = m.field1*2; +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -ast-dump -x hlsl -o - %s | FileCheck %s + + +struct MyBitFields { + // CHECK: 
FieldDecl 0x{{[0-9a-f]+}} col:16 referenced field1 'unsigned int' + // CHECK:-ConstantExpr 0x{{[0-9a-f]+}} 'int' + // CHECK:-value: Int 3 + // CHECK:-IntegerLiteral 0x{{[0-9a-f]+}} 'int' 3 + unsigned int field1 : 3; // 3 bits for field1 + + // CHECK:FieldDecl 0x{{[0-9a-f]+}} col:16 referenced field2 'unsigned int' + // CHECK:-ConstantExpr 0x{{[0-9a-f]+}} 'int' + // CHECK:-value: Int 4 + // CHECK:-IntegerLiteral 0x{{[0-9a-f]+}} 'int' 4 + unsigned int field2 : 4; // 4 bits for field2 + + // CHECK:FieldDecl 0x{{[0-9a-f]+}} col:7 field3 'int' + // CHECK:-ConstantExpr 0x{{[0-9a-f]+}} 'int' + // CHECK:-value: Int 5 + // CHECK:-IntegerLiteral 0x{{[0-9a-f]+}} 'int' 5 + int field3 : 5; // 5 bits for field3 (signed) +}; + + + +[numthreads(1,1,1)] +void main() { + MyBitFields m; + m.field1 = 4; + m.field2 = m.field1*2; } \ No newline at end of file diff --git a/clang/test/ParserHLSL/hlsl_annotations_on_struct_members.hlsl b/clang/test/ParserHLSL/hlsl_annotations_on_struct_members.hlsl index 2eebc920388b5b278a0273df6fabf37e6c86385b..5b228d039345e1f37a31ecdfb5d31bcfb36ec236 100644 --- a/clang/test/ParserHLSL/hlsl_annotations_on_struct_members.hlsl +++ b/clang/test/ParserHLSL/hlsl_annotations_on_struct_members.hlsl @@ -1,21 +1,21 @@ -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -x hlsl -ast-dump -o - %s | FileCheck %s - -// tests that hlsl annotations are properly parsed when applied on field decls, -// and that the annotation gets properly placed on the AST. - -struct Eg9{ - // CHECK: CXXRecordDecl 0x{{[0-9a-f]+}} col:8 implicit struct Eg9 - // CHECK: FieldDecl 0x{{[0-9a-f]+}} col:16 referenced a 'unsigned int' - // CHECK: -HLSLSV_DispatchThreadIDAttr 0x{{[0-9a-f]+}} - unsigned int a : SV_DispatchThreadID; -}; -Eg9 e9; - - -RWBuffer<unsigned> In : register(u1); - - -[numthreads(1,1,1)] -void main() { - In[0] = e9.a; -} +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -x hlsl -ast-dump -o - %s | FileCheck %s + +// tests that hlsl annotations are properly parsed when applied on field decls, +// and that the annotation gets properly placed on the AST. 
+ +struct Eg9{ + // CHECK: CXXRecordDecl 0x{{[0-9a-f]+}} col:8 implicit struct Eg9 + // CHECK: FieldDecl 0x{{[0-9a-f]+}} col:16 referenced a 'unsigned int' + // CHECK: -HLSLSV_DispatchThreadIDAttr 0x{{[0-9a-f]+}} + unsigned int a : SV_DispatchThreadID; +}; +Eg9 e9; + + +RWBuffer<unsigned> In : register(u1); + + +[numthreads(1,1,1)] +void main() { + In[0] = e9.a; +} diff --git a/clang/test/ParserHLSL/hlsl_contained_type_attr.hlsl b/clang/test/ParserHLSL/hlsl_contained_type_attr.hlsl index 2b7bd31023222e934784726ffbe3093737ce3514..476ec39e14da98c60c49f1d609ff51866de773e5 100644 --- a/clang/test/ParserHLSL/hlsl_contained_type_attr.hlsl +++ b/clang/test/ParserHLSL/hlsl_contained_type_attr.hlsl @@ -1,28 +1,25 @@ -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -std=hlsl202x -x hlsl -ast-dump -o - %s | FileCheck %s - -typedef vector<float, 4> float4; - -// CHECK: -TypeAliasDecl 0x{{[0-9a-f]+}} -// CHECK: -HLSLAttributedResourceType 0x{{[0-9a-f]+}} '__hlsl_resource_t -// CHECK-SAME{LITERAL}: [[hlsl::resource_class(UAV)]] -// CHECK-SAME{LITERAL}: [[hlsl::contained_type(int)]] -// CHECK-SAME: ' sugar -using ResourceIntAliasT = __hlsl_resource_t [[hlsl::resource_class(UAV)]] [[hlsl::contained_type(int)]]; -ResourceIntAliasT h1; - -// CHECK: -VarDecl 0x{{[0-9a-f]+}} col:82 h2 '__hlsl_resource_t -// CHECK-SAME{LITERAL}: [[hlsl::resource_class(UAV)]] -// CHECK-SAME{LITERAL}: [[hlsl::contained_type(float4)]] -// CHECK-SAME: ':'__hlsl_resource_t' -__hlsl_resource_t [[hlsl::resource_class(UAV)]] [[hlsl::contained_type(float4)]] h2; - -// CHECK: ClassTemplateDecl 0x{{[0-9a-f]+}} line:[[# @LINE + 7]]:30 S -// CHECK: TemplateTypeParmDecl 0x{{[0-9a-f]+}} col:20 referenced typename depth 0 index 0 T -// CHECK: CXXRecordDecl 0x{{[0-9a-f]+}} line:[[# @LINE + 5]]:30 struct S definition -// CHECK: FieldDecl 0x{{[0-9a-f]+}} col:79 h '__hlsl_resource_t -// CHECK-SAME{LITERAL}: [[hlsl::resource_class(UAV)]] -// CHECK-SAME{LITERAL}: [[hlsl::contained_type(T)]] -// CHECK-SAME: ':'__hlsl_resource_t' -template <typename T> struct S { - __hlsl_resource_t [[hlsl::resource_class(UAV)]] [[hlsl::contained_type(T)]] h; -}; +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -std=hlsl202x -x hlsl -ast-dump -o - %s | FileCheck %s + +typedef vector<float, 4> float4; + +// CHECK: -TypeAliasDecl 0x{{[0-9a-f]+}} +// CHECK: -HLSLAttributedResourceType 0x{{[0-9a-f]+}} '__hlsl_resource_t +// CHECK-SAME{LITERAL}: [[hlsl::resource_class(UAV)]] +// CHECK-SAME{LITERAL}: [[hlsl::contained_type(int)]] +using ResourceIntAliasT = __hlsl_resource_t [[hlsl::resource_class(UAV)]] [[hlsl::contained_type(int)]]; +ResourceIntAliasT h1; + +// CHECK: -VarDecl 0x{{[0-9a-f]+}} col:82 h2 '__hlsl_resource_t +// CHECK-SAME{LITERAL}: [[hlsl::resource_class(UAV)]] +// CHECK-SAME{LITERAL}: [[hlsl::contained_type(float4)]] +__hlsl_resource_t [[hlsl::resource_class(UAV)]] [[hlsl::contained_type(float4)]] h2; + +// CHECK: ClassTemplateDecl 0x{{[0-9a-f]+}} line:[[# @LINE + 6]]:30 S +// CHECK: TemplateTypeParmDecl 0x{{[0-9a-f]+}} col:20 referenced typename depth 0 index 0 T +// CHECK: CXXRecordDecl 0x{{[0-9a-f]+}} line:[[# @LINE + 4]]:30 struct S definition +// CHECK: FieldDecl 0x{{[0-9a-f]+}} col:79 h '__hlsl_resource_t +// CHECK-SAME{LITERAL}: [[hlsl::resource_class(UAV)]] +// CHECK-SAME{LITERAL}: [[hlsl::contained_type(T)]] +template <typename T> struct S { + __hlsl_resource_t [[hlsl::resource_class(UAV)]] [[hlsl::contained_type(T)]] h; +}; diff --git a/clang/test/ParserHLSL/hlsl_contained_type_attr_error.hlsl b/clang/test/ParserHLSL/hlsl_contained_type_attr_error.hlsl index 
b2d492d95945c168858cee365f7cbae966cb0f20..673ff8693b83b8539130af6499d31ee8b2f19a01 100644 --- a/clang/test/ParserHLSL/hlsl_contained_type_attr_error.hlsl +++ b/clang/test/ParserHLSL/hlsl_contained_type_attr_error.hlsl @@ -1,28 +1,28 @@ -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -std=hlsl202x -x hlsl -o - %s -verify - -typedef vector<float, 4> float4; - -// expected-error@+1{{'contained_type' attribute cannot be applied to a declaration}} -[[hlsl::contained_type(float4)]] __hlsl_resource_t h1; - -// expected-error@+1{{'contained_type' attribute takes one argument}} -__hlsl_resource_t [[hlsl::resource_class(UAV)]] [[hlsl::contained_type()]] h3; - -// expected-error@+1{{expected a type}} -__hlsl_resource_t [[hlsl::resource_class(UAV)]] [[hlsl::contained_type(0)]] h4; - -// expected-error@+1{{unknown type name 'a'}} -__hlsl_resource_t [[hlsl::resource_class(UAV)]] [[hlsl::contained_type(a)]] h5; - -// expected-error@+1{{expected a type}} -__hlsl_resource_t [[hlsl::resource_class(UAV)]] [[hlsl::contained_type("b", c)]] h6; - -// expected-warning@+1{{attribute 'contained_type' is already applied}} -__hlsl_resource_t [[hlsl::resource_class(UAV)]] [[hlsl::contained_type(float)]] [[hlsl::contained_type(float)]] h7; - -// expected-warning@+1{{attribute 'contained_type' is already applied with different arguments}} -__hlsl_resource_t [[hlsl::resource_class(UAV)]] [[hlsl::contained_type(float)]] [[hlsl::contained_type(int)]] h8; - -// expected-error@+2{{attribute 'resource_class' can be used only on HLSL intangible type '__hlsl_resource_t'}} -// expected-error@+1{{attribute 'contained_type' can be used only on HLSL intangible type '__hlsl_resource_t'}} -float [[hlsl::resource_class(UAV)]] [[hlsl::contained_type(float)]] res5; +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -std=hlsl202x -x hlsl -o - %s -verify + +typedef vector<float, 4> float4; + +// expected-error@+1{{'contained_type' attribute cannot be applied to a declaration}} +[[hlsl::contained_type(float4)]] __hlsl_resource_t h1; + +// expected-error@+1{{'contained_type' attribute takes one argument}} +__hlsl_resource_t [[hlsl::resource_class(UAV)]] [[hlsl::contained_type()]] h3; + +// expected-error@+1{{expected a type}} +__hlsl_resource_t [[hlsl::resource_class(UAV)]] [[hlsl::contained_type(0)]] h4; + +// expected-error@+1{{unknown type name 'a'}} +__hlsl_resource_t [[hlsl::resource_class(UAV)]] [[hlsl::contained_type(a)]] h5; + +// expected-error@+1{{expected a type}} +__hlsl_resource_t [[hlsl::resource_class(UAV)]] [[hlsl::contained_type("b", c)]] h6; + +// expected-warning@+1{{attribute 'contained_type' is already applied}} +__hlsl_resource_t [[hlsl::resource_class(UAV)]] [[hlsl::contained_type(float)]] [[hlsl::contained_type(float)]] h7; + +// expected-warning@+1{{attribute 'contained_type' is already applied with different arguments}} +__hlsl_resource_t [[hlsl::resource_class(UAV)]] [[hlsl::contained_type(float)]] [[hlsl::contained_type(int)]] h8; + +// expected-error@+2{{attribute 'resource_class' can be used only on HLSL intangible type '__hlsl_resource_t'}} +// expected-error@+1{{attribute 'contained_type' can be used only on HLSL intangible type '__hlsl_resource_t'}} +float [[hlsl::resource_class(UAV)]] [[hlsl::contained_type(float)]] res5; diff --git a/clang/test/ParserHLSL/hlsl_is_rov_attr.hlsl b/clang/test/ParserHLSL/hlsl_is_rov_attr.hlsl index fdf2aacf4a4dedc7f018203e29f5228a34d0cee4..487dc32413032daeede8363e2f6b8e15ed0a9596 100644 --- a/clang/test/ParserHLSL/hlsl_is_rov_attr.hlsl +++ 
b/clang/test/ParserHLSL/hlsl_is_rov_attr.hlsl @@ -1,25 +1,22 @@ -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -x hlsl -ast-dump -o - %s | FileCheck %s - -// CHECK: CXXRecordDecl 0x{{[0-9a-f]+}} {{.*}} struct MyBuffer definition -// CHECK: FieldDecl 0x{{[0-9a-f]+}} col:68 h '__hlsl_resource_t -// CHECK-SAME{LITERAL}: [[hlsl::resource_class(UAV)]] -// CHECK-SAME{LITERAL}: [[hlsl::is_rov]] -// CHECK-SAME: ':'__hlsl_resource_t' -struct MyBuffer { - __hlsl_resource_t [[hlsl::resource_class(UAV)]] [[hlsl::is_rov]] h; -}; - -// CHECK: VarDecl 0x{{[0-9a-f]+}} col:66 res '__hlsl_resource_t -// CHECK-SAME{LITERAL}: [[hlsl::resource_class(SRV)]] -// CHECK-SAME{LITERAL}: [[hlsl::is_rov]] -// CHECK-SAME: ':'__hlsl_resource_t' -__hlsl_resource_t [[hlsl::is_rov]] [[hlsl::resource_class(SRV)]] res; - -// CHECK: FunctionDecl 0x{{[0-9a-f]+}} line:[[# @LINE + 5]]:6 f 'void () -// CHECK: VarDecl 0x{{[0-9a-f]+}} col:72 r '__hlsl_resource_t -// CHECK-SAME{LITERAL}: [[hlsl::resource_class(Sampler)]] -// CHECK-SAME{LITERAL}: [[hlsl::is_rov]] -// CHECK-SAME: ':'__hlsl_resource_t' -void f() { - __hlsl_resource_t [[hlsl::resource_class(Sampler)]] [[hlsl::is_rov]] r; -} +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -x hlsl -ast-dump -o - %s | FileCheck %s + +// CHECK: CXXRecordDecl 0x{{[0-9a-f]+}} {{.*}} struct MyBuffer definition +// CHECK: FieldDecl 0x{{[0-9a-f]+}} col:68 h '__hlsl_resource_t +// CHECK-SAME{LITERAL}: [[hlsl::resource_class(UAV)]] +// CHECK-SAME{LITERAL}: [[hlsl::is_rov]] +struct MyBuffer { + __hlsl_resource_t [[hlsl::resource_class(UAV)]] [[hlsl::is_rov]] h; +}; + +// CHECK: VarDecl 0x{{[0-9a-f]+}} col:66 res '__hlsl_resource_t +// CHECK-SAME{LITERAL}: [[hlsl::resource_class(SRV)]] +// CHECK-SAME{LITERAL}: [[hlsl::is_rov]] +__hlsl_resource_t [[hlsl::is_rov]] [[hlsl::resource_class(SRV)]] res; + +// CHECK: FunctionDecl 0x{{[0-9a-f]+}} line:[[# @LINE + 4]]:6 f 'void () +// CHECK: VarDecl 0x{{[0-9a-f]+}} col:72 r '__hlsl_resource_t +// CHECK-SAME{LITERAL}: [[hlsl::resource_class(Sampler)]] +// CHECK-SAME{LITERAL}: [[hlsl::is_rov]] +void f() { + __hlsl_resource_t [[hlsl::resource_class(Sampler)]] [[hlsl::is_rov]] r; +} diff --git a/clang/test/ParserHLSL/hlsl_is_rov_attr_error.hlsl b/clang/test/ParserHLSL/hlsl_is_rov_attr_error.hlsl index 3b2c12e7a96c5c3b0fa6ca689c7496026893c9fc..9bb64ea990e284232f7a71f977becd0d7f67f954 100644 --- a/clang/test/ParserHLSL/hlsl_is_rov_attr_error.hlsl +++ b/clang/test/ParserHLSL/hlsl_is_rov_attr_error.hlsl @@ -1,20 +1,20 @@ -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -x hlsl -o - %s -verify - -// expected-error@+1{{'is_rov' attribute cannot be applied to a declaration}} -[[hlsl::is_rov]] __hlsl_resource_t res0; - -// expected-error@+1{{HLSL resource needs to have [[hlsl::resource_class()]] attribute}} -__hlsl_resource_t [[hlsl::is_rov]] res1; - -// expected-error@+1{{'is_rov' attribute takes no arguments}} -__hlsl_resource_t [[hlsl::resource_class(UAV)]] [[hlsl::is_rov(3)]] res2; - -// expected-error@+1{{use of undeclared identifier 'gibberish'}} -__hlsl_resource_t [[hlsl::resource_class(UAV)]] [[hlsl::is_rov(gibberish)]] res3; - -// expected-warning@+1{{attribute 'is_rov' is already applied}} -__hlsl_resource_t [[hlsl::resource_class(UAV)]] [[hlsl::is_rov]] [[hlsl::is_rov]] res4; - -// expected-error@+2{{attribute 'resource_class' can be used only on HLSL intangible type '__hlsl_resource_t'}} -// expected-error@+1{{attribute 'is_rov' can be used only on HLSL intangible type '__hlsl_resource_t'}} -float [[hlsl::resource_class(UAV)]] 
[[hlsl::is_rov]] res5; +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -x hlsl -o - %s -verify + +// expected-error@+1{{'is_rov' attribute cannot be applied to a declaration}} +[[hlsl::is_rov]] __hlsl_resource_t res0; + +// expected-error@+1{{HLSL resource needs to have [[hlsl::resource_class()]] attribute}} +__hlsl_resource_t [[hlsl::is_rov]] res1; + +// expected-error@+1{{'is_rov' attribute takes no arguments}} +__hlsl_resource_t [[hlsl::resource_class(UAV)]] [[hlsl::is_rov(3)]] res2; + +// expected-error@+1{{use of undeclared identifier 'gibberish'}} +__hlsl_resource_t [[hlsl::resource_class(UAV)]] [[hlsl::is_rov(gibberish)]] res3; + +// expected-warning@+1{{attribute 'is_rov' is already applied}} +__hlsl_resource_t [[hlsl::resource_class(UAV)]] [[hlsl::is_rov]] [[hlsl::is_rov]] res4; + +// expected-error@+2{{attribute 'resource_class' can be used only on HLSL intangible type '__hlsl_resource_t'}} +// expected-error@+1{{attribute 'is_rov' can be used only on HLSL intangible type '__hlsl_resource_t'}} +float [[hlsl::resource_class(UAV)]] [[hlsl::is_rov]] res5; diff --git a/clang/test/ParserHLSL/hlsl_raw_buffer_attr.hlsl b/clang/test/ParserHLSL/hlsl_raw_buffer_attr.hlsl index 71bf300ee7aec5bfc732ab72c4947d19c078e9f4..e09ed5586c1025b4431a8bc82cfccaeba60c175d 100644 --- a/clang/test/ParserHLSL/hlsl_raw_buffer_attr.hlsl +++ b/clang/test/ParserHLSL/hlsl_raw_buffer_attr.hlsl @@ -1,25 +1,22 @@ -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -x hlsl -ast-dump -o - %s | FileCheck %s - -// CHECK: CXXRecordDecl 0x{{[0-9a-f]+}} {{.*}} struct MyBuffer definition -// CHECK: FieldDecl 0x{{[0-9a-f]+}} col:72 h1 '__hlsl_resource_t -// CHECK-SAME{LITERAL}: [[hlsl::resource_class(UAV)]] -// CHECK-SAME{LITERAL}: [[hlsl::raw_buffer]] -// CHECK-SAME: ':'__hlsl_resource_t' -struct MyBuffer { - __hlsl_resource_t [[hlsl::resource_class(UAV)]] [[hlsl::raw_buffer]] h1; -}; - -// CHECK: VarDecl 0x{{[0-9a-f]+}} col:70 h2 '__hlsl_resource_t -// CHECK-SAME{LITERAL}: [[hlsl::resource_class(SRV)]] -// CHECK-SAME{LITERAL}: [[hlsl::raw_buffer]] -// CHECK-SAME: ':'__hlsl_resource_t' -__hlsl_resource_t [[hlsl::raw_buffer]] [[hlsl::resource_class(SRV)]] h2; - -// CHECK: FunctionDecl 0x{{[0-9a-f]+}} line:[[# @LINE + 5]]:6 f 'void () -// CHECK: VarDecl 0x{{[0-9a-f]+}} col:72 h3 '__hlsl_resource_t -// CHECK-SAME{LITERAL}: [[hlsl::resource_class(UAV)]] -// CHECK-SAME{LITERAL}: [[hlsl::raw_buffer]] -// CHECK-SAME: ':'__hlsl_resource_t' -void f() { - __hlsl_resource_t [[hlsl::resource_class(UAV)]] [[hlsl::raw_buffer]] h3; -} +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -x hlsl -ast-dump -o - %s | FileCheck %s + +// CHECK: CXXRecordDecl 0x{{[0-9a-f]+}} {{.*}} struct MyBuffer definition +// CHECK: FieldDecl 0x{{[0-9a-f]+}} col:72 h1 '__hlsl_resource_t +// CHECK-SAME{LITERAL}: [[hlsl::resource_class(UAV)]] +// CHECK-SAME{LITERAL}: [[hlsl::raw_buffer]] +struct MyBuffer { + __hlsl_resource_t [[hlsl::resource_class(UAV)]] [[hlsl::raw_buffer]] h1; +}; + +// CHECK: VarDecl 0x{{[0-9a-f]+}} col:70 h2 '__hlsl_resource_t +// CHECK-SAME{LITERAL}: [[hlsl::resource_class(SRV)]] +// CHECK-SAME{LITERAL}: [[hlsl::raw_buffer]] +__hlsl_resource_t [[hlsl::raw_buffer]] [[hlsl::resource_class(SRV)]] h2; + +// CHECK: FunctionDecl 0x{{[0-9a-f]+}} line:[[# @LINE + 4]]:6 f 'void () +// CHECK: VarDecl 0x{{[0-9a-f]+}} col:72 h3 '__hlsl_resource_t +// CHECK-SAME{LITERAL}: [[hlsl::resource_class(UAV)]] +// CHECK-SAME{LITERAL}: [[hlsl::raw_buffer]] +void f() { + __hlsl_resource_t [[hlsl::resource_class(UAV)]] 
[[hlsl::raw_buffer]] h3; +} diff --git a/clang/test/ParserHLSL/hlsl_raw_buffer_attr_error.hlsl b/clang/test/ParserHLSL/hlsl_raw_buffer_attr_error.hlsl index 77530cbf9e4d92abb1ad6c9470d7d0ba874233d4..a10aca4e96fc5362547caa9c9ffbc4f955e6b9ca 100644 --- a/clang/test/ParserHLSL/hlsl_raw_buffer_attr_error.hlsl +++ b/clang/test/ParserHLSL/hlsl_raw_buffer_attr_error.hlsl @@ -1,17 +1,17 @@ -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -x hlsl -o - %s -verify - -// expected-error@+1{{'raw_buffer' attribute cannot be applied to a declaration}} -[[hlsl::raw_buffer]] __hlsl_resource_t res0; - -// expected-error@+1{{'raw_buffer' attribute takes no arguments}} -__hlsl_resource_t [[hlsl::resource_class(UAV)]] [[hlsl::raw_buffer(3)]] res2; - -// expected-error@+1{{use of undeclared identifier 'gibberish'}} -__hlsl_resource_t [[hlsl::resource_class(UAV)]] [[hlsl::raw_buffer(gibberish)]] res3; - -// expected-warning@+1{{attribute 'raw_buffer' is already applied}} -__hlsl_resource_t [[hlsl::resource_class(UAV)]] [[hlsl::raw_buffer]] [[hlsl::raw_buffer]] res4; - -// expected-error@+2{{attribute 'resource_class' can be used only on HLSL intangible type '__hlsl_resource_t'}} -// expected-error@+1{{attribute 'raw_buffer' can be used only on HLSL intangible type '__hlsl_resource_t'}} -float [[hlsl::resource_class(UAV)]] [[hlsl::raw_buffer]] res5; +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -x hlsl -o - %s -verify + +// expected-error@+1{{'raw_buffer' attribute cannot be applied to a declaration}} +[[hlsl::raw_buffer]] __hlsl_resource_t res0; + +// expected-error@+1{{'raw_buffer' attribute takes no arguments}} +__hlsl_resource_t [[hlsl::resource_class(UAV)]] [[hlsl::raw_buffer(3)]] res2; + +// expected-error@+1{{use of undeclared identifier 'gibberish'}} +__hlsl_resource_t [[hlsl::resource_class(UAV)]] [[hlsl::raw_buffer(gibberish)]] res3; + +// expected-warning@+1{{attribute 'raw_buffer' is already applied}} +__hlsl_resource_t [[hlsl::resource_class(UAV)]] [[hlsl::raw_buffer]] [[hlsl::raw_buffer]] res4; + +// expected-error@+2{{attribute 'resource_class' can be used only on HLSL intangible type '__hlsl_resource_t'}} +// expected-error@+1{{attribute 'raw_buffer' can be used only on HLSL intangible type '__hlsl_resource_t'}} +float [[hlsl::resource_class(UAV)]] [[hlsl::raw_buffer]] res5; diff --git a/clang/test/ParserHLSL/hlsl_resource_class_attr.hlsl b/clang/test/ParserHLSL/hlsl_resource_class_attr.hlsl index 305fd95ab1ebc70ede1d29d1e56fb4f6548cf73a..9fee9edddf619acfb2a369ec19030a65a23a9a44 100644 --- a/clang/test/ParserHLSL/hlsl_resource_class_attr.hlsl +++ b/clang/test/ParserHLSL/hlsl_resource_class_attr.hlsl @@ -1,42 +1,37 @@ -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -x hlsl -ast-dump -o - %s | FileCheck %s - -// CHECK: CXXRecordDecl 0x{{[0-9a-f]+}} {{.*}} struct MyBuffer definition -// CHECK: FieldDecl 0x{{[0-9a-f]+}} col:51 h '__hlsl_resource_t -// CHECK-SAME{LITERAL}: [[hlsl::resource_class(UAV)]] -// CHECK-SAME: ':'__hlsl_resource_t' -struct MyBuffer { - __hlsl_resource_t [[hlsl::resource_class(UAV)]] h; -}; - -// CHECK: VarDecl 0x{{[0-9a-f]+}} col:49 res '__hlsl_resource_t -// CHECK-SAME{LITERAL}: [[hlsl::resource_class(SRV)]] -// CHECK-SAME: ':'__hlsl_resource_t' -__hlsl_resource_t [[hlsl::resource_class(SRV)]] res; - -// CHECK: FunctionDecl 0x{{[0-9a-f]+}} line:[[# @LINE + 4]]:6 f 'void () -// CHECK: VarDecl 0x{{[0-9a-f]+}} col:55 r '__hlsl_resource_t -// CHECK-SAME{LITERAL}: [[hlsl::resource_class(Sampler)]] -// CHECK-SAME: ':'__hlsl_resource_t' -void f() { - 
__hlsl_resource_t [[hlsl::resource_class(Sampler)]] r; -} - -// CHECK: ClassTemplateDecl 0x{{[0-9a-f]+}} line:[[# @LINE + 7]]:29 MyBuffer2 -// CHECK: TemplateTypeParmDecl 0x{{[0-9a-f]+}} col:19 typename depth 0 index 0 T -// CHECK: CXXRecordDecl 0x{{[0-9a-f]+}} line:[[# @LINE + 5]]:29 struct MyBuffer2 definition -// CHECK: CXXRecordDecl 0x{{[0-9a-f]+}} col:29 implicit struct MyBuffer2 -// CHECK: FieldDecl 0x{{[0-9a-f]+}} col:51 h '__hlsl_resource_t -// CHECK-SAME{LITERAL}: [[hlsl::resource_class(UAV)]] -// CHECK-SAME: ':'__hlsl_resource_t' -template<typename T> struct MyBuffer2 { - __hlsl_resource_t [[hlsl::resource_class(UAV)]] h; -}; - -// CHECK: ClassTemplateSpecializationDecl 0x{{[0-9a-f]+}} line:[[# @LINE - 4]]:29 struct MyBuffer2 definition implicit_instantiation -// CHECK: TemplateArgument type 'float' -// CHECK: BuiltinType 0x{{[0-9a-f]+}} 'float' -// CHECK: CXXRecordDecl 0x{{[0-9a-f]+}} col:29 implicit struct MyBuffer2 -// CHECK: FieldDecl 0x{{[0-9a-f]+}} col:51 h '__hlsl_resource_t -// CHECK-SAME{LITERAL}: [[hlsl::resource_class(UAV)]] -// CHECK-SAME: ':'__hlsl_resource_t' -MyBuffer2<float> myBuffer2; +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -x hlsl -ast-dump -o - %s | FileCheck %s + +// CHECK: CXXRecordDecl 0x{{[0-9a-f]+}} {{.*}} struct MyBuffer definition +// CHECK: FieldDecl 0x{{[0-9a-f]+}} col:51 h '__hlsl_resource_t +// CHECK-SAME{LITERAL}: [[hlsl::resource_class(UAV)]] +struct MyBuffer { + __hlsl_resource_t [[hlsl::resource_class(UAV)]] h; +}; + +// CHECK: VarDecl 0x{{[0-9a-f]+}} col:49 res '__hlsl_resource_t +// CHECK-SAME{LITERAL}: [[hlsl::resource_class(SRV)]] +__hlsl_resource_t [[hlsl::resource_class(SRV)]] res; + +// CHECK: FunctionDecl 0x{{[0-9a-f]+}} line:[[# @LINE + 3]]:6 f 'void () +// CHECK: VarDecl 0x{{[0-9a-f]+}} col:55 r '__hlsl_resource_t +// CHECK-SAME{LITERAL}: [[hlsl::resource_class(Sampler)]] +void f() { + __hlsl_resource_t [[hlsl::resource_class(Sampler)]] r; +} + +// CHECK: ClassTemplateDecl 0x{{[0-9a-f]+}} line:[[# @LINE + 6]]:29 MyBuffer2 +// CHECK: TemplateTypeParmDecl 0x{{[0-9a-f]+}} col:19 typename depth 0 index 0 T +// CHECK: CXXRecordDecl 0x{{[0-9a-f]+}} line:[[# @LINE + 4]]:29 struct MyBuffer2 definition +// CHECK: CXXRecordDecl 0x{{[0-9a-f]+}} col:29 implicit struct MyBuffer2 +// CHECK: FieldDecl 0x{{[0-9a-f]+}} col:51 h '__hlsl_resource_t +// CHECK-SAME{LITERAL}: [[hlsl::resource_class(UAV)]] +template<typename T> struct MyBuffer2 { + __hlsl_resource_t [[hlsl::resource_class(UAV)]] h; +}; + +// CHECK: ClassTemplateSpecializationDecl 0x{{[0-9a-f]+}} line:[[# @LINE - 4]]:29 struct MyBuffer2 definition implicit_instantiation +// CHECK: TemplateArgument type 'float' +// CHECK: BuiltinType 0x{{[0-9a-f]+}} 'float' +// CHECK: CXXRecordDecl 0x{{[0-9a-f]+}} col:29 implicit struct MyBuffer2 +// CHECK: FieldDecl 0x{{[0-9a-f]+}} col:51 h '__hlsl_resource_t +// CHECK-SAME{LITERAL}: [[hlsl::resource_class(UAV)]] +MyBuffer2<float> myBuffer2; diff --git a/clang/test/ParserHLSL/hlsl_resource_class_attr_error.hlsl b/clang/test/ParserHLSL/hlsl_resource_class_attr_error.hlsl index 63e39daff949b4a7291681c7ed8732a370aa9aff..a0a4da1dc2bf4464400cdd71ad08e9405545c4ae 100644 --- a/clang/test/ParserHLSL/hlsl_resource_class_attr_error.hlsl +++ b/clang/test/ParserHLSL/hlsl_resource_class_attr_error.hlsl @@ -1,22 +1,22 @@ -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -x hlsl -o - %s -verify - -// expected-error@+1{{'resource_class' attribute cannot be applied to a declaration}} -[[hlsl::resource_class(UAV)]] __hlsl_resource_t e0; - -// expected-error@+1{{'resource_class' attribute 
takes one argument}} -__hlsl_resource_t [[hlsl::resource_class()]] e1; - -// expected-warning@+1{{ResourceClass attribute argument not supported: gibberish}} -__hlsl_resource_t [[hlsl::resource_class(gibberish)]] e2; - -// expected-warning@+1{{attribute 'resource_class' is already applied with different arguments}} -__hlsl_resource_t [[hlsl::resource_class(SRV)]] [[hlsl::resource_class(UAV)]] e3; - -// expected-warning@+1{{attribute 'resource_class' is already applied}} -__hlsl_resource_t [[hlsl::resource_class(SRV)]] [[hlsl::resource_class(SRV)]] e4; - -// expected-error@+1{{'resource_class' attribute takes one argument}} -__hlsl_resource_t [[hlsl::resource_class(SRV, "aa")]] e5; - -// expected-error@+1{{attribute 'resource_class' can be used only on HLSL intangible type '__hlsl_resource_t'}} -float [[hlsl::resource_class(UAV)]] e6; +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -x hlsl -o - %s -verify + +// expected-error@+1{{'resource_class' attribute cannot be applied to a declaration}} +[[hlsl::resource_class(UAV)]] __hlsl_resource_t e0; + +// expected-error@+1{{'resource_class' attribute takes one argument}} +__hlsl_resource_t [[hlsl::resource_class()]] e1; + +// expected-warning@+1{{ResourceClass attribute argument not supported: gibberish}} +__hlsl_resource_t [[hlsl::resource_class(gibberish)]] e2; + +// expected-warning@+1{{attribute 'resource_class' is already applied with different arguments}} +__hlsl_resource_t [[hlsl::resource_class(SRV)]] [[hlsl::resource_class(UAV)]] e3; + +// expected-warning@+1{{attribute 'resource_class' is already applied}} +__hlsl_resource_t [[hlsl::resource_class(SRV)]] [[hlsl::resource_class(SRV)]] e4; + +// expected-error@+1{{'resource_class' attribute takes one argument}} +__hlsl_resource_t [[hlsl::resource_class(SRV, "aa")]] e5; + +// expected-error@+1{{attribute 'resource_class' can be used only on HLSL intangible type '__hlsl_resource_t'}} +float [[hlsl::resource_class(UAV)]] e6; diff --git a/clang/test/ParserHLSL/hlsl_resource_handle_attrs.hlsl b/clang/test/ParserHLSL/hlsl_resource_handle_attrs.hlsl index e7d19c3da7216d78a01b8eda94dbb2029e206f27..8885e39237357d24395d28011209710ab9ab9836 100644 --- a/clang/test/ParserHLSL/hlsl_resource_handle_attrs.hlsl +++ b/clang/test/ParserHLSL/hlsl_resource_handle_attrs.hlsl @@ -1,23 +1,21 @@ -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -x hlsl -ast-dump -o - %s | FileCheck %s - -// CHECK: -ClassTemplateSpecializationDecl 0x{{[0-9a-f]+}} <<invalid sloc>> <invalid sloc> class RWBuffer definition implicit_instantiation -// CHECK: -TemplateArgument type 'float' -// CHECK: `-BuiltinType 0x{{[0-9a-f]+}} 'float' -// CHECK: -FieldDecl 0x{{[0-9a-f]+}} <<invalid sloc>> <invalid sloc> implicit h '__hlsl_resource_t -// CHECK-SAME{LITERAL}: [[hlsl::resource_class(UAV)]] -// CHECK-SAME{LITERAL}: [[hlsl::contained_type(float)]] -// CHECK-SAME: ':'__hlsl_resource_t' -// CHECK: -HLSLResourceAttr 0x{{[0-9a-f]+}} <<invalid sloc>> Implicit TypedBuffer -RWBuffer<float> Buffer1; - -// CHECK: -ClassTemplateSpecializationDecl 0x{{[0-9a-f]+}} <<invalid sloc>> <invalid sloc> class RasterizerOrderedBuffer definition implicit_instantiation -// CHECK: -TemplateArgument type 'vector<float, 4>' -// CHECK: `-ExtVectorType 0x{{[0-9a-f]+}} 'vector<float, 4>' 4 -// CHECK: `-BuiltinType 0x{{[0-9a-f]+}} 'float' -// CHECK: -FieldDecl 0x{{[0-9a-f]+}} <<invalid sloc>> <invalid sloc> implicit h '__hlsl_resource_t -// CHECK-SAME{LITERAL}: [[hlsl::resource_class(UAV)] -// CHECK-SAME{LITERAL}: [[hlsl::is_rov]] -// CHECK-SAME{LITERAL}: [[hlsl::contained_type(vector<float, 4>)]] -// CHECK-SAME: ':'__hlsl_resource_t' -// CHECK: -HLSLResourceAttr 0x{{[0-9a-f]+}} <<invalid sloc>> Implicit TypedBuffer -RasterizerOrderedBuffer<vector<float, 4> > BufferArray3[4]; +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -x hlsl -ast-dump -o - %s | FileCheck %s + +// CHECK: -ClassTemplateSpecializationDecl 0x{{[0-9a-f]+}} <<invalid sloc>> <invalid sloc> class RWBuffer definition implicit_instantiation +// CHECK: -TemplateArgument type 'float' +// CHECK: `-BuiltinType 0x{{[0-9a-f]+}} 'float' +// CHECK: -FieldDecl 0x{{[0-9a-f]+}} <<invalid sloc>> <invalid sloc> implicit h '__hlsl_resource_t +// CHECK-SAME{LITERAL}: [[hlsl::resource_class(UAV)]] +// CHECK-SAME{LITERAL}: [[hlsl::contained_type(float)]] +// CHECK: -HLSLResourceAttr 0x{{[0-9a-f]+}} <<invalid sloc>> Implicit TypedBuffer +RWBuffer<float> Buffer1; + +// CHECK: -ClassTemplateSpecializationDecl 0x{{[0-9a-f]+}} <<invalid sloc>> <invalid sloc> class RasterizerOrderedBuffer definition implicit_instantiation +// CHECK: -TemplateArgument type 'vector<float, 4>' +// CHECK: `-ExtVectorType 0x{{[0-9a-f]+}} 'vector<float, 4>' 4 +// CHECK: `-BuiltinType 0x{{[0-9a-f]+}} 'float' +// CHECK: -FieldDecl 0x{{[0-9a-f]+}} <<invalid sloc>> <invalid sloc> implicit h '__hlsl_resource_t +// CHECK-SAME{LITERAL}: [[hlsl::resource_class(UAV)] +// CHECK-SAME{LITERAL}: [[hlsl::is_rov]] +// CHECK-SAME{LITERAL}: [[hlsl::contained_type(vector<float, 4>)]] +// CHECK: -HLSLResourceAttr 0x{{[0-9a-f]+}} <<invalid sloc>> Implicit TypedBuffer +RasterizerOrderedBuffer<vector<float, 4> > BufferArray3[4]; diff --git a/clang/test/Preprocessor/predefined-win-macros.c b/clang/test/Preprocessor/predefined-win-macros.c index 7d29e45c7d5ac6bda6f70cadaf92d18555b46597..8e539a2a1faf8319f290cd954c05e72450136740 100644 --- a/clang/test/Preprocessor/predefined-win-macros.c +++ b/clang/test/Preprocessor/predefined-win-macros.c @@ -56,7 +56,12 @@ // RUN: %clang_cc1 %s -x c++ -E -dM -triple i686-pc-win32 -fms-extensions -fms-compatibility \ // RUN: -fms-compatibility-version=19.00 -std=c++23 -o - | FileCheck -match-full-lines %s --check-prefix=CHECK-MS-CPP2B // CHECK-MS-CPP2B: #define _MSC_VER 1900 -// CHECK-MS-CPP2B: #define _MSVC_LANG 202004L +// CHECK-MS-CPP2B: #define _MSVC_LANG 202302L + +// RUN: %clang_cc1 %s -x c++ -E -dM -triple i686-pc-win32 -fms-extensions -fms-compatibility \ +// RUN: -fms-compatibility-version=19.00 -std=c++26 -o - | FileCheck -match-full-lines %s --check-prefix=CHECK-MS-CPP2C +// CHECK-MS-CPP2C: #define _MSC_VER 1900 +// CHECK-MS-CPP2C: #define _MSVC_LANG 202400L // RUN: %clang_cc1 -triple i386-windows %s -E -dM -o - \ // RUN: | FileCheck -match-full-lines %s --check-prefix=CHECK-X86-WIN diff --git a/clang/test/Sema/aarch64-neon-target.c b/clang/test/Sema/aarch64-neon-target.c index fd84b7f2eb00a11f977293c697ef8b0c3e932c17..07d763ec84bd12d1ce172c1480da38c332ea8265 100644 --- a/clang/test/Sema/aarch64-neon-target.c +++ b/clang/test/Sema/aarch64-neon-target.c @@ -58,7 +58,7 @@ __attribute__((target("arch=armv8.3-a+fp16"))) void test_v83(float32x4_t v4f32, float16x4_t v4f16, float64x2_t v2f64) { vcaddq_rot90_f32(v4f32, v4f32); vcmla_rot90_f16(v4f16, v4f16, v4f16); - vcmlaq_rot270_laneq_f64(v2f64, v2f64, v2f64, 1); + vcmlaq_rot270_f64(v2f64, v2f64, v2f64); } __attribute__((target("arch=armv8.5-a"))) @@ -95,7 +95,7 @@ void undefined(uint32x2_t v2i32, uint32x4_t v4i32, uint16x8_t v8i16, uint8x16_t // 8.3 - complex vcaddq_rot90_f32(v4f32, v4f32); // expected-error {{always_inline function 'vcaddq_rot90_f32' requires target feature 'v8.3a'}} vcmla_rot90_f16(v4f16, v4f16, v4f16); // expected-error {{always_inline function 'vcmla_rot90_f16' requires target feature 'v8.3a'}} - vcmlaq_rot270_laneq_f64(v2f64, v2f64, v2f64, 1); // expected-error {{always_inline function 'vcmlaq_rot270_f64' requires target feature 'v8.3a'}} + vcmlaq_rot270_f64(v2f64, v2f64, v2f64); // 
expected-error {{always_inline function 'vcmlaq_rot270_f64' requires target feature 'v8.3a'}} // 8.5 - frint vrnd32xq_f32(v4f32); // expected-error {{always_inline function 'vrnd32xq_f32' requires target feature 'v8.5a'}} diff --git a/clang/test/Sema/aarch64-sve-types.c b/clang/test/Sema/aarch64-sve-types.c index 4525e71de6ebae23c0fe0693fbea21536b2af59a..8eed11675a69d0f60d629140ac02a280e1b43dbb 100644 --- a/clang/test/Sema/aarch64-sve-types.c +++ b/clang/test/Sema/aarch64-sve-types.c @@ -37,6 +37,9 @@ void f(void) { int size_bf16[sizeof(__SVBfloat16_t) == 0 ? 1 : -1]; // expected-error {{invalid application of 'sizeof' to sizeless type '__SVBfloat16_t'}} int align_bf16[__alignof__(__SVBfloat16_t) == 16 ? 1 : -1]; // expected-error {{invalid application of '__alignof' to sizeless type '__SVBfloat16_t'}} + int size_mf8[sizeof(__SVMfloat8_t) == 0 ? 1 : -1]; // expected-error {{invalid application of 'sizeof' to sizeless type '__SVMfloat8_t'}} + int align_mf8[__alignof__(__SVMfloat8_t) == 16 ? 1 : -1]; // expected-error {{invalid application of '__alignof' to sizeless type '__SVMfloat8_t'}} + int size_b8[sizeof(__SVBool_t) == 0 ? 1 : -1]; // expected-error {{invalid application of 'sizeof' to sizeless type '__SVBool_t'}} int align_b8[__alignof__(__SVBool_t) == 2 ? 1 : -1]; // expected-error {{invalid application of '__alignof' to sizeless type '__SVBool_t'}} } diff --git a/clang/test/Sema/aarch64-sve-vector-trig-ops.c b/clang/test/Sema/aarch64-sve-vector-trig-ops.c index 3fe6834be2e0b7f0e395787152e40a4bcbf02d21..f853abcd3379fad61a59441974797051f783d792 100644 --- a/clang/test/Sema/aarch64-sve-vector-trig-ops.c +++ b/clang/test/Sema/aarch64-sve-vector-trig-ops.c @@ -1,65 +1,65 @@ -// RUN: %clang_cc1 -triple aarch64 -target-feature +sve \ -// RUN: -disable-O0-optnone -o - -fsyntax-only %s -verify -// REQUIRES: aarch64-registered-target - -#include <arm_sve.h> - -svfloat32_t test_asin_vv_i8mf8(svfloat32_t v) { - - return __builtin_elementwise_asin(v); - // expected-error@-1 {{1st argument must be a vector, integer or floating point type}} -} - -svfloat32_t test_acos_vv_i8mf8(svfloat32_t v) { - - return __builtin_elementwise_acos(v); - // expected-error@-1 {{1st argument must be a vector, integer or floating point type}} -} - -svfloat32_t test_atan_vv_i8mf8(svfloat32_t v) { - - return __builtin_elementwise_atan(v); - // expected-error@-1 {{1st argument must be a vector, integer or floating point type}} -} - -svfloat32_t test_atan2_vv_i8mf8(svfloat32_t v) { - - return __builtin_elementwise_atan2(v, v); - // expected-error@-1 {{1st argument must be a floating point type}} -} - -svfloat32_t test_sin_vv_i8mf8(svfloat32_t v) { - - return __builtin_elementwise_sin(v); - // expected-error@-1 {{1st argument must be a vector, integer or floating point type}} -} - -svfloat32_t test_cos_vv_i8mf8(svfloat32_t v) { - - return __builtin_elementwise_cos(v); - // expected-error@-1 {{1st argument must be a vector, integer or floating point type}} -} - -svfloat32_t test_tan_vv_i8mf8(svfloat32_t v) { - - return __builtin_elementwise_tan(v); - // expected-error@-1 {{1st argument must be a vector, integer or floating point type}} -} - -svfloat32_t test_sinh_vv_i8mf8(svfloat32_t v) { - - return __builtin_elementwise_sinh(v); - // expected-error@-1 {{1st argument must be a vector, integer or floating point type}} -} - -svfloat32_t test_cosh_vv_i8mf8(svfloat32_t v) { - - return __builtin_elementwise_cosh(v); - // expected-error@-1 {{1st argument must be a vector, integer or floating point type}} -} - -svfloat32_t test_tanh_vv_i8mf8(svfloat32_t v) { - - return __builtin_elementwise_tanh(v); - // expected-error@-1 {{1st argument must be a vector, integer or floating point type}} -} +// RUN: %clang_cc1 -triple aarch64 -target-feature +sve \ +// RUN: -disable-O0-optnone -o - -fsyntax-only %s -verify +// REQUIRES: aarch64-registered-target + +#include <arm_sve.h> + +svfloat32_t test_asin_vv_i8mf8(svfloat32_t v) { + + return __builtin_elementwise_asin(v); + // expected-error@-1 {{1st argument must be a vector, integer or floating point type}} +} + +svfloat32_t test_acos_vv_i8mf8(svfloat32_t v) { + + return __builtin_elementwise_acos(v); + // expected-error@-1 {{1st argument must be a vector, integer or floating point type}} +} + +svfloat32_t test_atan_vv_i8mf8(svfloat32_t v) { + + return __builtin_elementwise_atan(v); + // expected-error@-1 {{1st argument must be a vector, integer or floating point type}} +} + +svfloat32_t test_atan2_vv_i8mf8(svfloat32_t v) { + + return __builtin_elementwise_atan2(v, v); + // expected-error@-1 {{1st argument must be a floating point type}} +} + +svfloat32_t test_sin_vv_i8mf8(svfloat32_t v) { + + return __builtin_elementwise_sin(v); + // expected-error@-1 {{1st argument must be a vector, integer or floating point type}} +} + +svfloat32_t test_cos_vv_i8mf8(svfloat32_t v) { + + return __builtin_elementwise_cos(v); + // expected-error@-1 {{1st argument must be a vector, integer or floating point type}} +} + +svfloat32_t test_tan_vv_i8mf8(svfloat32_t v) { + + return __builtin_elementwise_tan(v); + // expected-error@-1 {{1st argument must be a vector, integer or floating point type}} +} + +svfloat32_t test_sinh_vv_i8mf8(svfloat32_t v) { + + return __builtin_elementwise_sinh(v); + // expected-error@-1 {{1st argument must be a vector, integer or floating point type}} +} + +svfloat32_t test_cosh_vv_i8mf8(svfloat32_t v) { + + return __builtin_elementwise_cosh(v); + // expected-error@-1 {{1st argument must be a vector, integer or floating point type}} +} + +svfloat32_t test_tanh_vv_i8mf8(svfloat32_t v) { + + return __builtin_elementwise_tanh(v); + // expected-error@-1 {{1st argument must be a vector, integer or floating point type}} +} diff --git a/clang/test/Sema/aarch64-vcmla-undef.c b/clang/test/Sema/aarch64-vcmla-undef.c new file mode 100644 index 0000000000000000000000000000000000000000..8a777ff615633941137c99c7e6d370d1ed4f1862 --- /dev/null +++ b/clang/test/Sema/aarch64-vcmla-undef.c @@ -0,0 +1,31 @@ +// RUN: %clang_cc1 -triple aarch64-linux-gnu -target-feature +neon -target-feature +v8.3a -ffreestanding -fsyntax-only -verify -verify-ignore-unexpected=note %s + +// REQUIRES: aarch64-registered-target + +#include <arm_neon.h> + +void test(float64x1_t v1f64, float64x2_t v2f64) { + vcmla_f64(v1f64, v1f64, v1f64); // expected-error {{call to undeclared function 'vcmla_f64'}} + vcmla_lane_f64(v1f64, v1f64, v1f64, 0); // expected-error {{call to undeclared function 'vcmla_lane_f64'}} + vcmla_laneq_f64(v1f64, v1f64, v2f64, 0); // expected-error {{call to undeclared function 'vcmla_laneq_f64'}} + vcmlaq_lane_f64(v2f64, v2f64, v1f64, 0); // expected-error {{call to undeclared function 'vcmlaq_lane_f64'}} + vcmlaq_laneq_f64(v2f64, v2f64, v2f64, 0); // expected-error {{call to undeclared function 'vcmlaq_laneq_f64'}} + + vcmla_rot90_f64(v1f64, v1f64, v1f64); // expected-error {{call to undeclared function 'vcmla_rot90_f64'}} + vcmla_rot90_lane_f64(v1f64, v1f64, v1f64, 0); // expected-error {{call to undeclared function 'vcmla_rot90_lane_f64'}} + vcmla_rot90_laneq_f64(v1f64, v1f64, v2f64, 0); // expected-error {{call to undeclared function 'vcmla_rot90_laneq_f64'}} + vcmlaq_rot90_lane_f64(v2f64, v2f64, v1f64, 0); // expected-error {{call to undeclared function 'vcmlaq_rot90_lane_f64'}} + vcmlaq_rot90_laneq_f64(v2f64, v2f64, v2f64, 0); // expected-error {{call to undeclared function 'vcmlaq_rot90_laneq_f64'}} + + vcmla_rot180_f64(v1f64, v1f64, v1f64); // expected-error {{call to undeclared function 'vcmla_rot180_f64'}} + vcmla_rot180_lane_f64(v1f64, v1f64, v1f64, 0); // expected-error {{call to undeclared function 'vcmla_rot180_lane_f64'}} + vcmla_rot180_laneq_f64(v1f64, v1f64, v2f64, 0); // expected-error {{call to undeclared function 'vcmla_rot180_laneq_f64'}} + vcmlaq_rot180_lane_f64(v2f64, v2f64, v1f64, 0); // expected-error {{call to undeclared function 'vcmlaq_rot180_lane_f64'}} + vcmlaq_rot180_laneq_f64(v2f64, v2f64, v2f64, 0); // expected-error {{call to undeclared function 'vcmlaq_rot180_laneq_f64'}} + + vcmla_rot270_f64(v1f64, v1f64, v1f64); // expected-error {{call to undeclared function 'vcmla_rot270_f64'}} + vcmla_rot270_lane_f64(v1f64, v1f64, v1f64, 0); // expected-error {{call to undeclared function 'vcmla_rot270_lane_f64'}} + vcmla_rot270_laneq_f64(v1f64, v1f64, v2f64, 0); // expected-error {{call to undeclared function 'vcmla_rot270_laneq_f64'}} + vcmlaq_rot270_lane_f64(v2f64, v2f64, v1f64, 0); // expected-error {{call to undeclared function 'vcmlaq_rot270_lane_f64'}} + vcmlaq_rot270_laneq_f64(v1f64, v1f64, v2f64, 0); // expected-error {{call to undeclared function 'vcmlaq_rot270_laneq_f64'}} +} diff --git a/clang/test/Sema/arm-mfp8.cpp b/clang/test/Sema/arm-mfp8.cpp new file mode 100644 index 0000000000000000000000000000000000000000..b1509c542473a5eed93f7857c032642afc943378 --- /dev/null +++ b/clang/test/Sema/arm-mfp8.cpp @@ -0,0 +1,13 @@ +// RUN: %clang_cc1 -fsyntax-only -verify=sve -triple aarch64-arm-none-eabi \ +// RUN: -target-feature -fp8 -target-feature +sve %s + +// REQUIRES: aarch64-registered-target + +#include <arm_sve.h> +void test_vector_sve(svmfloat8_t a, svuint8_t c) { + a + c; // sve-error {{cannot convert between vector type 'svuint8_t' (aka '__SVUint8_t') and vector type 'svmfloat8_t' (aka '__SVMfloat8_t') as implicit conversion would cause truncation}} + a - c; // sve-error {{cannot convert between vector type 'svuint8_t' (aka '__SVUint8_t') and vector type 'svmfloat8_t' (aka '__SVMfloat8_t') as implicit conversion would cause truncation}} + a * c; // sve-error {{cannot convert between vector type 'svuint8_t' (aka '__SVUint8_t') and vector type 'svmfloat8_t' (aka '__SVMfloat8_t') as implicit conversion would cause truncation}} + a / c; // sve-error {{cannot convert between vector type 'svuint8_t' (aka '__SVUint8_t') and vector type 'svmfloat8_t' (aka '__SVMfloat8_t') as implicit conversion would cause truncation}} +} + diff --git a/clang/test/Sema/constexpr.c b/clang/test/Sema/constexpr.c index 0cf9491c4a42bf76795c7ac6c3ecfa86dfae49d6..eaa000b3b97758f508045f0f743e76579e973491 100644 --- a/clang/test/Sema/constexpr.c +++ b/clang/test/Sema/constexpr.c @@ -367,3 +367,10 @@ struct S10 { constexpr struct S10 c = { 255 }; // FIXME-expected-error@-1 {{constexpr initializer evaluates to 255 which is not exactly representable in 'long long' bit-field with width 8}} // See: GH#101299 + +void constexprif() { + if constexpr (300) {} //expected-error {{expected '(' after 'if'}} +} +void constevalif() { + if consteval (300) {} //expected-error {{expected '(' after 'if'}} +} diff --git a/clang/test/Sema/riscv-rvv-vector-trig-ops.c b/clang/test/Sema/riscv-rvv-vector-trig-ops.c index 0aed1b2a09986550c569f6622deee56aecc512df..006c136f80332c1291a61478d5bd16531b269acb 100644 --- a/clang/test/Sema/riscv-rvv-vector-trig-ops.c +++ b/clang/test/Sema/riscv-rvv-vector-trig-ops.c @@ -1,67 +1,67 @@ -// RUN: %clang_cc1 -triple riscv64 -target-feature +f -target-feature +d \ -// RUN: -target-feature +v -target-feature +zfh -target-feature +zvfh \ -// RUN: -disable-O0-optnone -o - -fsyntax-only %s -verify -// REQUIRES: riscv-registered-target - -#include <riscv_vector.h> - -vfloat32mf2_t test_asin_vv_i8mf8(vfloat32mf2_t v) { - - return __builtin_elementwise_asin(v); - // expected-error@-1 {{1st argument must be a vector, integer or floating point type}} - } - - vfloat32mf2_t test_acos_vv_i8mf8(vfloat32mf2_t v) { - - return __builtin_elementwise_acos(v); - // expected-error@-1 {{1st argument must be a vector, integer or floating point type}} - } - - vfloat32mf2_t test_atan_vv_i8mf8(vfloat32mf2_t v) { - - return __builtin_elementwise_atan(v); - // expected-error@-1 {{1st argument must be a vector, integer or floating point type}} - } - -vfloat32mf2_t test_atan2_vv_i8mf8(vfloat32mf2_t v) { - - return __builtin_elementwise_atan2(v, v); - // expected-error@-1 {{1st argument must be a floating point type}} -} - -vfloat32mf2_t test_sin_vv_i8mf8(vfloat32mf2_t v) { - - return __builtin_elementwise_sin(v); - // expected-error@-1 {{1st argument must be a vector, integer or floating point type}} -} - -vfloat32mf2_t test_cos_vv_i8mf8(vfloat32mf2_t v) { - - return __builtin_elementwise_cos(v); - // expected-error@-1 {{1st argument must be a vector, integer or floating point type}} -} - -vfloat32mf2_t test_tan_vv_i8mf8(vfloat32mf2_t v) { - - return __builtin_elementwise_tan(v); - // expected-error@-1 {{1st argument must be a vector, integer or floating point type}} -} - -vfloat32mf2_t test_sinh_vv_i8mf8(vfloat32mf2_t v) { - - return __builtin_elementwise_sinh(v); - // expected-error@-1 {{1st argument must be a vector, integer or floating point type}} - } - - vfloat32mf2_t test_cosh_vv_i8mf8(vfloat32mf2_t v) { - - return __builtin_elementwise_cosh(v); - // expected-error@-1 {{1st argument must be a vector, integer or floating point type}} - } - - vfloat32mf2_t test_tanh_vv_i8mf8(vfloat32mf2_t v) { - - return __builtin_elementwise_tanh(v); - // expected-error@-1 {{1st argument must be a vector, integer or floating point type}} - } - +// RUN: %clang_cc1 -triple riscv64 -target-feature +f -target-feature +d \ +// RUN: -target-feature +v -target-feature +zfh -target-feature +zvfh \ +// RUN: -disable-O0-optnone -o - -fsyntax-only %s -verify +// REQUIRES: riscv-registered-target + +#include <riscv_vector.h> + +vfloat32mf2_t test_asin_vv_i8mf8(vfloat32mf2_t v) { + + return __builtin_elementwise_asin(v); + // expected-error@-1 {{1st argument must be a vector, integer or floating point type}} + } + + vfloat32mf2_t test_acos_vv_i8mf8(vfloat32mf2_t v) { + + return __builtin_elementwise_acos(v); + // expected-error@-1 {{1st argument must be a vector, integer or floating point type}} + } + + vfloat32mf2_t test_atan_vv_i8mf8(vfloat32mf2_t v) { + + return __builtin_elementwise_atan(v); + // expected-error@-1 {{1st argument must be a vector, integer or floating point type}} + } + +vfloat32mf2_t test_atan2_vv_i8mf8(vfloat32mf2_t v) { + + return __builtin_elementwise_atan2(v, v); + // expected-error@-1 {{1st argument must be a floating point type}} +} + +vfloat32mf2_t test_sin_vv_i8mf8(vfloat32mf2_t v) { + + return __builtin_elementwise_sin(v); + // expected-error@-1 {{1st argument must be a vector, integer or floating point type}} +} + 
+vfloat32mf2_t test_cos_vv_i8mf8(vfloat32mf2_t v) { + + return __builtin_elementwise_cos(v); + // expected-error@-1 {{1st argument must be a vector, integer or floating point type}} +} + +vfloat32mf2_t test_tan_vv_i8mf8(vfloat32mf2_t v) { + + return __builtin_elementwise_tan(v); + // expected-error@-1 {{1st argument must be a vector, integer or floating point type}} +} + +vfloat32mf2_t test_sinh_vv_i8mf8(vfloat32mf2_t v) { + + return __builtin_elementwise_sinh(v); + // expected-error@-1 {{1st argument must be a vector, integer or floating point type}} + } + + vfloat32mf2_t test_cosh_vv_i8mf8(vfloat32mf2_t v) { + + return __builtin_elementwise_cosh(v); + // expected-error@-1 {{1st argument must be a vector, integer or floating point type}} + } + + vfloat32mf2_t test_tanh_vv_i8mf8(vfloat32mf2_t v) { + + return __builtin_elementwise_tanh(v); + // expected-error@-1 {{1st argument must be a vector, integer or floating point type}} + } + diff --git a/clang/test/SemaCXX/attr-lifetimebound.cpp b/clang/test/SemaCXX/attr-lifetimebound.cpp index bdc58171917375120614a319405be60f15d24d99..d04bbb32433fb50b39fde72ca5783ce81b28d2e3 100644 --- a/clang/test/SemaCXX/attr-lifetimebound.cpp +++ b/clang/test/SemaCXX/attr-lifetimebound.cpp @@ -107,6 +107,37 @@ namespace std { using std::operator""s; using std::operator""sv; +namespace default_args { + using IntArray = int[]; + const int *defaultparam1(const int &def1 [[clang::lifetimebound]] = 0); // #def1 + const int &defaultparam_array([[clang::lifetimebound]] const int *p = IntArray{1, 2, 3}); // #def2 + struct A { + A(const char *, const int &def3 [[clang::lifetimebound]] = 0); // #def3 + }; + const int &defaultparam2(const int &def4 [[clang::lifetimebound]] = 0); // #def4 + const int &defaultparam3(const int &def5 [[clang::lifetimebound]] = defaultparam2(), const int &def6 [[clang::lifetimebound]] = 0); // #def5 #def6 + std::string_view defaultparam4(std::string_view s [[clang::lifetimebound]] = std::string()); // #def7 + + const int *test_default_args() { + const int *c = defaultparam1(); // expected-warning {{temporary whose address is used as value of local variable 'c' will be destroyed at the end of the full-expression}} expected-note@#def1 {{initializing parameter 'def1' with default argument}} + A a = A(""); // expected-warning {{temporary whose address is used as value of local variable 'a' will be destroyed at the end of the full-expression}} expected-note@#def3 {{initializing parameter 'def3' with default argument}} + const int &s = defaultparam2(); // expected-warning {{temporary bound to local reference 's' will be destroyed at the end of the full-expression}} expected-note@#def4 {{initializing parameter 'def4' with default argument}} + const int &t = defaultparam3(); // expected-warning {{temporary bound to local reference 't' will be destroyed at the end of the full-expression}} expected-note@#def4 {{initializing parameter 'def4' with default argument}} expected-note@#def5 {{initializing parameter 'def5' with default argument}} expected-warning {{temporary bound to local reference 't' will be destroyed at the end of the full-expression}} expected-note@#def6 {{initializing parameter 'def6' with default argument}} + const int &u = defaultparam_array(); // expected-warning {{temporary bound to local reference 'u' will be destroyed at the end of the full-expression}} expected-note@#def2 {{initializing parameter 'p' with default argument}} + int local; + const int &v = defaultparam2(local); // no warning + const int &w = defaultparam2(1); // 
expected-warning {{temporary bound to local reference 'w' will be destroyed at the end of the full-expression}} + if (false) { + return &defaultparam2(); // expected-warning {{returning address of local temporary object}} + } + if (false) { + return &defaultparam2(0); // expected-warning {{returning address of local temporary object}} expected-note@#def4 {{initializing parameter 'def4' with default argument}} + } + std::string_view sv = defaultparam4(); // expected-warning {{temporary whose address is used as value of local variable 'sv' will be destroyed at the end of the full-expression}} expected-note@#def7 {{initializing parameter 's' with default argument}} + return nullptr; + } +} // namespace default_args + namespace p0936r0_examples { std::string_view s = "foo"s; // expected-warning {{temporary}} diff --git a/clang/test/SemaCXX/constexpr-builtin-bit-cast.cpp b/clang/test/SemaCXX/constexpr-builtin-bit-cast.cpp index 7520b43a194aba6860febbac01f0ad2858abbaa0..5ddb77b35ff145edb370480a9520593830dc6aab 100644 --- a/clang/test/SemaCXX/constexpr-builtin-bit-cast.cpp +++ b/clang/test/SemaCXX/constexpr-builtin-bit-cast.cpp @@ -511,3 +511,19 @@ constexpr bool9 bad_short_to_bool9 = __builtin_bit_cast(bool9, static_cast(0xCAFEBABE0C05FEFEULL), ""); + constexpr _Complex unsigned test_int_complex = {0x0C05FEFE, 0xCAFEBABE}; + static_assert(bit_cast<unsigned long long>(test_int_complex) == (LITTLE_END + ? 0xCAFEBABE0C05FEFE + : 0x0C05FEFECAFEBABE), ""); + static_assert(sizeof(double) == 2 * sizeof(float)); + struct TwoFloats { float A; float B; }; + constexpr _Complex float test_float_complex = {1.0f, 2.0f}; + constexpr TwoFloats TF = __builtin_bit_cast(TwoFloats, test_float_complex); + static_assert(TF.A == 1.0f && TF.B == 2.0f); + + constexpr double D = __builtin_bit_cast(double, test_float_complex); + constexpr int M = __builtin_bit_cast(int, test_int_complex); // expected-error {{__builtin_bit_cast source size does not equal destination size}} +} diff --git a/clang/test/SemaCXX/typeid-ref.cpp b/clang/test/SemaCXX/typeid-ref.cpp index f788b04077ecac793013a1d31e454977f11072ac..025816c42512e14445c9b0624f1f310e25d6de9f 100644 --- a/clang/test/SemaCXX/typeid-ref.cpp +++ b/clang/test/SemaCXX/typeid-ref.cpp @@ -6,7 +6,7 @@ namespace std { struct X { }; void f() { - // CHECK: @_ZTS1X = linkonce_odr {{(dso_local |hidden )?}}constant // CHECK: @_ZTI1X = linkonce_odr {{(dso_local |hidden )?}}constant + // CHECK: @_ZTS1X = linkonce_odr {{(dso_local |hidden )?}}constant (void)typeid(X&); } diff --git a/clang/test/SemaCXX/virtual-override.cpp b/clang/test/SemaCXX/virtual-override.cpp index 72abfc3cf51e1f73ecee412f84c65f4cd0dfe0bd..ce6dd35e0b56fa3fb2e846cc68098228c627332a 100644 --- a/clang/test/SemaCXX/virtual-override.cpp +++ b/clang/test/SemaCXX/virtual-override.cpp @@ -19,10 +19,12 @@ struct b { }; class A { virtual a* f(); // expected-note{{overridden virtual function is here}} + virtual int *g(); // expected-note{{overridden virtual function is here}} }; class B : A { virtual b* f(); // expected-error{{return type of virtual function 'f' is not covariant with the return type of the function it overrides ('b *' is not derived from 'a *')}} + virtual char *g(); // expected-error{{virtual function 'g' has a different return type ('char *') than the function it overrides (which has return type 'int *')}} }; } @@ -81,13 +83,53 @@ namespace T6 { struct a { }; class A { - virtual const a* f(); - virtual a* g(); // expected-note{{overridden virtual function is here}} + // Classes. 
+ virtual const a* const_vs_unqualified_class(); + virtual a* unqualified_vs_const_class(); // expected-note{{overridden virtual function is here}} + + virtual volatile a* volatile_vs_unqualified_class(); + virtual a* unqualified_vs_volatile_class(); // expected-note{{overridden virtual function is here}} + + virtual const a* const_vs_volatile_class(); // expected-note{{overridden virtual function is here}} + virtual volatile a* volatile_vs_const_class(); // expected-note{{overridden virtual function is here}} + + virtual const volatile a* const_volatile_vs_const_class(); + virtual const a* const_vs_const_volatile_class(); // expected-note{{overridden virtual function is here}} + + virtual const volatile a* const_volatile_vs_volatile_class(); + virtual volatile a* volatile_vs_const_volatile_class(); // expected-note{{overridden virtual function is here}} + + virtual const volatile a* const_volatile_vs_unqualified_class(); + virtual a* unqualified_vs_const_volatile_class(); // expected-note{{overridden virtual function is here}} + + // Non Classes. + virtual const int* const_vs_unqualified_non_class(); // expected-note{{overridden virtual function is here}} + virtual int* unqualified_vs_const_non_class(); // expected-note{{overridden virtual function is here}} }; class B : A { - virtual a* f(); - virtual const a* g(); // expected-error{{return type of virtual function 'g' is not covariant with the return type of the function it overrides (class type 'const a *' is more qualified than class type 'a *'}} + // Classes. + a* const_vs_unqualified_class() override; + const a* unqualified_vs_const_class() override; // expected-error{{return type of virtual function 'unqualified_vs_const_class' is not covariant with the return type of the function it overrides (class type 'const a *' does not have the same cv-qualification as or less cv-qualification than class type 'a *')}} + + a* volatile_vs_unqualified_class() override; + volatile a* unqualified_vs_volatile_class() override; // expected-error{{return type of virtual function 'unqualified_vs_volatile_class' is not covariant with the return type of the function it overrides (class type 'volatile a *' does not have the same cv-qualification as or less cv-qualification than class type 'a *')}} + + volatile a* const_vs_volatile_class() override; // expected-error{{return type of virtual function 'const_vs_volatile_class' is not covariant with the return type of the function it overrides (class type 'volatile a *' does not have the same cv-qualification as or less cv-qualification than class type 'const a *')}} + const a* volatile_vs_const_class() override; // expected-error{{return type of virtual function 'volatile_vs_const_class' is not covariant with the return type of the function it overrides (class type 'const a *' does not have the same cv-qualification as or less cv-qualification than class type 'volatile a *')}} + + const a* const_volatile_vs_const_class() override; + const volatile a* const_vs_const_volatile_class() override; // expected-error{{return type of virtual function 'const_vs_const_volatile_class' is not covariant with the return type of the function it overrides (class type 'const volatile a *' does not have the same cv-qualification as or less cv-qualification than class type 'const a *')}} + + volatile a* const_volatile_vs_volatile_class() override; + const volatile a* volatile_vs_const_volatile_class() override; // expected-error{{return type of virtual function 'volatile_vs_const_volatile_class' is not covariant with the 
return type of the function it overrides (class type 'const volatile a *' does not have the same cv-qualification as or less cv-qualification than class type 'volatile a *')}} + + a* const_volatile_vs_unqualified_class() override; + const volatile a* unqualified_vs_const_volatile_class() override; // expected-error{{return type of virtual function 'unqualified_vs_const_volatile_class' is not covariant with the return type of the function it overrides (class type 'const volatile a *' does not have the same cv-qualification as or less cv-qualification than class type 'a *')}} + + // Non Classes. + int* const_vs_unqualified_non_class() override; // expected-error{{virtual function 'const_vs_unqualified_non_class' has a different return type ('int *') than the function it overrides (which has return type 'const int *')}} + const int* unqualified_vs_const_non_class() override; // expected-error{{virtual function 'unqualified_vs_const_non_class' has a different return type ('const int *') than the function it overrides (which has return type 'int *')}} }; } diff --git a/clang/test/SemaCXX/warn-unsafe-buffer-usage-warning-data-invocation.cpp b/clang/test/SemaCXX/warn-unsafe-buffer-usage-warning-data-invocation.cpp index 08707d7ff545d8c48c3e4a1db943cb91d3edc1d9..0228e42652bd95eb8101d7e60252455ba4556cb7 100644 --- a/clang/test/SemaCXX/warn-unsafe-buffer-usage-warning-data-invocation.cpp +++ b/clang/test/SemaCXX/warn-unsafe-buffer-usage-warning-data-invocation.cpp @@ -32,38 +32,68 @@ void foo(int *p){} namespace std{ template <typename T> class span { - T *elements; + T *elements; - span(T *, unsigned){} + span(T *, unsigned){} - public: + public: - constexpr span<T> subspan(size_t offset, size_t count) const { - return span<T> (elements+offset, count); // expected-warning{{unsafe pointer arithmetic}} - } + constexpr span<T> subspan(size_t offset, size_t count) const { + return span<T> (elements+offset, count); // expected-warning{{unsafe pointer arithmetic}} + } - constexpr T* data() const noexcept { - return elements; - } + constexpr T* data() const noexcept { + return elements; + } + + constexpr T* hello() const noexcept { + return elements; + } + }; + + template <typename T> class vector { + + T *elements; + + public: + + vector(size_t n) { + elements = new T[n]; + } + + constexpr T* data() const noexcept { + return elements; + } + + ~vector() { + delete[] elements; + } + }; + + template <typename T, size_t N> + class array { + T elements[N]; + + public: + + constexpr const T* data() const noexcept { + return elements; + } + + }; - - constexpr T* hello() const noexcept { - return elements; - } -}; - template <typename T> class span_duplicate { - span_duplicate(T *, unsigned){} + span_duplicate(T *, unsigned){} - T array[10]; + T array[10]; - public: + public: - T* data() { - return array; - } + T* data() { + return array; + } -}; + }; } using namespace std; @@ -89,21 +119,28 @@ void cast_without_data(int *ptr) { float *p = (float*) ptr; } -void warned_patterns(std::span<int> span_ptr, std::span<Base> base_span, span<int> span_without_qual) { - A *a1 = (A*)span_ptr.data(); // expected-warning{{unsafe invocation of span::data}} - a1 = (A*)span_ptr.data(); // expected-warning{{unsafe invocation of span::data}} +void warned_patterns_span(std::span<int> span_ptr, std::span<Base> base_span, span<int> span_without_qual) { + A *a1 = (A*)span_ptr.data(); // expected-warning{{unsafe invocation of 'data'}} + a1 = (A*)span_ptr.data(); // expected-warning{{unsafe invocation of 'data'}} - a1 = (A*)(span_ptr.data()); // expected-warning{{unsafe invocation of span::data}} + a1 = (A*)(span_ptr.data()); // expected-warning{{unsafe invocation of 'data'}} + A *a2 = (A*) (span_without_qual.data()); // expected-warning{{unsafe invocation of 'data'}} - A *a2 = (A*) (span_without_qual.data()); // expected-warning{{unsafe invocation of span::data}} + a2 = (A*) span_without_qual.data(); // expected-warning{{unsafe invocation of 'data'}} // TODO:: Should we warn when we cast from base to derived type? - Derived *b = dynamic_cast<Derived *> (base_span.data());// expected-warning{{unsafe invocation of span::data}} + Derived *b = dynamic_cast<Derived *> (base_span.data());// expected-warning{{unsafe invocation of 'data'}} // TODO:: This pattern is safe. We can add special handling for it, if we decide this // is the recommended fixit for the unsafe invocations. - A *a3 = (A*)span_ptr.subspan(0, sizeof(A)).data(); // expected-warning{{unsafe invocation of span::data}} + A *a3 = (A*)span_ptr.subspan(0, sizeof(A)).data(); // expected-warning{{unsafe invocation of 'data'}} +} + +void warned_patterns_array(std::array array_ptr, std::array base_span, span<int> span_without_qual) { + const A *a1 = (A*)array_ptr.data(); // expected-warning{{unsafe invocation of 'data'}} + a1 = (A*)array_ptr.data(); // expected-warning{{unsafe invocation of 'data'}} + + a1 = (A*)(array_ptr.data()); // expected-warning{{unsafe invocation of 'data'}} } void not_warned_patterns(std::span<int> span_ptr, std::span<Base> base_span) { diff --git a/clang/test/SemaHLSL/Availability/avail-diag-default-compute.hlsl b/clang/test/SemaHLSL/Availability/avail-diag-default-compute.hlsl index 764b9e843f7f1c0aaff5b1ef7507802926ac2022..b60fba62bdb000ab788fafdd29f667ab7d29bbee 100644 --- a/clang/test/SemaHLSL/Availability/avail-diag-default-compute.hlsl +++ b/clang/test/SemaHLSL/Availability/avail-diag-default-compute.hlsl @@ -1,119 +1,119 @@ -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute \ -// RUN: -fsyntax-only -verify %s - -__attribute__((availability(shadermodel, introduced = 6.5))) -float fx(float); // #fx - -__attribute__((availability(shadermodel, introduced = 6.6))) -half fx(half); // #fx_half - -__attribute__((availability(shadermodel, introduced = 5.0, environment = pixel))) -__attribute__((availability(shadermodel, introduced = 6.5, environment = compute))) -float fy(float); // #fy - -__attribute__((availability(shadermodel, introduced = 5.0, environment = pixel))) -__attribute__((availability(shadermodel, introduced = 6.5, environment = mesh))) -float fz(float); // #fz - -float also_alive(float f) { - // expected-error@#also_alive_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} - // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} - float A = fx(f); // #also_alive_fx_call - // expected-error@#also_alive_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}} - // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}} - float B = fy(f); // #also_alive_fy_call - // expected-error@#also_alive_fz_call {{'fz' is unavailable}} - // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}} - float C = fz(f); // #also_alive_fz_call - return 0; -} - -float alive(float f) { - // expected-error@#alive_fx_call {{'fx' is only 
available on Shader Model 6.5 or newer}} - // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} - float A = fx(f); // #alive_fx_call - // expected-error@#alive_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}} - // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}} - float B = fy(f); // #alive_fy_call - // expected-error@#alive_fz_call {{'fz' is unavailable}} - // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}} - float C = fz(f); // #alive_fz_call - - return also_alive(f); -} - -float also_dead(float f) { - // unreachable code - no errors expected - float A = fx(f); - float B = fy(f); - float C = fz(f); - return 0; -} - -float dead(float f) { - // unreachable code - no errors expected - float A = fx(f); - float B = fy(f); - float C = fz(f); - - return also_dead(f); -} - -template <typename T> -T aliveTemp(T f) { - // expected-error@#aliveTemp_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} - // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} - float A = fx(f); // #aliveTemp_fx_call - // expected-error@#aliveTemp_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}} - // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}} - float B = fy(f); // #aliveTemp_fy_call - // expected-error@#aliveTemp_fz_call {{'fz' is unavailable}} - // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}} - float C = fz(f); // #aliveTemp_fz_call - return 0; -} - -template <typename T> T aliveTemp2(T f) { - // expected-error@#aliveTemp2_fx_call {{'fx' is only available on Shader Model 6.6 or newer}} - // expected-note@#fx_half {{'fx' has been marked as being introduced in Shader Model 6.6 here, but the deployment target is Shader Model 6.0}} - // expected-error@#aliveTemp2_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} - // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} - return fx(f); // #aliveTemp2_fx_call -} - -half test(half x) { - return aliveTemp2(x); -} - -float test(float x) { - return aliveTemp2(x); -} - -class MyClass -{ - float F; - float makeF() { - // expected-error@#MyClass_makeF_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} - // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} - float A = fx(F); // #MyClass_makeF_fx_call - // expected-error@#MyClass_makeF_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}} - // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}} - float B = fy(F); // #MyClass_makeF_fy_call - // expected-error@#MyClass_makeF_fz_call {{'fz' is unavailable}} - // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}} - float C = fz(F); // #MyClass_makeF_fz_call - return 0; - } -}; - -[numthreads(4,1,1)] -float main() { - float f = 3; - MyClass C = { 1.0f }; - float a = alive(f); - float b = aliveTemp(f); // #aliveTemp_inst - float c = C.makeF(); - float d = test((float)1.0); - float e = test((half)1.0); - return a * b * c; -} +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute \ +// RUN: -fsyntax-only -verify %s + +__attribute__((availability(shadermodel, introduced = 6.5))) +float fx(float); // #fx + +__attribute__((availability(shadermodel, introduced = 6.6))) +half fx(half); // #fx_half + +__attribute__((availability(shadermodel, introduced = 5.0, environment = pixel))) +__attribute__((availability(shadermodel, introduced = 6.5, environment = compute))) +float fy(float); // #fy + +__attribute__((availability(shadermodel, introduced = 5.0, environment = pixel))) +__attribute__((availability(shadermodel, introduced = 6.5, environment = mesh))) +float fz(float); // #fz + +float also_alive(float f) { + // expected-error@#also_alive_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} + // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} + float A = fx(f); // #also_alive_fx_call + // expected-error@#also_alive_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}} + // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}} + float B = fy(f); // #also_alive_fy_call + // expected-error@#also_alive_fz_call {{'fz' is unavailable}} + // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}} + float C = fz(f); // #also_alive_fz_call + return 0; +} + +float alive(float f) { + // expected-error@#alive_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} + // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} + float A = fx(f); // #alive_fx_call + // expected-error@#alive_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}} + // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}} + float B = fy(f); // #alive_fy_call + // expected-error@#alive_fz_call {{'fz' is unavailable}} + // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}} + float C = fz(f); // #alive_fz_call + + return also_alive(f); +} + +float also_dead(float f) { + // unreachable code - no errors expected + float A = fx(f); + float B = fy(f); + float C = fz(f); + return 0; +} + +float dead(float f) { + // unreachable code - no errors expected + float A = fx(f); + float B = fy(f); + float C = fz(f); + + return also_dead(f); +} + +template <typename T> +T aliveTemp(T f) { + // expected-error@#aliveTemp_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} + // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} + float A = fx(f); // #aliveTemp_fx_call + // expected-error@#aliveTemp_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}} + // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}} + float B = fy(f); // #aliveTemp_fy_call + // expected-error@#aliveTemp_fz_call {{'fz' is unavailable}} + // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}} + float C = fz(f); // #aliveTemp_fz_call + return 0; +} + +template <typename T> T aliveTemp2(T f) { + // expected-error@#aliveTemp2_fx_call {{'fx' is only available on Shader Model 6.6 or newer}} + // expected-note@#fx_half {{'fx' has been marked as being introduced in Shader Model 6.6 here, but the deployment target is Shader Model 6.0}} + // expected-error@#aliveTemp2_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} + // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} + return fx(f); // #aliveTemp2_fx_call +} + +half test(half x) { + return aliveTemp2(x); +} + +float test(float x) { + return aliveTemp2(x); +} + +class MyClass +{ + float F; + float makeF() { + // expected-error@#MyClass_makeF_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} + // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} + float A = fx(F); // #MyClass_makeF_fx_call + // expected-error@#MyClass_makeF_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}} + // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}} + float B = fy(F); // #MyClass_makeF_fy_call + // expected-error@#MyClass_makeF_fz_call {{'fz' is unavailable}} + // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}} + float C = fz(F); // #MyClass_makeF_fz_call + return 0; + } +}; + +[numthreads(4,1,1)] +float main() { + float f = 3; + MyClass C = { 1.0f }; + float a = alive(f); + float b = aliveTemp(f); // #aliveTemp_inst + float c = C.makeF(); + float d = test((float)1.0); + float e = test((half)1.0); + return a * b * c; +} diff --git a/clang/test/SemaHLSL/Availability/avail-diag-default-lib.hlsl b/clang/test/SemaHLSL/Availability/avail-diag-default-lib.hlsl index 6bfc8577670cc718a1158b46238fb810e0d0c969..35b7c384f26cdd4a3a9e44f525fa1bd25bb116a6 100644 --- a/clang/test/SemaHLSL/Availability/avail-diag-default-lib.hlsl +++ b/clang/test/SemaHLSL/Availability/avail-diag-default-lib.hlsl @@ -1,180 +1,180 @@ -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-library \ -// RUN: -fsyntax-only -verify %s - -__attribute__((availability(shadermodel, introduced = 6.5))) -float fx(float); // #fx - -__attribute__((availability(shadermodel, introduced = 6.6))) -half fx(half); // #fx_half - -__attribute__((availability(shadermodel, introduced = 5.0, environment = pixel))) -__attribute__((availability(shadermodel, introduced = 6.5, environment = compute))) -float fy(float); // #fy - -__attribute__((availability(shadermodel, introduced = 5.0, environment = pixel))) -__attribute__((availability(shadermodel, introduced = 6.5, environment = mesh))) -float fz(float); // #fz - -float also_alive(float f) { - // expected-error@#also_alive_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} - // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} - float A = fx(f); // #also_alive_fx_call - - // expected-error@#also_alive_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}} - // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}} - float B = fy(f); // #also_alive_fy_call - - // expected-error@#also_alive_fz_call {{'fz' is unavailable}} - // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}} - float C = fz(f); // #also_alive_fz_call - - return 0; -} - -float alive(float f) { - // expected-error@#alive_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} - // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} - float A = fx(f); // #alive_fx_call - - // expected-error@#alive_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}} - // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}} - float B = fy(f); // #alive_fy_call - - // expected-error@#alive_fz_call {{'fz' is unavailable}} - // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}} - float C = fz(f); // #alive_fz_call - - return also_alive(f); -} - -float also_dead(float f) { - // unreachable code - no errors expected - float A = fx(f); - float B = fy(f); - float C = fz(f); - return 0; -} - -float dead(float f) { - // unreachable code - no errors expected - float A = fx(f); - float B = fy(f); - float C = fz(f); - return also_dead(f); -} - -template <typename T> -T aliveTemp(T f) { - // expected-error@#aliveTemp_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} - // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} - float A = fx(f); // #aliveTemp_fx_call - // expected-error@#aliveTemp_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}} - // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}} - float B = fy(f); // #aliveTemp_fy_call - // expected-error@#aliveTemp_fz_call {{'fz' is unavailable}} - // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}} - float C = fz(f); // #aliveTemp_fz_call - return 0; -} - -template <typename T> T aliveTemp2(T f) { - // 
expected-error@#aliveTemp2_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} - // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} - return fx(f); // #aliveTemp2_fx_call -} - -half test(half x) { - return aliveTemp2(x); -} - -float test(float x) { - return aliveTemp2(x); -} - -class MyClass -{ - float F; - float makeF() { - // expected-error@#MyClass_makeF_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} - // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} - float A = fx(F); // #MyClass_makeF_fx_call - // expected-error@#MyClass_makeF_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}} - // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}} - float B = fy(F); // #MyClass_makeF_fy_call - // expected-error@#MyClass_makeF_fz_call {{'fz' is unavailable}} - // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}} - float C = fz(F); // #MyClass_makeF_fz_call - return 0; - } -}; - -// Exported function without body, not used -export void exportedFunctionUnused(float f); - -// Exported function with body, without export, not used -void exportedFunctionUnused(float f) { - // expected-error@#exportedFunctionUnused_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} - // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} - float A = fx(f); // #exportedFunctionUnused_fx_call - - // API with shader-stage-specific availability in unused exported library function - // - no errors expected because the actual shader stage this function - // will be used in not known at this time - float B = fy(f); - float C = fz(f); -} - -// Exported function with body - called from main() which is a compute shader entry point -export void exportedFunctionUsed(float f) { - // expected-error@#exportedFunctionUsed_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} - // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} - float A = fx(f); // #exportedFunctionUsed_fx_call - - // expected-error@#exportedFunctionUsed_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}} - // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}} - float B = fy(f); // #exportedFunctionUsed_fy_call - - // expected-error@#exportedFunctionUsed_fz_call {{'fz' is unavailable}} - // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}} - float C = fz(f); // #exportedFunctionUsed_fz_call -} - -namespace A { - namespace B { - export { - void exportedFunctionInNS(float x) { - // expected-error@#exportedFunctionInNS_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} - // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} - 
float A = fx(x); // #exportedFunctionInNS_fx_call - - // API with shader-stage-specific availability in exported library function - // - no errors expected because the actual shader stage this function - // will be used in not known at this time - float B = fy(x); - float C = fz(x); - } - } - } -} - -// Shader entry point without body -[shader("compute")] -[numthreads(4,1,1)] -float main(); - -// Shader entry point with body -[shader("compute")] -[numthreads(4,1,1)] -float main() { - float f = 3; - MyClass C = { 1.0f }; - float a = alive(f); - float b = aliveTemp(f); // #aliveTemp_inst - float c = C.makeF(); - float d = test((float)1.0); - float e = test((half)1.0); - exportedFunctionUsed(1.0f); - return a * b * c; -} +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-library \ +// RUN: -fsyntax-only -verify %s + +__attribute__((availability(shadermodel, introduced = 6.5))) +float fx(float); // #fx + +__attribute__((availability(shadermodel, introduced = 6.6))) +half fx(half); // #fx_half + +__attribute__((availability(shadermodel, introduced = 5.0, environment = pixel))) +__attribute__((availability(shadermodel, introduced = 6.5, environment = compute))) +float fy(float); // #fy + +__attribute__((availability(shadermodel, introduced = 5.0, environment = pixel))) +__attribute__((availability(shadermodel, introduced = 6.5, environment = mesh))) +float fz(float); // #fz + +float also_alive(float f) { + // expected-error@#also_alive_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} + // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} + float A = fx(f); // #also_alive_fx_call + + // expected-error@#also_alive_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}} + // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}} + float B = fy(f); // #also_alive_fy_call + + // expected-error@#also_alive_fz_call {{'fz' is unavailable}} + // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}} + float C = fz(f); // #also_alive_fz_call + + return 0; +} + +float alive(float f) { + // expected-error@#alive_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} + // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} + float A = fx(f); // #alive_fx_call + + // expected-error@#alive_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}} + // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}} + float B = fy(f); // #alive_fy_call + + // expected-error@#alive_fz_call {{'fz' is unavailable}} + // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}} + float C = fz(f); // #alive_fz_call + + return also_alive(f); +} + +float also_dead(float f) { + // unreachable code - no errors expected + float A = fx(f); + float B = fy(f); + float C = fz(f); + return 0; +} + +float dead(float f) { + // unreachable code - no errors expected + float A = fx(f); + float B = fy(f); + float C = 
fz(f); + return also_dead(f); +} + +template +T aliveTemp(T f) { + // expected-error@#aliveTemp_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} + // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} + float A = fx(f); // #aliveTemp_fx_call + // expected-error@#aliveTemp_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}} + // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}} + float B = fy(f); // #aliveTemp_fy_call + // expected-error@#aliveTemp_fz_call {{'fz' is unavailable}} + // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}} + float C = fz(f); // #aliveTemp_fz_call + return 0; +} + +template T aliveTemp2(T f) { + // expected-error@#aliveTemp2_fx_call {{'fx' is only available on Shader Model 6.6 or newer}} + // expected-note@#fx_half {{'fx' has been marked as being introduced in Shader Model 6.6 here, but the deployment target is Shader Model 6.0}} + // expected-error@#aliveTemp2_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} + // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} + return fx(f); // #aliveTemp2_fx_call +} + +half test(half x) { + return aliveTemp2(x); +} + +float test(float x) { + return aliveTemp2(x); +} + +class MyClass +{ + float F; + float makeF() { + // expected-error@#MyClass_makeF_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} + // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} + float A = fx(F); // #MyClass_makeF_fx_call + // expected-error@#MyClass_makeF_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}} + // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}} + float B = fy(F); // #MyClass_makeF_fy_call + // expected-error@#MyClass_makeF_fz_call {{'fz' is unavailable}} + // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}} + float C = fz(F); // #MyClass_makeF_fz_call + return 0; + } +}; + +// Exported function without body, not used +export void exportedFunctionUnused(float f); + +// Exported function with body, without export, not used +void exportedFunctionUnused(float f) { + // expected-error@#exportedFunctionUnused_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} + // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} + float A = fx(f); // #exportedFunctionUnused_fx_call + + // API with shader-stage-specific availability in unused exported library function + // - no errors expected because the actual shader stage this function + // will be used in is not known at this time + float B = fy(f); + float C = fz(f); +} + +// Exported function with body - called from main() which is a compute shader entry point +export void exportedFunctionUsed(float f) { + //
expected-error@#exportedFunctionUsed_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} + // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} + float A = fx(f); // #exportedFunctionUsed_fx_call + + // expected-error@#exportedFunctionUsed_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}} + // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}} + float B = fy(f); // #exportedFunctionUsed_fy_call + + // expected-error@#exportedFunctionUsed_fz_call {{'fz' is unavailable}} + // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}} + float C = fz(f); // #exportedFunctionUsed_fz_call +} + +namespace A { + namespace B { + export { + void exportedFunctionInNS(float x) { + // expected-error@#exportedFunctionInNS_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} + // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} + float A = fx(x); // #exportedFunctionInNS_fx_call + + // API with shader-stage-specific availability in exported library function + // - no errors expected because the actual shader stage this function + // will be used in is not known at this time + float B = fy(x); + float C = fz(x); + } + } + } +} + +// Shader entry point without body +[shader("compute")] +[numthreads(4,1,1)] +float main(); + +// Shader entry point with body +[shader("compute")] +[numthreads(4,1,1)] +float main() { + float f = 3; + MyClass C = { 1.0f }; + float a = alive(f); + float b = aliveTemp(f); // #aliveTemp_inst + float c = C.makeF(); + float d = test((float)1.0); + float e = test((half)1.0); + exportedFunctionUsed(1.0f); + return a * b * c; +} diff --git a/clang/test/SemaHLSL/Availability/avail-diag-relaxed-compute.hlsl b/clang/test/SemaHLSL/Availability/avail-diag-relaxed-compute.hlsl index 65836c55821d77c72fa783bd88a7bae79ca0b723..40687983839303a4ba7a052bf08db9a8c532ca29 100644 --- a/clang/test/SemaHLSL/Availability/avail-diag-relaxed-compute.hlsl +++ b/clang/test/SemaHLSL/Availability/avail-diag-relaxed-compute.hlsl @@ -1,119 +1,119 @@ -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute \ -// RUN: -fsyntax-only -Wno-error=hlsl-availability -verify %s - -__attribute__((availability(shadermodel, introduced = 6.5))) -float fx(float); // #fx - -__attribute__((availability(shadermodel, introduced = 6.6))) -half fx(half); // #fx_half - -__attribute__((availability(shadermodel, introduced = 5.0, environment = pixel))) -__attribute__((availability(shadermodel, introduced = 6.5, environment = compute))) -float fy(float); // #fy - -__attribute__((availability(shadermodel, introduced = 5.0, environment = pixel))) -__attribute__((availability(shadermodel, introduced = 6.5, environment = mesh))) -float fz(float); // #fz - -float also_alive(float f) { - // expected-warning@#also_alive_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} - // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} - float A = fx(f); // #also_alive_fx_call - // expected-warning@#also_alive_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or 
newer}} - // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}} - float B = fy(f); // #also_alive_fy_call - // expected-warning@#also_alive_fz_call {{'fz' is unavailable}} - // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}} - float C = fz(f); // #also_alive_fz_call - return 0; -} - -float alive(float f) { - // expected-warning@#alive_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} - // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} - float A = fx(f); // #alive_fx_call - // expected-warning@#alive_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}} - // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}} - float B = fy(f); // #alive_fy_call - // expected-warning@#alive_fz_call {{'fz' is unavailable}} - // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}} - float C = fz(f); // #alive_fz_call - - return also_alive(f); -} - -float also_dead(float f) { - // unreachable code - no errors expected - float A = fx(f); - float B = fy(f); - float C = fz(f); - return 0; -} - -float dead(float f) { - // unreachable code - no errors expected - float A = fx(f); - float B = fy(f); - float C = fz(f); - - return also_dead(f); -} - -template -T aliveTemp(T f) { - // expected-warning@#aliveTemp_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} - // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} - float A = fx(f); // #aliveTemp_fx_call - // expected-warning@#aliveTemp_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}} - // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}} - float B = fy(f); // #aliveTemp_fy_call - // expected-warning@#aliveTemp_fz_call {{'fz' is unavailable}} - // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}} - float C = fz(f); // #aliveTemp_fz_call - return 0; -} - -template T aliveTemp2(T f) { - // expected-warning@#aliveTemp2_fx_call {{'fx' is only available on Shader Model 6.6 or newer}} - // expected-note@#fx_half {{'fx' has been marked as being introduced in Shader Model 6.6 here, but the deployment target is Shader Model 6.0}} - // expected-warning@#aliveTemp2_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} - // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} - return fx(f); // #aliveTemp2_fx_call -} - -half test(half x) { - return aliveTemp2(x); -} - -float test(float x) { - return aliveTemp2(x); -} - -class MyClass -{ - float F; - float makeF() { - // expected-warning@#MyClass_makeF_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} - // 
expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} - float A = fx(F); // #MyClass_makeF_fx_call - // expected-warning@#MyClass_makeF_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}} - // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}} - float B = fy(F); // #MyClass_makeF_fy_call - // expected-warning@#MyClass_makeF_fz_call {{'fz' is unavailable}} - // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}} - float C = fz(F); // #MyClass_makeF_fz_call - return 0; - } -}; - -[numthreads(4,1,1)] -float main() { - float f = 3; - MyClass C = { 1.0f }; - float a = alive(f); - float b = aliveTemp(f); // #aliveTemp_inst - float c = C.makeF(); - float d = test((float)1.0); - float e = test((half)1.0); - return a * b * c; -} +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute \ +// RUN: -fsyntax-only -Wno-error=hlsl-availability -verify %s + +__attribute__((availability(shadermodel, introduced = 6.5))) +float fx(float); // #fx + +__attribute__((availability(shadermodel, introduced = 6.6))) +half fx(half); // #fx_half + +__attribute__((availability(shadermodel, introduced = 5.0, environment = pixel))) +__attribute__((availability(shadermodel, introduced = 6.5, environment = compute))) +float fy(float); // #fy + +__attribute__((availability(shadermodel, introduced = 5.0, environment = pixel))) +__attribute__((availability(shadermodel, introduced = 6.5, environment = mesh))) +float fz(float); // #fz + +float also_alive(float f) { + // expected-warning@#also_alive_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} + // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} + float A = fx(f); // #also_alive_fx_call + // expected-warning@#also_alive_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}} + // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}} + float B = fy(f); // #also_alive_fy_call + // expected-warning@#also_alive_fz_call {{'fz' is unavailable}} + // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}} + float C = fz(f); // #also_alive_fz_call + return 0; +} + +float alive(float f) { + // expected-warning@#alive_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} + // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} + float A = fx(f); // #alive_fx_call + // expected-warning@#alive_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}} + // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}} + float B = fy(f); // #alive_fy_call + // expected-warning@#alive_fz_call {{'fz' is unavailable}} + // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, 
but the deployment target is Shader Model 6.0 compute environment}} + float C = fz(f); // #alive_fz_call + + return also_alive(f); +} + +float also_dead(float f) { + // unreachable code - no errors expected + float A = fx(f); + float B = fy(f); + float C = fz(f); + return 0; +} + +float dead(float f) { + // unreachable code - no errors expected + float A = fx(f); + float B = fy(f); + float C = fz(f); + + return also_dead(f); +} + +template +T aliveTemp(T f) { + // expected-warning@#aliveTemp_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} + // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} + float A = fx(f); // #aliveTemp_fx_call + // expected-warning@#aliveTemp_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}} + // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}} + float B = fy(f); // #aliveTemp_fy_call + // expected-warning@#aliveTemp_fz_call {{'fz' is unavailable}} + // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}} + float C = fz(f); // #aliveTemp_fz_call + return 0; +} + +template T aliveTemp2(T f) { + // expected-warning@#aliveTemp2_fx_call {{'fx' is only available on Shader Model 6.6 or newer}} + // expected-note@#fx_half {{'fx' has been marked as being introduced in Shader Model 6.6 here, but the deployment target is Shader Model 6.0}} + // expected-warning@#aliveTemp2_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} + // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} + return fx(f); // #aliveTemp2_fx_call +} + +half test(half x) { + return aliveTemp2(x); +} + +float test(float x) { + return aliveTemp2(x); +} + +class MyClass +{ + float F; + float makeF() { + // expected-warning@#MyClass_makeF_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} + // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} + float A = fx(F); // #MyClass_makeF_fx_call + // expected-warning@#MyClass_makeF_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}} + // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}} + float B = fy(F); // #MyClass_makeF_fy_call + // expected-warning@#MyClass_makeF_fz_call {{'fz' is unavailable}} + // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}} + float C = fz(F); // #MyClass_makeF_fz_call + return 0; + } +}; + +[numthreads(4,1,1)] +float main() { + float f = 3; + MyClass C = { 1.0f }; + float a = alive(f); + float b = aliveTemp(f); // #aliveTemp_inst + float c = C.makeF(); + float d = test((float)1.0); + float e = test((half)1.0); + return a * b * c; +} diff --git a/clang/test/SemaHLSL/Availability/avail-diag-relaxed-lib.hlsl b/clang/test/SemaHLSL/Availability/avail-diag-relaxed-lib.hlsl index 4c9783138f670197b0dbb30b0444194e440a547f..a23e91a546b167c3c33015e5c04635ea84505ad6 100644 --- 
a/clang/test/SemaHLSL/Availability/avail-diag-relaxed-lib.hlsl +++ b/clang/test/SemaHLSL/Availability/avail-diag-relaxed-lib.hlsl @@ -1,162 +1,162 @@ -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-library \ -// RUN: -fsyntax-only -Wno-error=hlsl-availability -verify %s - -__attribute__((availability(shadermodel, introduced = 6.5))) -float fx(float); // #fx - -__attribute__((availability(shadermodel, introduced = 6.6))) -half fx(half); // #fx_half - -__attribute__((availability(shadermodel, introduced = 5.0, environment = pixel))) -__attribute__((availability(shadermodel, introduced = 6.5, environment = compute))) -float fy(float); // #fy - -__attribute__((availability(shadermodel, introduced = 5.0, environment = pixel))) -__attribute__((availability(shadermodel, introduced = 6.5, environment = mesh))) -float fz(float); // #fz - -float also_alive(float f) { - // expected-warning@#also_alive_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} - // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} - float A = fx(f); // #also_alive_fx_call - - // expected-warning@#also_alive_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}} - // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}} - float B = fy(f); // #also_alive_fy_call - - // expected-warning@#also_alive_fz_call {{'fz' is unavailable}} - // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}} - float C = fz(f); // #also_alive_fz_call - - return 0; -} - -float alive(float f) { - // expected-warning@#alive_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} - // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} - float A = fx(f); // #alive_fx_call - - // expected-warning@#alive_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}} - // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}} - float B = fy(f); // #alive_fy_call - - // expected-warning@#alive_fz_call {{'fz' is unavailable}} - // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}} - float C = fz(f); // #alive_fz_call - - return also_alive(f); -} - -float also_dead(float f) { - // unreachable code - no errors expected - float A = fx(f); - float B = fy(f); - float C = fz(f); - return 0; -} - -float dead(float f) { - // unreachable code - no errors expected - float A = fx(f); - float B = fy(f); - float C = fz(f); - return also_dead(f); -} - -template -T aliveTemp(T f) { - // expected-warning@#aliveTemp_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} - // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} - float A = fx(f); // #aliveTemp_fx_call - // expected-warning@#aliveTemp_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}} - // expected-note@#fy {{'fy' has been marked as being introduced in Shader 
Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}} - float B = fy(f); // #aliveTemp_fy_call - // expected-warning@#aliveTemp_fz_call {{'fz' is unavailable}} - // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}} - float C = fz(f); // #aliveTemp_fz_call - return 0; -} - -template T aliveTemp2(T f) { - // expected-warning@#aliveTemp2_fx_call {{'fx' is only available on Shader Model 6.6 or newer}} - // expected-note@#fx_half {{'fx' has been marked as being introduced in Shader Model 6.6 here, but the deployment target is Shader Model 6.0}} - // expected-warning@#aliveTemp2_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} - // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} - return fx(f); // #aliveTemp2_fx_call -} - -half test(half x) { - return aliveTemp2(x); -} - -float test(float x) { - return aliveTemp2(x); -} - -class MyClass -{ - float F; - float makeF() { - // expected-warning@#MyClass_makeF_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} - // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} - float A = fx(F); // #MyClass_makeF_fx_call - // expected-warning@#MyClass_makeF_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}} - // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}} - float B = fy(F); // #MyClass_makeF_fy_call - // expected-warning@#MyClass_makeF_fz_call {{'fz' is unavailable}} - // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}} - float C = fz(F); // #MyClass_makeF_fz_call - return 0; - } -}; - -// Exported function without body, not used -export void exportedFunctionUnused(float f); - -// Exported function with body, without export, not used -void exportedFunctionUnused(float f) { - // expected-warning@#exportedFunctionUnused_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} - // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} - float A = fx(f); // #exportedFunctionUnused_fx_call - - // API with shader-stage-specific availability in unused exported library function - // - no errors expected because the actual shader stage this function - // will be used in not known at this time - float B = fy(f); - float C = fz(f); -} - -// Exported function with body - called from main() which is a compute shader entry point -export void exportedFunctionUsed(float f) { - // expected-warning@#exportedFunctionUsed_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} - // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} - float A = fx(f); // #exportedFunctionUsed_fx_call - - // expected-warning@#exportedFunctionUsed_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}} - // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment 
target is Shader Model 6.0 compute environment}} - float B = fy(f); // #exportedFunctionUsed_fy_call - - // expected-warning@#exportedFunctionUsed_fz_call {{'fz' is unavailable}} - // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}} - float C = fz(f); // #exportedFunctionUsed_fz_call -} - -// Shader entry point without body -[shader("compute")] -[numthreads(4,1,1)] -float main(); - -// Shader entry point with body -[shader("compute")] -[numthreads(4,1,1)] -float main() { - float f = 3; - MyClass C = { 1.0f }; - float a = alive(f); - float b = aliveTemp(f); // #aliveTemp_inst - float c = C.makeF(); - float d = test((float)1.0); - float e = test((half)1.0); - exportedFunctionUsed(1.0f); - return a * b * c; -} +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-library \ +// RUN: -fsyntax-only -Wno-error=hlsl-availability -verify %s + +__attribute__((availability(shadermodel, introduced = 6.5))) +float fx(float); // #fx + +__attribute__((availability(shadermodel, introduced = 6.6))) +half fx(half); // #fx_half + +__attribute__((availability(shadermodel, introduced = 5.0, environment = pixel))) +__attribute__((availability(shadermodel, introduced = 6.5, environment = compute))) +float fy(float); // #fy + +__attribute__((availability(shadermodel, introduced = 5.0, environment = pixel))) +__attribute__((availability(shadermodel, introduced = 6.5, environment = mesh))) +float fz(float); // #fz + +float also_alive(float f) { + // expected-warning@#also_alive_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} + // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} + float A = fx(f); // #also_alive_fx_call + + // expected-warning@#also_alive_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}} + // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}} + float B = fy(f); // #also_alive_fy_call + + // expected-warning@#also_alive_fz_call {{'fz' is unavailable}} + // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}} + float C = fz(f); // #also_alive_fz_call + + return 0; +} + +float alive(float f) { + // expected-warning@#alive_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} + // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} + float A = fx(f); // #alive_fx_call + + // expected-warning@#alive_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}} + // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}} + float B = fy(f); // #alive_fy_call + + // expected-warning@#alive_fz_call {{'fz' is unavailable}} + // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}} + float C = fz(f); // #alive_fz_call + + return also_alive(f); +} + +float also_dead(float f) { + // unreachable code - no errors expected + float A = fx(f); + float B = fy(f); 
+ float C = fz(f); + return 0; +} + +float dead(float f) { + // unreachable code - no errors expected + float A = fx(f); + float B = fy(f); + float C = fz(f); + return also_dead(f); +} + +template +T aliveTemp(T f) { + // expected-warning@#aliveTemp_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} + // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} + float A = fx(f); // #aliveTemp_fx_call + // expected-warning@#aliveTemp_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}} + // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}} + float B = fy(f); // #aliveTemp_fy_call + // expected-warning@#aliveTemp_fz_call {{'fz' is unavailable}} + // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}} + float C = fz(f); // #aliveTemp_fz_call + return 0; +} + +template T aliveTemp2(T f) { + // expected-warning@#aliveTemp2_fx_call {{'fx' is only available on Shader Model 6.6 or newer}} + // expected-note@#fx_half {{'fx' has been marked as being introduced in Shader Model 6.6 here, but the deployment target is Shader Model 6.0}} + // expected-warning@#aliveTemp2_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} + // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} + return fx(f); // #aliveTemp2_fx_call +} + +half test(half x) { + return aliveTemp2(x); +} + +float test(float x) { + return aliveTemp2(x); +} + +class MyClass +{ + float F; + float makeF() { + // expected-warning@#MyClass_makeF_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} + // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} + float A = fx(F); // #MyClass_makeF_fx_call + // expected-warning@#MyClass_makeF_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}} + // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}} + float B = fy(F); // #MyClass_makeF_fy_call + // expected-warning@#MyClass_makeF_fz_call {{'fz' is unavailable}} + // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}} + float C = fz(F); // #MyClass_makeF_fz_call + return 0; + } +}; + +// Exported function without body, not used +export void exportedFunctionUnused(float f); + +// Exported function with body, without export, not used +void exportedFunctionUnused(float f) { + // expected-warning@#exportedFunctionUnused_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} + // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} + float A = fx(f); // #exportedFunctionUnused_fx_call + + // API with shader-stage-specific availability in unused exported library function + // - no errors expected because the actual shader stage this function + // will be used in is not known at this time + float B = fy(f); + float C = fz(f); +} + +// Exported
function with body - called from main() which is a compute shader entry point +export void exportedFunctionUsed(float f) { + // expected-warning@#exportedFunctionUsed_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} + // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} + float A = fx(f); // #exportedFunctionUsed_fx_call + + // expected-warning@#exportedFunctionUsed_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}} + // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}} + float B = fy(f); // #exportedFunctionUsed_fy_call + + // expected-warning@#exportedFunctionUsed_fz_call {{'fz' is unavailable}} + // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}} + float C = fz(f); // #exportedFunctionUsed_fz_call +} + +// Shader entry point without body +[shader("compute")] +[numthreads(4,1,1)] +float main(); + +// Shader entry point with body +[shader("compute")] +[numthreads(4,1,1)] +float main() { + float f = 3; + MyClass C = { 1.0f }; + float a = alive(f); + float b = aliveTemp(f); // #aliveTemp_inst + float c = C.makeF(); + float d = test((float)1.0); + float e = test((half)1.0); + exportedFunctionUsed(1.0f); + return a * b * c; +} diff --git a/clang/test/SemaHLSL/Availability/avail-diag-strict-compute.hlsl b/clang/test/SemaHLSL/Availability/avail-diag-strict-compute.hlsl index b67e10c9a9017ac3f7a96d7f9504a3c4b87850f1..a8783c10cbabca68f5187a53a32a0f9df67cf24d 100644 --- a/clang/test/SemaHLSL/Availability/avail-diag-strict-compute.hlsl +++ b/clang/test/SemaHLSL/Availability/avail-diag-strict-compute.hlsl @@ -1,129 +1,129 @@ -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute \ -// RUN: -fhlsl-strict-availability -fsyntax-only -verify %s - -__attribute__((availability(shadermodel, introduced = 6.5))) -float fx(float); // #fx - -__attribute__((availability(shadermodel, introduced = 6.6))) -half fx(half); // #fx_half - -__attribute__((availability(shadermodel, introduced = 5.0, environment = pixel))) -__attribute__((availability(shadermodel, introduced = 6.5, environment = compute))) -float fy(float); // #fy - -__attribute__((availability(shadermodel, introduced = 5.0, environment = pixel))) -__attribute__((availability(shadermodel, introduced = 6.5, environment = mesh))) -float fz(float); // #fz - -float also_alive(float f) { - // expected-error@#also_alive_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} - // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} - float A = fx(f); // #also_alive_fx_call - // expected-error@#also_alive_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}} - // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}} - float B = fy(f); // #also_alive_fy_call - // expected-error@#also_alive_fz_call {{'fz' is unavailable}} - // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}} - float C = fz(f); // 
#also_alive_fz_call - return 0; -} - -float alive(float f) { - // expected-error@#alive_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} - // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} - float A = fx(f); // #alive_fx_call - // expected-error@#alive_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}} - // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}} - float B = fy(f); // #alive_fy_call - // expected-error@#alive_fz_call {{'fz' is unavailable}} - // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}} - float C = fz(f); // #alive_fz_call - - return also_alive(f); -} - -float also_dead(float f) { - // expected-error@#also_dead_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} - // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} - float A = fx(f); // #also_dead_fx_call - // expected-error@#also_dead_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}} - // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}} - float B = fy(f); // #also_dead_fy_call - // expected-error@#also_dead_fz_call {{'fz' is unavailable}} - // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}} - float C = fz(f); // #also_dead_fz_call - return 0; -} - -float dead(float f) { - // expected-error@#dead_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} - // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} - float A = fx(f); // #dead_fx_call - // expected-error@#dead_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}} - // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}} - float B = fy(f); // #dead_fy_call - // expected-error@#dead_fz_call {{'fz' is unavailable}} - // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}} - float C = fz(f); // #dead_fz_call - - return also_dead(f); -} - -template -T aliveTemp(T f) { - // expected-error@#aliveTemp_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} - // expected-note@#aliveTemp_inst {{in instantiation of function template specialization 'aliveTemp' requested here}} - // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} - float A = fx(f); // #aliveTemp_fx_call - // expected-error@#aliveTemp_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}} - // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute 
environment}} - float B = fy(f); // #aliveTemp_fy_call - // expected-error@#aliveTemp_fz_call {{'fz' is unavailable}} - // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}} - float C = fz(f); // #aliveTemp_fz_call - return 0; -} - -template T aliveTemp2(T f) { - // expected-error@#aliveTemp2_fx_call {{'fx' is only available on Shader Model 6.6 or newer}} - // expected-note@#fx_half {{'fx' has been marked as being introduced in Shader Model 6.6 here, but the deployment target is Shader Model 6.0}} - // expected-error@#aliveTemp2_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} - // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} - return fx(f); // #aliveTemp2_fx_call -} - -half test(half x) { - return aliveTemp2(x); // expected-note {{in instantiation of function template specialization 'aliveTemp2' requested here}} -} - -float test(float x) { - return aliveTemp2(x); // expected-note {{in instantiation of function template specialization 'aliveTemp2' requested here}} -} - -class MyClass -{ - float F; - float makeF() { - // expected-error@#MyClass_makeF_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} - // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} - float A = fx(F); // #MyClass_makeF_fx_call - // expected-error@#MyClass_makeF_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}} - // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}} - float B = fy(F); // #MyClass_makeF_fy_call - // expected-error@#MyClass_makeF_fz_call {{'fz' is unavailable}} - // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}} - float C = fz(F); // #MyClass_makeF_fz_call - } -}; - -[numthreads(4,1,1)] -float main() { - float f = 3; - MyClass C = { 1.0f }; - float a = alive(f); - float b = aliveTemp(f); // #aliveTemp_inst - float c = C.makeF(); - float d = test((float)1.0); - float e = test((half)1.0); - return a * b * c; +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute \ +// RUN: -fhlsl-strict-availability -fsyntax-only -verify %s + +__attribute__((availability(shadermodel, introduced = 6.5))) +float fx(float); // #fx + +__attribute__((availability(shadermodel, introduced = 6.6))) +half fx(half); // #fx_half + +__attribute__((availability(shadermodel, introduced = 5.0, environment = pixel))) +__attribute__((availability(shadermodel, introduced = 6.5, environment = compute))) +float fy(float); // #fy + +__attribute__((availability(shadermodel, introduced = 5.0, environment = pixel))) +__attribute__((availability(shadermodel, introduced = 6.5, environment = mesh))) +float fz(float); // #fz + +float also_alive(float f) { + // expected-error@#also_alive_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} + // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} + float A = fx(f); // #also_alive_fx_call + // expected-error@#also_alive_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or 
newer}} + // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}} + float B = fy(f); // #also_alive_fy_call + // expected-error@#also_alive_fz_call {{'fz' is unavailable}} + // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}} + float C = fz(f); // #also_alive_fz_call + return 0; +} + +float alive(float f) { + // expected-error@#alive_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} + // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} + float A = fx(f); // #alive_fx_call + // expected-error@#alive_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}} + // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}} + float B = fy(f); // #alive_fy_call + // expected-error@#alive_fz_call {{'fz' is unavailable}} + // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}} + float C = fz(f); // #alive_fz_call + + return also_alive(f); +} + +float also_dead(float f) { + // expected-error@#also_dead_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} + // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} + float A = fx(f); // #also_dead_fx_call + // expected-error@#also_dead_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}} + // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}} + float B = fy(f); // #also_dead_fy_call + // expected-error@#also_dead_fz_call {{'fz' is unavailable}} + // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}} + float C = fz(f); // #also_dead_fz_call + return 0; +} + +float dead(float f) { + // expected-error@#dead_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} + // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} + float A = fx(f); // #dead_fx_call + // expected-error@#dead_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}} + // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}} + float B = fy(f); // #dead_fy_call + // expected-error@#dead_fz_call {{'fz' is unavailable}} + // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}} + float C = fz(f); // #dead_fz_call + + return also_dead(f); +} + +template +T aliveTemp(T f) { + // expected-error@#aliveTemp_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} + // expected-note@#aliveTemp_inst {{in instantiation of function template 
specialization 'aliveTemp' requested here}} + // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} + float A = fx(f); // #aliveTemp_fx_call + // expected-error@#aliveTemp_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}} + // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}} + float B = fy(f); // #aliveTemp_fy_call + // expected-error@#aliveTemp_fz_call {{'fz' is unavailable}} + // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}} + float C = fz(f); // #aliveTemp_fz_call + return 0; +} + +template T aliveTemp2(T f) { + // expected-error@#aliveTemp2_fx_call {{'fx' is only available on Shader Model 6.6 or newer}} + // expected-note@#fx_half {{'fx' has been marked as being introduced in Shader Model 6.6 here, but the deployment target is Shader Model 6.0}} + // expected-error@#aliveTemp2_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} + // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} + return fx(f); // #aliveTemp2_fx_call +} + +half test(half x) { + return aliveTemp2(x); // expected-note {{in instantiation of function template specialization 'aliveTemp2' requested here}} +} + +float test(float x) { + return aliveTemp2(x); // expected-note {{in instantiation of function template specialization 'aliveTemp2' requested here}} +} + +class MyClass +{ + float F; + float makeF() { + // expected-error@#MyClass_makeF_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} + // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} + float A = fx(F); // #MyClass_makeF_fx_call + // expected-error@#MyClass_makeF_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}} + // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}} + float B = fy(F); // #MyClass_makeF_fy_call + // expected-error@#MyClass_makeF_fz_call {{'fz' is unavailable}} + // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}} + float C = fz(F); // #MyClass_makeF_fz_call + } +}; + +[numthreads(4,1,1)] +float main() { + float f = 3; + MyClass C = { 1.0f }; + float a = alive(f); + float b = aliveTemp(f); // #aliveTemp_inst + float c = C.makeF(); + float d = test((float)1.0); + float e = test((half)1.0); + return a * b * c; } \ No newline at end of file diff --git a/clang/test/SemaHLSL/Availability/avail-diag-strict-lib.hlsl b/clang/test/SemaHLSL/Availability/avail-diag-strict-lib.hlsl index c7be5afbc2d22f8d603d39de6ab5a1a2b6da7cb9..0fffbc96dac19440691f80030ba9532493e8e855 100644 --- a/clang/test/SemaHLSL/Availability/avail-diag-strict-lib.hlsl +++ b/clang/test/SemaHLSL/Availability/avail-diag-strict-lib.hlsl @@ -1,192 +1,192 @@ -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-library \ -// RUN: -fhlsl-strict-availability -fsyntax-only -verify %s - -__attribute__((availability(shadermodel, introduced = 6.5))) 
-float fx(float); // #fx - -__attribute__((availability(shadermodel, introduced = 6.6))) -half fx(half); // #fx_half - -__attribute__((availability(shadermodel, introduced = 5.0, environment = pixel))) -__attribute__((availability(shadermodel, introduced = 6.5, environment = compute))) -float fy(float); // #fy - -__attribute__((availability(shadermodel, introduced = 5.0, environment = pixel))) -__attribute__((availability(shadermodel, introduced = 6.5, environment = mesh))) -float fz(float); // #fz - -// FIXME: all diagnostics marked as FUTURE will come alive when HLSL default -// diagnostic mode is implemented in a future PR which will verify calls in -// all functions that are reachable from the shader library entry points - -float also_alive(float f) { - // expected-error@#also_alive_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} - // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} - float A = fx(f); // #also_alive_fx_call - - // expected-error@#also_alive_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}} - // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}} - float B = fy(f); // #also_alive_fy_call - - // expected-error@#also_alive_fz_call {{'fz' is unavailable}} - // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}} - float C = fz(f); // #also_alive_fz_call - - return 0; -} - -float alive(float f) { - // expected-error@#alive_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} - // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} - float A = fx(f); // #alive_fx_call - - // expected-error@#alive_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}} - // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}} - float B = fy(f); // #alive_fy_call - - // expected-error@#alive_fz_call {{'fz' is unavailable}} - // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}} - float C = fz(f); // #alive_fz_call - - return also_alive(f); -} - -float also_dead(float f) { - // expected-error@#also_dead_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} - // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} - float A = fx(f); // #also_dead_fx_call - - // Call to environment-specific function from an unreachable function - // in a shader library - no diagnostic expected. - float B = fy(f); // #also_dead_fy_call - - // Call to environment-specific function from an unreachable function - // in a shader library - no diagnostic expected. 
- float C = fz(f); // #also_dead_fz_call - return 0; -} - -float dead(float f) { - // expected-error@#dead_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} - // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} - float A = fx(f); // #dead_fx_call - - // Call to environment-specific function from an unreachable function - // in a shader library - no diagnostic expected. - float B = fy(f); // #dead_fy_call - - // Call to environment-specific function from an unreachable function - // in a shader library - no diagnostic expected. - float C = fz(f); // #dead_fz_call - - return also_dead(f); -} - -template <typename T> -T aliveTemp(T f) { - // expected-error@#aliveTemp_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} - // expected-note@#aliveTemp_inst {{in instantiation of function template specialization 'aliveTemp<float>' requested here}} - // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} - float A = fx(f); // #aliveTemp_fx_call - // expected-error@#aliveTemp_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}} - // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}} - float B = fy(f); // #aliveTemp_fy_call - // expected-error@#aliveTemp_fz_call {{'fz' is unavailable}} - // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}} - float C = fz(f); // #aliveTemp_fz_call - return 0; -} - -template <typename T> T aliveTemp2(T f) { - // expected-error@#aliveTemp2_fx_call {{'fx' is only available on Shader Model 6.6 or newer}} - // expected-note@#fx_half {{'fx' has been marked as being introduced in Shader Model 6.6 here, but the deployment target is Shader Model 6.0}} - // expected-error@#aliveTemp2_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} - // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} - return fx(f); // #aliveTemp2_fx_call -} - -half test(half x) { - return aliveTemp2(x); // expected-note {{in instantiation of function template specialization 'aliveTemp2<half>' requested here}} -} - -float test(float x) { - return aliveTemp2(x); // expected-note {{in instantiation of function template specialization 'aliveTemp2<float>' requested here}} -} - -class MyClass -{ - float F; - float makeF() { - // expected-error@#MyClass_makeF_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} - // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} - float A = fx(F); // #MyClass_makeF_fx_call - // expected-error@#MyClass_makeF_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}} - // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}} - float B = fy(F); // #MyClass_makeF_fy_call - // expected-error@#MyClass_makeF_fz_call {{'fz' is unavailable}} - // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}} -
float C = fz(F); // #MyClass_makeF_fz_call - } -}; - -// Exported function without body, not used -export void exportedFunctionUnused(float f); - -// Exported function with body, without export, not used -void exportedFunctionUnused(float f) { - // expected-error@#exportedFunctionUnused_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} - // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} - float A = fx(f); // #exportedFunctionUnused_fx_call - - // API with shader-stage-specific availability in unused exported library function - // - no errors expected because the actual shader stage this function - // will be used in not known at this time - float B = fy(f); - float C = fz(f); -} - -// Exported function with body - called from main() which is a compute shader entry point -export void exportedFunctionUsed(float f) { - // expected-error@#exportedFunctionUsed_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} - // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} - float A = fx(f); // #exportedFunctionUsed_fx_call - - // expected-error@#exportedFunctionUsed_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}} - // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}} - float B = fy(f); // #exportedFunctionUsed_fy_call - - // expected-error@#exportedFunctionUsed_fz_call {{'fz' is unavailable}} - // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}} - float C = fz(f); // #exportedFunctionUsed_fz_call -} - -namespace A { - namespace B { - export { - void exportedFunctionInNS(float x) { - // expected-error@#exportedFunctionInNS_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} - // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} - float A = fx(x); // #exportedFunctionInNS_fx_call - - // API with shader-stage-specific availability in exported library function - // - no errors expected because the actual shader stage this function - // will be used in not known at this time - float B = fy(x); - float C = fz(x); - } - } - } -} - -[shader("compute")] -[numthreads(4,1,1)] -float main() { - float f = 3; - MyClass C = { 1.0f }; - float a = alive(f); - float b = aliveTemp(f); // #aliveTemp_inst - float c = C.makeF(); - float d = test((float)1.0); - float e = test((half)1.0); - exportedFunctionUsed(1.0f); - return a * b * c; -} +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-library \ +// RUN: -fhlsl-strict-availability -fsyntax-only -verify %s + +__attribute__((availability(shadermodel, introduced = 6.5))) +float fx(float); // #fx + +__attribute__((availability(shadermodel, introduced = 6.6))) +half fx(half); // #fx_half + +__attribute__((availability(shadermodel, introduced = 5.0, environment = pixel))) +__attribute__((availability(shadermodel, introduced = 6.5, environment = compute))) +float fy(float); // #fy + +__attribute__((availability(shadermodel, introduced = 5.0, environment = pixel))) +__attribute__((availability(shadermodel, introduced = 6.5, environment = mesh))) +float fz(float); // #fz + +// FIXME: all diagnostics
marked as FUTURE will come alive when HLSL default +// diagnostic mode is implemented in a future PR which will verify calls in +// all functions that are reachable from the shader library entry points + +float also_alive(float f) { + // expected-error@#also_alive_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} + // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} + float A = fx(f); // #also_alive_fx_call + + // expected-error@#also_alive_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}} + // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}} + float B = fy(f); // #also_alive_fy_call + + // expected-error@#also_alive_fz_call {{'fz' is unavailable}} + // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}} + float C = fz(f); // #also_alive_fz_call + + return 0; +} + +float alive(float f) { + // expected-error@#alive_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} + // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} + float A = fx(f); // #alive_fx_call + + // expected-error@#alive_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}} + // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}} + float B = fy(f); // #alive_fy_call + + // expected-error@#alive_fz_call {{'fz' is unavailable}} + // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}} + float C = fz(f); // #alive_fz_call + + return also_alive(f); +} + +float also_dead(float f) { + // expected-error@#also_dead_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} + // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} + float A = fx(f); // #also_dead_fx_call + + // Call to environment-specific function from an unreachable function + // in a shader library - no diagnostic expected. + float B = fy(f); // #also_dead_fy_call + + // Call to environment-specific function from an unreachable function + // in a shader library - no diagnostic expected. + float C = fz(f); // #also_dead_fz_call + return 0; +} + +float dead(float f) { + // expected-error@#dead_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} + // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} + float A = fx(f); // #dead_fx_call + + // Call to environment-specific function from an unreachable function + // in a shader library - no diagnostic expected. + float B = fy(f); // #dead_fy_call + + // Call to environment-specific function from an unreachable function + // in a shader library - no diagnostic expected. 
+ float C = fz(f); // #dead_fz_call + + return also_dead(f); +} + +template <typename T> +T aliveTemp(T f) { + // expected-error@#aliveTemp_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} + // expected-note@#aliveTemp_inst {{in instantiation of function template specialization 'aliveTemp<float>' requested here}} + // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} + float A = fx(f); // #aliveTemp_fx_call + // expected-error@#aliveTemp_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}} + // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}} + float B = fy(f); // #aliveTemp_fy_call + // expected-error@#aliveTemp_fz_call {{'fz' is unavailable}} + // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}} + float C = fz(f); // #aliveTemp_fz_call + return 0; +} + +template <typename T> T aliveTemp2(T f) { + // expected-error@#aliveTemp2_fx_call {{'fx' is only available on Shader Model 6.6 or newer}} + // expected-note@#fx_half {{'fx' has been marked as being introduced in Shader Model 6.6 here, but the deployment target is Shader Model 6.0}} + // expected-error@#aliveTemp2_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} + // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} + return fx(f); // #aliveTemp2_fx_call +} + +half test(half x) { + return aliveTemp2(x); // expected-note {{in instantiation of function template specialization 'aliveTemp2<half>' requested here}} +} + +float test(float x) { + return aliveTemp2(x); // expected-note {{in instantiation of function template specialization 'aliveTemp2<float>' requested here}} +} + +class MyClass +{ + float F; + float makeF() { + // expected-error@#MyClass_makeF_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} + // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} + float A = fx(F); // #MyClass_makeF_fx_call + // expected-error@#MyClass_makeF_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}} + // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}} + float B = fy(F); // #MyClass_makeF_fy_call + // expected-error@#MyClass_makeF_fz_call {{'fz' is unavailable}} + // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}} + float C = fz(F); // #MyClass_makeF_fz_call + } +}; + +// Exported function without body, not used +export void exportedFunctionUnused(float f); + +// Exported function with body, without export, not used +void exportedFunctionUnused(float f) { + // expected-error@#exportedFunctionUnused_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} + // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} + float A = fx(f); // #exportedFunctionUnused_fx_call + + // API with shader-stage-specific availability in unused exported library function
+ // - no errors expected because the actual shader stage this function + // will be used in is not known at this time + float B = fy(f); + float C = fz(f); +} + +// Exported function with body - called from main() which is a compute shader entry point +export void exportedFunctionUsed(float f) { + // expected-error@#exportedFunctionUsed_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} + // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} + float A = fx(f); // #exportedFunctionUsed_fx_call + + // expected-error@#exportedFunctionUsed_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}} + // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}} + float B = fy(f); // #exportedFunctionUsed_fy_call + + // expected-error@#exportedFunctionUsed_fz_call {{'fz' is unavailable}} + // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}} + float C = fz(f); // #exportedFunctionUsed_fz_call +} + +namespace A { + namespace B { + export { + void exportedFunctionInNS(float x) { + // expected-error@#exportedFunctionInNS_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} + // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} + float A = fx(x); // #exportedFunctionInNS_fx_call + + // API with shader-stage-specific availability in exported library function + // - no errors expected because the actual shader stage this function + // will be used in is not known at this time + float B = fy(x); + float C = fz(x); + } + } + } +} + +[shader("compute")] +[numthreads(4,1,1)] +float main() { + float f = 3; + MyClass C = { 1.0f }; + float a = alive(f); + float b = aliveTemp(f); // #aliveTemp_inst + float c = C.makeF(); + float d = test((float)1.0); + float e = test((half)1.0); + exportedFunctionUsed(1.0f); + return a * b * c; +} diff --git a/clang/test/SemaHLSL/Availability/avail-lib-multiple-stages.hlsl index b56ab8fe4526ba896c8a20d97b6bc52496d69798..bfefc9b116a64f9e701f6e924b57b7bdb98744a9 100644 --- a/clang/test/SemaHLSL/Availability/avail-lib-multiple-stages.hlsl +++ b/clang/test/SemaHLSL/Availability/avail-lib-multiple-stages.hlsl @@ -1,57 +1,57 @@ -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-library \ -// RUN: -fsyntax-only -verify %s - -__attribute__((availability(shadermodel, introduced = 6.5))) -float fx(float); // #fx - -__attribute__((availability(shadermodel, introduced = 5.0, environment = pixel))) -__attribute__((availability(shadermodel, introduced = 6.5, environment = compute))) -float fy(float); // #fy - -__attribute__((availability(shadermodel, introduced = 5.0, environment = compute))) -float fz(float); // #fz - - -void F(float f) { - // Make sure we only get this error once, even though this function is scanned twice - once - // in compute shader context and once in pixel shader context.
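For background, the availability pattern these library tests exercise can be sketched in a few lines of HLSL (a hypothetical example; 'gx' and 'csMain' are illustrative names, not part of the tests in this patch). Against a shadermodel6.0 deployment target, the call below is diagnosed once per call site, matching the "only get this error once" behavior checked here:

    __attribute__((availability(shadermodel, introduced = 6.5)))
    float gx(float); // hypothetical API introduced in SM 6.5

    [shader("compute")]
    [numthreads(4,1,1)]
    void csMain() {
      // one error on a 6.0 target, even if a pixel entry point also reaches this call
      float v = gx(1.0);
    }
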
- // expected-error@#fx_call {{'fx' is only available on Shader Model 6.5 or newer}} - // expected-note@#fx {{fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} - float A = fx(f); // #fx_call - - // expected-error@#fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}} - // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}} - float B = fy(f); // #fy_call - - // expected-error@#fz_call {{'fz' is unavailable}} - // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 5.0 in compute environment here, but the deployment target is Shader Model 6.0 pixel environment}} - float X = fz(f); // #fz_call -} - -void deadCode(float f) { - // no diagnostics expected under default diagnostic mode - float A = fx(f); - float B = fy(f); - float X = fz(f); -} - -// Pixel shader -[shader("pixel")] -void mainPixel() { - F(1.0); -} - -// First Compute shader -[shader("compute")] -[numthreads(4,1,1)] -void mainCompute1() { - F(2.0); -} - -// Second compute shader to make sure we do not get duplicate messages if F is called -// from multiple entry points. -[shader("compute")] -[numthreads(4,1,1)] -void mainCompute2() { - F(3.0); -} +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-library \ +// RUN: -fsyntax-only -verify %s + +__attribute__((availability(shadermodel, introduced = 6.5))) +float fx(float); // #fx + +__attribute__((availability(shadermodel, introduced = 5.0, environment = pixel))) +__attribute__((availability(shadermodel, introduced = 6.5, environment = compute))) +float fy(float); // #fy + +__attribute__((availability(shadermodel, introduced = 5.0, environment = compute))) +float fz(float); // #fz + + +void F(float f) { + // Make sure we only get this error once, even though this function is scanned twice - once + // in compute shader context and once in pixel shader context. + // expected-error@#fx_call {{'fx' is only available on Shader Model 6.5 or newer}} + // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} + float A = fx(f); // #fx_call + + // expected-error@#fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}} + // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}} + float B = fy(f); // #fy_call + + // expected-error@#fz_call {{'fz' is unavailable}} + // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 5.0 in compute environment here, but the deployment target is Shader Model 6.0 pixel environment}} + float X = fz(f); // #fz_call +} + +void deadCode(float f) { + // no diagnostics expected under default diagnostic mode + float A = fx(f); + float B = fy(f); + float X = fz(f); +} + +// Pixel shader +[shader("pixel")] +void mainPixel() { + F(1.0); +} + +// First Compute shader +[shader("compute")] +[numthreads(4,1,1)] +void mainCompute1() { + F(2.0); +} + +// Second compute shader to make sure we do not get duplicate messages if F is called +// from multiple entry points.
+[shader("compute")] +[numthreads(4,1,1)] +void mainCompute2() { + F(3.0); +} diff --git a/clang/test/SemaHLSL/BuiltIns/StructuredBuffers.hlsl index a472d5519dc51fcccf166f62948fe400109335d8..1ec56542113d90b9df78fb5e2248cc06b6c7697f 100644 --- a/clang/test/SemaHLSL/BuiltIns/StructuredBuffers.hlsl +++ b/clang/test/SemaHLSL/BuiltIns/StructuredBuffers.hlsl @@ -1,19 +1,19 @@ -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -x hlsl -fsyntax-only -verify %s - -typedef vector<float, 3> float3; - -StructuredBuffer<float3> Buffer; - -// expected-error@+2 {{class template 'StructuredBuffer' requires template arguments}} -// expected-note@*:* {{template declaration from hidden source: template <class element_type> class StructuredBuffer}} -StructuredBuffer BufferErr1; - -// expected-error@+2 {{too few template arguments for class template 'StructuredBuffer'}} -// expected-note@*:* {{template declaration from hidden source: template <class element_type> class StructuredBuffer}} -StructuredBuffer<> BufferErr2; - -[numthreads(1,1,1)] -void main() { - (void)Buffer.h; // expected-error {{'h' is a private member of 'hlsl::StructuredBuffer<vector<float, 3>>'}} - // expected-note@* {{implicitly declared private here}} -} +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -x hlsl -fsyntax-only -verify %s + +typedef vector<float, 3> float3; + +StructuredBuffer<float3> Buffer; + +// expected-error@+2 {{class template 'StructuredBuffer' requires template arguments}} +// expected-note@*:* {{template declaration from hidden source: template <class element_type> class StructuredBuffer}} +StructuredBuffer BufferErr1; + +// expected-error@+2 {{too few template arguments for class template 'StructuredBuffer'}} +// expected-note@*:* {{template declaration from hidden source: template <class element_type> class StructuredBuffer}} +StructuredBuffer<> BufferErr2; + +[numthreads(1,1,1)] +void main() { + (void)Buffer.h; // expected-error {{'h' is a private member of 'hlsl::StructuredBuffer<vector<float, 3>>'}} + // expected-note@* {{implicitly declared private here}} +} diff --git a/clang/test/SemaHLSL/BuiltIns/WaveReadLaneAt-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/WaveReadLaneAt-errors.hlsl new file mode 100644 index 0000000000000000000000000000000000000000..ef8299b59ca73f36cb26bec4266509bd7d2f288e --- /dev/null +++ b/clang/test/SemaHLSL/BuiltIns/WaveReadLaneAt-errors.hlsl @@ -0,0 +1,38 @@ +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -emit-llvm-only -disable-llvm-passes -verify + +bool test_too_few_arg() { + return __builtin_hlsl_wave_read_lane_at(); + // expected-error@-1 {{too few arguments to function call, expected 2, have 0}} +} + +float2 test_too_few_arg_1(float2 p0) { + return __builtin_hlsl_wave_read_lane_at(p0); + // expected-error@-1 {{too few arguments to function call, expected 2, have 1}} +} + +float2 test_too_many_arg(float2 p0) { + return __builtin_hlsl_wave_read_lane_at(p0, p0, p0); + // expected-error@-1 {{too many arguments to function call, expected 2, have 3}} +} + +float3 test_index_double_type_check(float3 p0, double idx) { + return __builtin_hlsl_wave_read_lane_at(p0, idx); + // expected-error@-1 {{passing 'double' to parameter of incompatible type 'unsigned int'}} +} + +float3 test_index_int3_type_check(float3 p0, int3 idxs) { + return __builtin_hlsl_wave_read_lane_at(p0, idxs); + // expected-error@-1 {{passing 'int3' (aka 'vector<int, 3>') to parameter of incompatible type 'unsigned int'}} +} + +struct S { float f; }; + +float3 test_index_S_type_check(float3 p0, S idx) { + return __builtin_hlsl_wave_read_lane_at(p0, idx); + //
expected-error@-1 {{passing 'S' to parameter of incompatible type 'unsigned int'}} +} + +S test_expr_struct_type_check(S p0, int idx) { + return __builtin_hlsl_wave_read_lane_at(p0, idx); + // expected-error@-1 {{invalid operand of type 'S' where a scalar or vector is required}} +} diff --git a/clang/test/SemaHLSL/BuiltIns/cross-errors.hlsl index 423f5bac9471f4d5934635c18f177d11bedd9d68..354e7abb8a31eb76512072b6902c318d77b0f017 100644 --- a/clang/test/SemaHLSL/BuiltIns/cross-errors.hlsl +++ b/clang/test/SemaHLSL/BuiltIns/cross-errors.hlsl @@ -1,43 +1,43 @@ -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -disable-llvm-passes -verify - -void test_too_few_arg() -{ - return __builtin_hlsl_cross(); - // expected-error@-1 {{too few arguments to function call, expected 2, have 0}} -} - -void test_too_many_arg(float3 p0) -{ - return __builtin_hlsl_cross(p0, p0, p0); - // expected-error@-1 {{too many arguments to function call, expected 2, have 3}} -} - -bool builtin_bool_to_float_type_promotion(bool p1) -{ - return __builtin_hlsl_cross(p1, p1); - // expected-error@-1 {{passing 'bool' to parameter of incompatible type 'float'}} -} - -bool builtin_cross_int_to_float_promotion(int p1) -{ - return __builtin_hlsl_cross(p1, p1); - // expected-error@-1 {{passing 'int' to parameter of incompatible type 'float'}} -} - -bool2 builtin_cross_int2_to_float2_promotion(int2 p1) -{ - return __builtin_hlsl_cross(p1, p1); - // expected-error@-1 {{passing 'int2' (aka 'vector<int, 2>') to parameter of incompatible type '__attribute__((__vector_size__(2 * sizeof(float)))) float' (vector of 2 'float' values)}} -} - -float2 builtin_cross_float2(float2 p1, float2 p2) -{ - return __builtin_hlsl_cross(p1, p2); - // expected-error@-1 {{too many elements in vector operand (expected 3 elements, have 2)}} -} - -float3 builtin_cross_float3_int3(float3 p1, int3 p2) -{ - return __builtin_hlsl_cross(p1, p2); - // expected-error@-1 {{all arguments to '__builtin_hlsl_cross' must have the same type}} -} +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -disable-llvm-passes -verify + +void test_too_few_arg() +{ + return __builtin_hlsl_cross(); + // expected-error@-1 {{too few arguments to function call, expected 2, have 0}} +} + +void test_too_many_arg(float3 p0) +{ + return __builtin_hlsl_cross(p0, p0, p0); + // expected-error@-1 {{too many arguments to function call, expected 2, have 3}} +} + +bool builtin_bool_to_float_type_promotion(bool p1) +{ + return __builtin_hlsl_cross(p1, p1); + // expected-error@-1 {{passing 'bool' to parameter of incompatible type 'float'}} +} + +bool builtin_cross_int_to_float_promotion(int p1) +{ + return __builtin_hlsl_cross(p1, p1); + // expected-error@-1 {{passing 'int' to parameter of incompatible type 'float'}} +} + +bool2 builtin_cross_int2_to_float2_promotion(int2 p1) +{ + return __builtin_hlsl_cross(p1, p1); + // expected-error@-1 {{passing 'int2' (aka 'vector<int, 2>') to parameter of incompatible type '__attribute__((__vector_size__(2 * sizeof(float)))) float' (vector of 2 'float' values)}} +} + +float2 builtin_cross_float2(float2 p1, float2 p2) +{ + return __builtin_hlsl_cross(p1, p2); + // expected-error@-1 {{too many elements in vector operand (expected 3 elements, have 2)}} +} + +float3 builtin_cross_float3_int3(float3 p1, int3 p2) +{ + return __builtin_hlsl_cross(p1, p2); + // expected-error@-1 {{all arguments to '__builtin_hlsl_cross' must have
the same type}} +} diff --git a/clang/test/SemaHLSL/BuiltIns/half-float-only-errors2.hlsl index bfbd8b28257a3b9f75afd95218022a8c2e2372f8..b876a8e84cb3ac399adb76f128de056eb35179e8 100644 --- a/clang/test/SemaHLSL/BuiltIns/half-float-only-errors2.hlsl +++ b/clang/test/SemaHLSL/BuiltIns/half-float-only-errors2.hlsl @@ -1,13 +1,13 @@ -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_atan2 -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_fmod -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_pow - -double test_double_builtin(double p0, double p1) { - return TEST_FUNC(p0, p1); - // expected-error@-1 {{passing 'double' to parameter of incompatible type 'float'}} -} - -double2 test_vec_double_builtin(double2 p0, double2 p1) { - return TEST_FUNC(p0, p1); - // expected-error@-1 {{passing 'double2' (aka 'vector<double, 2>') to parameter of incompatible type '__attribute__((__vector_size__(2 * sizeof(float)))) float' (vector of 2 'float' values)}} -} +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_atan2 +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_fmod +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_pow + +double test_double_builtin(double p0, double p1) { + return TEST_FUNC(p0, p1); + // expected-error@-1 {{passing 'double' to parameter of incompatible type 'float'}} +} + +double2 test_vec_double_builtin(double2 p0, double2 p1) { + return TEST_FUNC(p0, p1); + // expected-error@-1 {{passing 'double2' (aka 'vector<double, 2>') to parameter of incompatible type '__attribute__((__vector_size__(2 * sizeof(float)))) float' (vector of 2 'float' values)}} +} diff --git a/clang/test/SemaHLSL/BuiltIns/length-errors.hlsl index 281faada6f5e9404a5478c22f826fb8262ff4835..c5e2ac0b502dc431812058782e9f7adeef2aba5e 100644 --- a/clang/test/SemaHLSL/BuiltIns/length-errors.hlsl +++ b/clang/test/SemaHLSL/BuiltIns/length-errors.hlsl @@ -1,32 +1,32 @@ -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -disable-llvm-passes -verify -verify-ignore-unexpected - - -void test_too_few_arg() -{ - return __builtin_hlsl_length(); - // expected-error@-1 {{too few arguments to function call, expected 1, have 0}} -} - -void test_too_many_arg(float2 p0) -{ - return __builtin_hlsl_length(p0, p0); - // expected-error@-1 {{too many arguments to function call, expected 1, have 2}} -} - -bool builtin_bool_to_float_type_promotion(bool p1) -{ - return __builtin_hlsl_length(p1); - // expected-error@-1 {passing 'bool' to parameter of incompatible type 'float'}} -} - -bool builtin_length_int_to_float_promotion(int p1) -{ - return __builtin_hlsl_length(p1); - // expected-error@-1 {{passing 'int' to
parameter of incompatible type 'float'}} -} - -bool2 builtin_length_int2_to_float2_promotion(int2 p1) -{ - return __builtin_hlsl_length(p1); - // expected-error@-1 {{passing 'int2' (aka 'vector<int, 2>') to parameter of incompatible type '__attribute__((__vector_size__(2 * sizeof(float)))) float' (vector of 2 'float' values)}} -} +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -disable-llvm-passes -verify -verify-ignore-unexpected + + +void test_too_few_arg() +{ + return __builtin_hlsl_length(); + // expected-error@-1 {{too few arguments to function call, expected 1, have 0}} +} + +void test_too_many_arg(float2 p0) +{ + return __builtin_hlsl_length(p0, p0); + // expected-error@-1 {{too many arguments to function call, expected 1, have 2}} +} + +bool builtin_bool_to_float_type_promotion(bool p1) +{ + return __builtin_hlsl_length(p1); + // expected-error@-1 {{passing 'bool' to parameter of incompatible type 'float'}} +} + +bool builtin_length_int_to_float_promotion(int p1) +{ + return __builtin_hlsl_length(p1); + // expected-error@-1 {{passing 'int' to parameter of incompatible type 'float'}} +} + +bool2 builtin_length_int2_to_float2_promotion(int2 p1) +{ + return __builtin_hlsl_length(p1); + // expected-error@-1 {{passing 'int2' (aka 'vector<int, 2>') to parameter of incompatible type '__attribute__((__vector_size__(2 * sizeof(float)))) float' (vector of 2 'float' values)}} +} diff --git a/clang/test/SemaHLSL/BuiltIns/normalize-errors.hlsl index fc48c9b2589f7eff55c45d53d5ed8c8f144f517a..3720dca9b88a12246518ca8ca7a7692319e919e0 100644 --- a/clang/test/SemaHLSL/BuiltIns/normalize-errors.hlsl +++ b/clang/test/SemaHLSL/BuiltIns/normalize-errors.hlsl @@ -1,31 +1,31 @@ -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -disable-llvm-passes -verify -verify-ignore-unexpected - -void test_too_few_arg() -{ - return __builtin_hlsl_normalize(); - // expected-error@-1 {{too few arguments to function call, expected 1, have 0}} -} - -void test_too_many_arg(float2 p0) -{ - return __builtin_hlsl_normalize(p0, p0); - // expected-error@-1 {{too many arguments to function call, expected 1, have 2}} -} - -bool builtin_bool_to_float_type_promotion(bool p1) -{ - return __builtin_hlsl_normalize(p1); - // expected-error@-1 {passing 'bool' to parameter of incompatible type 'float'}} -} - -bool builtin_normalize_int_to_float_promotion(int p1) -{ - return __builtin_hlsl_normalize(p1); - // expected-error@-1 {{passing 'int' to parameter of incompatible type 'float'}} -} - -bool2 builtin_normalize_int2_to_float2_promotion(int2 p1) -{ - return __builtin_hlsl_normalize(p1); - // expected-error@-1 {{passing 'int2' (aka 'vector<int, 2>') to parameter of incompatible type '__attribute__((__vector_size__(2 * sizeof(float)))) float' (vector of 2 'float' values)}} -} +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -disable-llvm-passes -verify -verify-ignore-unexpected + +void test_too_few_arg() +{ + return __builtin_hlsl_normalize(); + // expected-error@-1 {{too few arguments to function call, expected 1, have 0}} +} + +void test_too_many_arg(float2 p0) +{ + return __builtin_hlsl_normalize(p0, p0); + // expected-error@-1 {{too many arguments to function call, expected 1, have 2}} +} + +bool builtin_bool_to_float_type_promotion(bool p1) +{ + return __builtin_hlsl_normalize(p1); + // expected-error@-1 {{passing 'bool' to parameter of
incompatible type 'float'}} +} + +bool builtin_normalize_int_to_float_promotion(int p1) +{ + return __builtin_hlsl_normalize(p1); + // expected-error@-1 {{passing 'int' to parameter of incompatible type 'float'}} +} + +bool2 builtin_normalize_int2_to_float2_promotion(int2 p1) +{ + return __builtin_hlsl_normalize(p1); + // expected-error@-1 {{passing 'int2' (aka 'vector<int, 2>') to parameter of incompatible type '__attribute__((__vector_size__(2 * sizeof(float)))) float' (vector of 2 'float' values)}} +} diff --git a/clang/test/SemaHLSL/BuiltIns/step-errors.hlsl index 823585201ca62d6d32b57da2ec110983a507245a..a76c5ff5dbd2bab21f6b09c4b0465c8ae4b01d7a 100644 --- a/clang/test/SemaHLSL/BuiltIns/step-errors.hlsl +++ b/clang/test/SemaHLSL/BuiltIns/step-errors.hlsl @@ -1,31 +1,31 @@ -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -disable-llvm-passes -verify -verify-ignore-unexpected - -void test_too_few_arg() -{ - return __builtin_hlsl_step(); - // expected-error@-1 {{too few arguments to function call, expected 2, have 0}} -} - -void test_too_many_arg(float2 p0) -{ - return __builtin_hlsl_step(p0, p0, p0); - // expected-error@-1 {{too many arguments to function call, expected 2, have 3}} -} - -bool builtin_bool_to_float_type_promotion(bool p1) -{ - return __builtin_hlsl_step(p1, p1); - // expected-error@-1 {passing 'bool' to parameter of incompatible type 'float'}} -} - -bool builtin_step_int_to_float_promotion(int p1) -{ - return __builtin_hlsl_step(p1, p1); - // expected-error@-1 {{passing 'int' to parameter of incompatible type 'float'}} -} - -bool2 builtin_step_int2_to_float2_promotion(int2 p1) -{ - return __builtin_hlsl_step(p1, p1); - // expected-error@-1 {{passing 'int2' (aka 'vector<int, 2>') to parameter of incompatible type '__attribute__((__vector_size__(2 * sizeof(float)))) float' (vector of 2 'float' values)}} -} +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -disable-llvm-passes -verify -verify-ignore-unexpected + +void test_too_few_arg() +{ + return __builtin_hlsl_step(); + // expected-error@-1 {{too few arguments to function call, expected 2, have 0}} +} + +void test_too_many_arg(float2 p0) +{ + return __builtin_hlsl_step(p0, p0, p0); + // expected-error@-1 {{too many arguments to function call, expected 2, have 3}} +} + +bool builtin_bool_to_float_type_promotion(bool p1) +{ + return __builtin_hlsl_step(p1, p1); + // expected-error@-1 {{passing 'bool' to parameter of incompatible type 'float'}} +} + +bool builtin_step_int_to_float_promotion(int p1) +{ + return __builtin_hlsl_step(p1, p1); + // expected-error@-1 {{passing 'int' to parameter of incompatible type 'float'}} +} + +bool2 builtin_step_int2_to_float2_promotion(int2 p1) +{ + return __builtin_hlsl_step(p1, p1); + // expected-error@-1 {{passing 'int2' (aka 'vector<int, 2>') to parameter of incompatible type '__attribute__((__vector_size__(2 * sizeof(float)))) float' (vector of 2 'float' values)}} +} diff --git a/clang/test/SemaHLSL/Types/Traits/IsIntangibleType.hlsl index 8c0f8d6f271dbdde25dba4bac96cfb801c985d89..1223a131af35c4f37a8222885ceb69b678d0b049 100644 --- a/clang/test/SemaHLSL/Types/Traits/IsIntangibleType.hlsl +++ b/clang/test/SemaHLSL/Types/Traits/IsIntangibleType.hlsl @@ -1,81 +1,81 @@ -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.6-library -finclude-default-header -verify %s -// RUN: %clang_cc1 -triple
dxil-pc-shadermodel6.6-library -finclude-default-header -fnative-half-type -verify %s -// expected-no-diagnostics - -_Static_assert(__builtin_hlsl_is_intangible(__hlsl_resource_t), ""); -// no need to check array of __hlsl_resource_t, arrays of sizeless types are not supported - -_Static_assert(!__builtin_hlsl_is_intangible(int), ""); -_Static_assert(!__builtin_hlsl_is_intangible(float3), ""); -_Static_assert(!__builtin_hlsl_is_intangible(half[4]), ""); - -typedef __hlsl_resource_t Res; -_Static_assert(__builtin_hlsl_is_intangible(const Res), ""); -// no need to check array of Res, arrays of sizeless types are not supported - -struct ABuffer { - const int i[10]; - __hlsl_resource_t h; -}; -_Static_assert(__builtin_hlsl_is_intangible(ABuffer), ""); -_Static_assert(__builtin_hlsl_is_intangible(ABuffer[10]), ""); - -struct MyStruct { - half2 h2; - int3 i3; -}; -_Static_assert(!__builtin_hlsl_is_intangible(MyStruct), ""); -_Static_assert(!__builtin_hlsl_is_intangible(MyStruct[10]), ""); - -class MyClass { - int3 ivec; - float farray[12]; - MyStruct ms; - ABuffer buf; -}; -_Static_assert(__builtin_hlsl_is_intangible(MyClass), ""); -_Static_assert(__builtin_hlsl_is_intangible(MyClass[2]), ""); - -union U { - double d[4]; - Res buf; -}; -_Static_assert(__builtin_hlsl_is_intangible(U), ""); -_Static_assert(__builtin_hlsl_is_intangible(U[100]), ""); - -class MyClass2 { - int3 ivec; - float farray[12]; - U u; -}; -_Static_assert(__builtin_hlsl_is_intangible(MyClass2), ""); -_Static_assert(__builtin_hlsl_is_intangible(MyClass2[5]), ""); - -class Simple { - int a; -}; - -template <typename T> struct TemplatedBuffer { - T a; - __hlsl_resource_t h; -}; -_Static_assert(__builtin_hlsl_is_intangible(TemplatedBuffer<float>), ""); - -struct MyStruct2 : TemplatedBuffer<float> { - float x; -}; -_Static_assert(__builtin_hlsl_is_intangible(MyStruct2), ""); - -struct MyStruct3 { - const TemplatedBuffer<float> TB[10]; -}; -_Static_assert(__builtin_hlsl_is_intangible(MyStruct3), ""); - -template <typename T> struct SimpleTemplate { - T a; -}; -_Static_assert(__builtin_hlsl_is_intangible(SimpleTemplate<__hlsl_resource_t>), ""); -_Static_assert(!__builtin_hlsl_is_intangible(SimpleTemplate<float>), ""); - -_Static_assert(__builtin_hlsl_is_intangible(RWBuffer<float>), ""); -_Static_assert(__builtin_hlsl_is_intangible(StructuredBuffer<Simple>), ""); +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.6-library -finclude-default-header -verify %s +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.6-library -finclude-default-header -fnative-half-type -verify %s +// expected-no-diagnostics + +_Static_assert(__builtin_hlsl_is_intangible(__hlsl_resource_t), ""); +// no need to check array of __hlsl_resource_t, arrays of sizeless types are not supported + +_Static_assert(!__builtin_hlsl_is_intangible(int), ""); +_Static_assert(!__builtin_hlsl_is_intangible(float3), ""); +_Static_assert(!__builtin_hlsl_is_intangible(half[4]), ""); + +typedef __hlsl_resource_t Res; +_Static_assert(__builtin_hlsl_is_intangible(const Res), ""); +// no need to check array of Res, arrays of sizeless types are not supported + +struct ABuffer { + const int i[10]; + __hlsl_resource_t h; +}; +_Static_assert(__builtin_hlsl_is_intangible(ABuffer), ""); +_Static_assert(__builtin_hlsl_is_intangible(ABuffer[10]), ""); + +struct MyStruct { + half2 h2; + int3 i3; +}; +_Static_assert(!__builtin_hlsl_is_intangible(MyStruct), ""); +_Static_assert(!__builtin_hlsl_is_intangible(MyStruct[10]), ""); + +class MyClass { + int3 ivec; + float farray[12]; + MyStruct ms; + ABuffer buf; +}; +_Static_assert(__builtin_hlsl_is_intangible(MyClass),
""); +_Static_assert(__builtin_hlsl_is_intangible(MyClass[2]), ""); + +union U { + double d[4]; + Res buf; +}; +_Static_assert(__builtin_hlsl_is_intangible(U), ""); +_Static_assert(__builtin_hlsl_is_intangible(U[100]), ""); + +class MyClass2 { + int3 ivec; + float farray[12]; + U u; +}; +_Static_assert(__builtin_hlsl_is_intangible(MyClass2), ""); +_Static_assert(__builtin_hlsl_is_intangible(MyClass2[5]), ""); + +class Simple { + int a; +}; + +template struct TemplatedBuffer { + T a; + __hlsl_resource_t h; +}; +_Static_assert(__builtin_hlsl_is_intangible(TemplatedBuffer), ""); + +struct MyStruct2 : TemplatedBuffer { + float x; +}; +_Static_assert(__builtin_hlsl_is_intangible(MyStruct2), ""); + +struct MyStruct3 { + const TemplatedBuffer TB[10]; +}; +_Static_assert(__builtin_hlsl_is_intangible(MyStruct3), ""); + +template struct SimpleTemplate { + T a; +}; +_Static_assert(__builtin_hlsl_is_intangible(SimpleTemplate<__hlsl_resource_t>), ""); +_Static_assert(!__builtin_hlsl_is_intangible(SimpleTemplate), ""); + +_Static_assert(__builtin_hlsl_is_intangible(RWBuffer), ""); +_Static_assert(__builtin_hlsl_is_intangible(StructuredBuffer), ""); diff --git a/clang/test/SemaHLSL/Types/Traits/IsIntangibleTypeErrors.hlsl b/clang/test/SemaHLSL/Types/Traits/IsIntangibleTypeErrors.hlsl index de9ac90b895fc63d955b4fa3e0b50fc702063454..33614e87640dade4ecfb09893a7ad014b4735559 100644 --- a/clang/test/SemaHLSL/Types/Traits/IsIntangibleTypeErrors.hlsl +++ b/clang/test/SemaHLSL/Types/Traits/IsIntangibleTypeErrors.hlsl @@ -1,12 +1,12 @@ -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.6-library -finclude-default-header -verify %s - -struct Undefined; // expected-note {{forward declaration of 'Undefined'}} -_Static_assert(!__builtin_hlsl_is_intangible(Undefined), ""); // expected-error{{incomplete type 'Undefined' used in type trait expression}} - -void fn(int X) { // expected-note {{declared here}} - // expected-error@#vla {{variable length arrays are not supported for the current target}} - // expected-error@#vla {{variable length arrays are not supported in '__builtin_hlsl_is_intangible'}} - // expected-warning@#vla {{variable length arrays in C++ are a Clang extension}} - // expected-note@#vla {{function parameter 'X' with unknown value cannot be used in a constant expression}} - _Static_assert(!__builtin_hlsl_is_intangible(int[X]), ""); // #vla -} +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.6-library -finclude-default-header -verify %s + +struct Undefined; // expected-note {{forward declaration of 'Undefined'}} +_Static_assert(!__builtin_hlsl_is_intangible(Undefined), ""); // expected-error{{incomplete type 'Undefined' used in type trait expression}} + +void fn(int X) { // expected-note {{declared here}} + // expected-error@#vla {{variable length arrays are not supported for the current target}} + // expected-error@#vla {{variable length arrays are not supported in '__builtin_hlsl_is_intangible'}} + // expected-warning@#vla {{variable length arrays in C++ are a Clang extension}} + // expected-note@#vla {{function parameter 'X' with unknown value cannot be used in a constant expression}} + _Static_assert(!__builtin_hlsl_is_intangible(int[X]), ""); // #vla +} diff --git a/clang/test/SemaHLSL/resource_binding_attr_error_basic.hlsl b/clang/test/SemaHLSL/resource_binding_attr_error_basic.hlsl index 760c057630a7fa06da8478d5dd3c6d9358abdf62..4e50f70952ad13db193dda854a3c4921895a6272 100644 --- a/clang/test/SemaHLSL/resource_binding_attr_error_basic.hlsl +++ 
b/clang/test/SemaHLSL/resource_binding_attr_error_basic.hlsl @@ -1,42 +1,42 @@ -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -x hlsl -o - -fsyntax-only %s -verify - -// expected-error@+1{{binding type 't' only applies to SRV resources}} -float f1 : register(t0); - -// expected-error@+1 {{binding type 'u' only applies to UAV resources}} -float f2 : register(u0); - -// expected-error@+1{{binding type 'b' only applies to constant buffers. The 'bool constant' binding type is no longer supported}} -float f3 : register(b9); - -// expected-error@+1 {{binding type 's' only applies to sampler state}} -float f4 : register(s0); - -// expected-error@+1{{binding type 'i' ignored. The 'integer constant' binding type is no longer supported}} -float f5 : register(i9); - -// expected-error@+1{{binding type 'x' is invalid}} -float f6 : register(x9); - -cbuffer g_cbuffer1 { -// expected-error@+1{{binding type 'c' ignored in buffer declaration. Did you mean 'packoffset'?}} - float f7 : register(c2); -}; - -tbuffer g_tbuffer1 { -// expected-error@+1{{binding type 'c' ignored in buffer declaration. Did you mean 'packoffset'?}} - float f8 : register(c2); -}; - -cbuffer g_cbuffer2 { -// expected-error@+1{{binding type 'b' only applies to constant buffer resources}} - float f9 : register(b2); -}; - -tbuffer g_tbuffer2 { -// expected-error@+1{{binding type 'i' ignored. The 'integer constant' binding type is no longer supported}} - float f10 : register(i2); -}; - -// expected-error@+1{{binding type 'c' only applies to numeric variables in the global scope}} -RWBuffer<int> f11 : register(c3); +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -x hlsl -o - -fsyntax-only %s -verify + +// expected-error@+1{{binding type 't' only applies to SRV resources}} +float f1 : register(t0); + +// expected-error@+1 {{binding type 'u' only applies to UAV resources}} +float f2 : register(u0); + +// expected-error@+1{{binding type 'b' only applies to constant buffers. The 'bool constant' binding type is no longer supported}} +float f3 : register(b9); + +// expected-error@+1 {{binding type 's' only applies to sampler state}} +float f4 : register(s0); + +// expected-error@+1{{binding type 'i' ignored. The 'integer constant' binding type is no longer supported}} +float f5 : register(i9); + +// expected-error@+1{{binding type 'x' is invalid}} +float f6 : register(x9); + +cbuffer g_cbuffer1 { +// expected-error@+1{{binding type 'c' ignored in buffer declaration. Did you mean 'packoffset'?}} + float f7 : register(c2); +}; + +tbuffer g_tbuffer1 { +// expected-error@+1{{binding type 'c' ignored in buffer declaration. Did you mean 'packoffset'?}} + float f8 : register(c2); +}; + +cbuffer g_cbuffer2 { +// expected-error@+1{{binding type 'b' only applies to constant buffer resources}} + float f9 : register(b2); +}; + +tbuffer g_tbuffer2 { +// expected-error@+1{{binding type 'i' ignored.
The 'integer constant' binding type is no longer supported}} + float f10 : register(i2); +}; + +// expected-error@+1{{binding type 'c' only applies to numeric variables in the global scope}} +RWBuffer<int> f11 : register(c3); diff --git a/clang/test/SemaHLSL/resource_binding_attr_error_other.hlsl index 4c9e9a6b44c92814d164a19c1e32c59ba5f9c13d..503c8469666f3b83be9ef08eb950b03262d52a30 100644 --- a/clang/test/SemaHLSL/resource_binding_attr_error_other.hlsl +++ b/clang/test/SemaHLSL/resource_binding_attr_error_other.hlsl @@ -1,9 +1,9 @@ -// RUN: not %clang_cc1 -triple dxil-pc-shadermodel6.3-library -x hlsl -o - -fsyntax-only %s | FileCheck %s - -// XFAIL: * -// This expectedly fails because RayQuery is an unsupported type. -// When it becomes supported, we should expect an error due to -// the variable type being classified as "other", and according -// to the spec, err_hlsl_unsupported_register_type_and_variable_type -// should be emitted. -RayQuery<0> r1: register(t0); +// RUN: not %clang_cc1 -triple dxil-pc-shadermodel6.3-library -x hlsl -o - -fsyntax-only %s | FileCheck %s + +// XFAIL: * +// This expectedly fails because RayQuery is an unsupported type. +// When it becomes supported, we should expect an error due to +// the variable type being classified as "other", and according +// to the spec, err_hlsl_unsupported_register_type_and_variable_type +// should be emitted. +RayQuery<0> r1: register(t0); diff --git a/clang/test/SemaHLSL/resource_binding_attr_error_resource.hlsl index 4b6af47c0ab7254f1608916330abdede349a6552..ea43e27b5b5ac175b0a06033069514fec83639f5 100644 --- a/clang/test/SemaHLSL/resource_binding_attr_error_resource.hlsl +++ b/clang/test/SemaHLSL/resource_binding_attr_error_resource.hlsl @@ -1,49 +1,49 @@ -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -x hlsl -o - -fsyntax-only %s -verify - -// This test validates the diagnostics that are emitted when a variable with a "resource" type -// is bound to a register using the register annotation - - -template <typename T> -struct MyTemplatedSRV { - __hlsl_resource_t [[hlsl::resource_class(SRV)]] x; -}; - -struct MySRV { - __hlsl_resource_t [[hlsl::resource_class(SRV)]] x; -}; - -struct MySampler { - __hlsl_resource_t [[hlsl::resource_class(Sampler)]] x; -}; - -struct MyUAV { - __hlsl_resource_t [[hlsl::resource_class(UAV)]] x; -}; - -struct MyCBuffer { - __hlsl_resource_t [[hlsl::resource_class(CBuffer)]] x; -}; - - -// expected-error@+1 {{binding type 'i' ignored.
The 'integer constant' binding type is no longer supported}} -MySRV invalid : register(i2); - -// expected-error@+1 {{binding type 't' only applies to SRV resources}} -MyUAV a : register(t2, space1); - -// expected-error@+1 {{binding type 'u' only applies to UAV resources}} -MySampler b : register(u2, space1); - -// expected-error@+1 {{binding type 'b' only applies to constant buffer resources}} -MyTemplatedSRV<float> c : register(b2); - -// expected-error@+1 {{binding type 's' only applies to sampler state}} -MyUAV d : register(s2, space1); - -// empty binding prefix cases: -// expected-error@+1 {{expected identifier}} -MyTemplatedSRV<float> e: register(); - -// expected-error@+1 {{expected identifier}} -MyTemplatedSRV<float> f: register(""); +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -x hlsl -o - -fsyntax-only %s -verify + +// This test validates the diagnostics that are emitted when a variable with a "resource" type +// is bound to a register using the register annotation + + +template <typename T> +struct MyTemplatedSRV { + __hlsl_resource_t [[hlsl::resource_class(SRV)]] x; +}; + +struct MySRV { + __hlsl_resource_t [[hlsl::resource_class(SRV)]] x; +}; + +struct MySampler { + __hlsl_resource_t [[hlsl::resource_class(Sampler)]] x; +}; + +struct MyUAV { + __hlsl_resource_t [[hlsl::resource_class(UAV)]] x; +}; + +struct MyCBuffer { + __hlsl_resource_t [[hlsl::resource_class(CBuffer)]] x; +}; + + +// expected-error@+1 {{binding type 'i' ignored. The 'integer constant' binding type is no longer supported}} +MySRV invalid : register(i2); + +// expected-error@+1 {{binding type 't' only applies to SRV resources}} +MyUAV a : register(t2, space1); + +// expected-error@+1 {{binding type 'u' only applies to UAV resources}} +MySampler b : register(u2, space1); + +// expected-error@+1 {{binding type 'b' only applies to constant buffer resources}} +MyTemplatedSRV<float> c : register(b2); + +// expected-error@+1 {{binding type 's' only applies to sampler state}} +MyUAV d : register(s2, space1); + +// empty binding prefix cases: +// expected-error@+1 {{expected identifier}} +MyTemplatedSRV<float> e: register(); + +// expected-error@+1 {{expected identifier}} +MyTemplatedSRV<float> f: register(""); diff --git a/clang/test/SemaHLSL/resource_binding_attr_error_silence_diags.hlsl index e63f264452da7983687c01ab6d635fab14a783f3..7f248e30c0709687a1a6b65bf5bdd9f3257592b1 100644 --- a/clang/test/SemaHLSL/resource_binding_attr_error_silence_diags.hlsl +++ b/clang/test/SemaHLSL/resource_binding_attr_error_silence_diags.hlsl @@ -1,27 +1,27 @@ -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -x hlsl -o - -fsyntax-only -Wno-legacy-constant-register-binding %s -verify - -// expected-no-diagnostics -float f2 : register(b9); - -float f3 : register(i9); - -cbuffer g_cbuffer1 { - float f4 : register(c2); -}; - - -struct Eg12{ - RWBuffer<int> a; -}; - -Eg12 e12 : register(c9); - -Eg12 bar : register(i1); - -struct Eg7 { - struct Bar { - float f; - }; - Bar b; -}; -Eg7 e7 : register(t0); +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -x hlsl -o - -fsyntax-only -Wno-legacy-constant-register-binding %s -verify + +// expected-no-diagnostics +float f2 : register(b9); + +float f3 : register(i9); + +cbuffer g_cbuffer1 { + float f4 : register(c2); +}; + + +struct Eg12{ + RWBuffer<int> a; +}; + +Eg12 e12 : register(c9); + +Eg12 bar : register(i1); + +struct Eg7 { + struct Bar { + float f; + }; + Bar b; +}; +Eg7 e7 : register(t0); diff --git
a/clang/test/SemaHLSL/resource_binding_attr_error_space.hlsl b/clang/test/SemaHLSL/resource_binding_attr_error_space.hlsl
index 70e64e6ca75280e4afb933c51c54ba0bce3d5aaf..3001dbb1e3ec96d015b413f197655d7dab187cf5 100644
--- a/clang/test/SemaHLSL/resource_binding_attr_error_space.hlsl
+++ b/clang/test/SemaHLSL/resource_binding_attr_error_space.hlsl
@@ -1,62 +1,62 @@
-// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -x hlsl -o - -fsyntax-only %s -verify
-
-// valid
-cbuffer cbuf {
-    RWBuffer<int> r : register(u0, space0);
-}
-
-cbuffer cbuf2 {
-    struct x {
-      // this test validates that no diagnostic is emitted on the space parameter, because
-      // this register annotation is not in the global scope.
-      // expected-error@+1 {{'register' attribute only applies to cbuffer/tbuffer and external global variables}}
-      RWBuffer<int> E : register(u2, space3);
-    };
-}
-
-struct MyStruct {
-    RWBuffer<int> E;
-};
-
-cbuffer cbuf3 {
-  // valid
-  MyStruct E : register(u2, space3);
-}
-
-// valid
-MyStruct F : register(u3, space4);
-
-cbuffer cbuf4 {
-  // this test validates that no diagnostic is emitted on the space parameter, because
-  // this register annotation is not in the global scope.
-  // expected-error@+1 {{binding type 'u' only applies to UAV resources}}
-  float a : register(u2, space3);
-}
-
-// expected-error@+1 {{invalid space specifier 's2' used; expected 'space' followed by an integer, like space1}}
-cbuffer a : register(b0, s2) {
-
-}
-
-// expected-error@+1 {{invalid space specifier 'spaces' used; expected 'space' followed by an integer, like space1}}
-cbuffer b : register(b2, spaces) {
-
-}
-
-// expected-error@+1 {{wrong argument format for hlsl attribute, use space3 instead}}
-cbuffer c : register(b2, space 3) {}
-
-// expected-error@+1 {{register space cannot be specified on global constants}}
-int d : register(c2, space3);
-
-// expected-error@+1 {{register space cannot be specified on global constants}}
-int e : register(c2, space0);
-
-// expected-error@+1 {{register space cannot be specified on global constants}}
-int f : register(c2, space00);
-
-// valid
-RWBuffer<int> g : register(u2, space0);
-
-// valid
-RWBuffer<int> h : register(u2, space0);
+// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -x hlsl -o - -fsyntax-only %s -verify
+
+// valid
+cbuffer cbuf {
+    RWBuffer<int> r : register(u0, space0);
+}
+
+cbuffer cbuf2 {
+    struct x {
+      // this test validates that no diagnostic is emitted on the space parameter, because
+      // this register annotation is not in the global scope.
+      // expected-error@+1 {{'register' attribute only applies to cbuffer/tbuffer and external global variables}}
+      RWBuffer<int> E : register(u2, space3);
+    };
+}
+
+struct MyStruct {
+    RWBuffer<int> E;
+};
+
+cbuffer cbuf3 {
+  // valid
+  MyStruct E : register(u2, space3);
+}
+
+// valid
+MyStruct F : register(u3, space4);
+
+cbuffer cbuf4 {
+  // this test validates that no diagnostic is emitted on the space parameter, because
+  // this register annotation is not in the global scope.
+  // expected-error@+1 {{binding type 'u' only applies to UAV resources}}
+  float a : register(u2, space3);
+}
+
+// expected-error@+1 {{invalid space specifier 's2' used; expected 'space' followed by an integer, like space1}}
+cbuffer a : register(b0, s2) {
+
+}
+
+// expected-error@+1 {{invalid space specifier 'spaces' used; expected 'space' followed by an integer, like space1}}
+cbuffer b : register(b2, spaces) {
+
+}
+
+// expected-error@+1 {{wrong argument format for hlsl attribute, use space3 instead}}
+cbuffer c : register(b2, space 3) {}
+
+// expected-error@+1 {{register space cannot be specified on global constants}}
+int d : register(c2, space3);
+
+// expected-error@+1 {{register space cannot be specified on global constants}}
+int e : register(c2, space0);
+
+// expected-error@+1 {{register space cannot be specified on global constants}}
+int f : register(c2, space00);
+
+// valid
+RWBuffer<int> g : register(u2, space0);
+
+// valid
+RWBuffer<int> h : register(u2, space0);
diff --git a/clang/test/SemaHLSL/resource_binding_attr_error_udt.hlsl b/clang/test/SemaHLSL/resource_binding_attr_error_udt.hlsl
index ea2d576e4cca555b3d2876b6f18487d67fc127c2..235004102a539ba3fc0b6c0eacea346072dde561 100644
--- a/clang/test/SemaHLSL/resource_binding_attr_error_udt.hlsl
+++ b/clang/test/SemaHLSL/resource_binding_attr_error_udt.hlsl
@@ -1,135 +1,135 @@
-// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -x hlsl -o - -fsyntax-only %s -verify
-
-template<typename T>
-struct MyTemplatedUAV {
-  __hlsl_resource_t [[hlsl::resource_class(UAV)]] x;
-};
-
-struct MySRV {
-  __hlsl_resource_t [[hlsl::resource_class(SRV)]] x;
-};
-
-struct MySampler {
-  __hlsl_resource_t [[hlsl::resource_class(Sampler)]] x;
-};
-
-struct MyUAV {
-  __hlsl_resource_t [[hlsl::resource_class(UAV)]] x;
-};
-
-struct MyCBuffer {
-  __hlsl_resource_t [[hlsl::resource_class(CBuffer)]] x;
-};
-
-// Valid: f is skipped, SRVBuf is bound to t0, UAVBuf is bound to u0
-struct Eg1 {
-  float f;
-  MySRV SRVBuf;
-  MyUAV UAVBuf;
-  };
-Eg1 e1 : register(t0) : register(u0);
-
-// Valid: f is skipped, SRVBuf is bound to t0, UAVBuf is bound to u0.
-// UAVBuf2 gets automatically assigned to u1 even though there is no explicit binding for u1.
-struct Eg2 {
-  float f;
-  MySRV SRVBuf;
-  MyUAV UAVBuf;
-  MyUAV UAVBuf2;
-  };
-Eg2 e2 : register(t0) : register(u0);
-
-// Valid: Bar, the struct within Eg3, has a valid resource that can be bound to t0.
-struct Eg3 {
-  struct Bar {
-    MyUAV a;
-  };
-  Bar b;
-};
-Eg3 e3 : register(u0);
-
-// Valid: the first sampler state object within 's' is bound to slot 5
-struct Eg4 {
-  MySampler s[3];
-};
-
-Eg4 e4 : register(s5);
-
-
-struct Eg5 {
-  float f;
-};
-// expected-warning@+1{{binding type 't' only applies to types containing SRV resources}}
-Eg5 e5 : register(t0);
-
-struct Eg6 {
-  float f;
-};
-// expected-warning@+1{{binding type 'u' only applies to types containing UAV resources}}
-Eg6 e6 : register(u0);
-
-struct Eg7 {
-  float f;
-};
-// expected-warning@+1{{binding type 'b' only applies to types containing constant buffer resources}}
-Eg7 e7 : register(b0);
-
-struct Eg8 {
-  float f;
-};
-// expected-warning@+1{{binding type 's' only applies to types containing sampler state}}
-Eg8 e8 : register(s0);
-
-struct Eg9 {
-  MySRV s;
-};
-// expected-warning@+1{{binding type 'c' only applies to types containing numeric types}}
-Eg9 e9 : register(c0);
-
-struct Eg10{
-  // expected-error@+1{{'register' attribute only applies to cbuffer/tbuffer and external global variables}}
-  MyTemplatedUAV<int> a : register(u9);
-};
-Eg10 e10;
-
-
-template<typename R>
-struct Eg11 {
-  R b;
-};
-// expected-warning@+1{{binding type 'u' only applies to types containing UAV resources}}
-Eg11<MySRV> e11 : register(u0);
-// invalid because after template expansion, there are no valid resources inside Eg11 to bind as a UAV, only an SRV
-
-
-struct Eg12{
-  MySRV s1;
-  MySRV s2;
-};
-// expected-warning@+3{{binding type 'u' only applies to types containing UAV resources}}
-// expected-warning@+2{{binding type 'u' only applies to types containing UAV resources}}
-// expected-error@+1{{binding type 'u' cannot be applied more than once}}
-Eg12 e12 : register(u9) : register(u10);
-
-struct Eg13{
-  MySRV s1;
-  MySRV s2;
-};
-// expected-warning@+4{{binding type 'u' only applies to types containing UAV resources}}
-// expected-warning@+3{{binding type 'u' only applies to types containing UAV resources}}
-// expected-warning@+2{{binding type 'u' only applies to types containing UAV resources}}
-// expected-error@+1{{binding type 'u' cannot be applied more than once}}
-Eg13 e13 : register(u9) : register(u10) : register(u11);
-
-struct Eg14{
-  MyTemplatedUAV<int> r1;
-};
-// expected-warning@+1{{binding type 't' only applies to types containing SRV resources}}
-Eg14 e14 : register(t9);
-
-struct Eg15 {
-  float f[4];
-};
-// expected no error
-Eg15 e15 : register(c0);
-
+// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -x hlsl -o - -fsyntax-only %s -verify
+
+template<typename T>
+struct MyTemplatedUAV {
+  __hlsl_resource_t [[hlsl::resource_class(UAV)]] x;
+};
+
+struct MySRV {
+  __hlsl_resource_t [[hlsl::resource_class(SRV)]] x;
+};
+
+struct MySampler {
+  __hlsl_resource_t [[hlsl::resource_class(Sampler)]] x;
+};
+
+struct MyUAV {
+  __hlsl_resource_t [[hlsl::resource_class(UAV)]] x;
+};
+
+struct MyCBuffer {
+  __hlsl_resource_t [[hlsl::resource_class(CBuffer)]] x;
+};
+
+// Valid: f is skipped, SRVBuf is bound to t0, UAVBuf is bound to u0
+struct Eg1 {
+  float f;
+  MySRV SRVBuf;
+  MyUAV UAVBuf;
+  };
+Eg1 e1 : register(t0) : register(u0);
+
+// Valid: f is skipped, SRVBuf is bound to t0, UAVBuf is bound to u0.
+// UAVBuf2 gets automatically assigned to u1 even though there is no explicit binding for u1.
+struct Eg2 {
+  float f;
+  MySRV SRVBuf;
+  MyUAV UAVBuf;
+  MyUAV UAVBuf2;
+  };
+Eg2 e2 : register(t0) : register(u0);
+
+// Valid: Bar, the struct within Eg3, has a valid resource that can be bound to t0.
+struct Eg3 {
+  struct Bar {
+    MyUAV a;
+  };
+  Bar b;
+};
+Eg3 e3 : register(u0);
+
+// Valid: the first sampler state object within 's' is bound to slot 5
+struct Eg4 {
+  MySampler s[3];
+};
+
+Eg4 e4 : register(s5);
+
+
+struct Eg5 {
+  float f;
+};
+// expected-warning@+1{{binding type 't' only applies to types containing SRV resources}}
+Eg5 e5 : register(t0);
+
+struct Eg6 {
+  float f;
+};
+// expected-warning@+1{{binding type 'u' only applies to types containing UAV resources}}
+Eg6 e6 : register(u0);
+
+struct Eg7 {
+  float f;
+};
+// expected-warning@+1{{binding type 'b' only applies to types containing constant buffer resources}}
+Eg7 e7 : register(b0);
+
+struct Eg8 {
+  float f;
+};
+// expected-warning@+1{{binding type 's' only applies to types containing sampler state}}
+Eg8 e8 : register(s0);
+
+struct Eg9 {
+  MySRV s;
+};
+// expected-warning@+1{{binding type 'c' only applies to types containing numeric types}}
+Eg9 e9 : register(c0);
+
+struct Eg10{
+  // expected-error@+1{{'register' attribute only applies to cbuffer/tbuffer and external global variables}}
+  MyTemplatedUAV<int> a : register(u9);
+};
+Eg10 e10;
+
+
+template<typename R>
+struct Eg11 {
+  R b;
+};
+// expected-warning@+1{{binding type 'u' only applies to types containing UAV resources}}
+Eg11<MySRV> e11 : register(u0);
+// invalid because after template expansion, there are no valid resources inside Eg11 to bind as a UAV, only an SRV
+
+
+struct Eg12{
+  MySRV s1;
+  MySRV s2;
+};
+// expected-warning@+2{{binding type 'u' only applies to types containing UAV resources}}
+// expected-error@+1{{binding type 'u' cannot be applied more than once}}
+Eg12 e12 : register(u9) : register(u10);
+
+struct Eg13{
+  MySRV s1;
+  MySRV s2;
+};
+// expected-warning@+3{{binding type 'u' only applies to types containing UAV resources}}
+// expected-error@+2{{binding type 'u' cannot be applied more than once}}
+// expected-error@+1{{binding type 'u' cannot be applied more than once}}
+Eg13 e13 : register(u9) : register(u10) : register(u11);
+
+// expected-error@+1{{binding type 't' cannot be applied more than once}}
+Eg13 e13_2 : register(t11) : register(t12);
+
+struct Eg14{
+  MyTemplatedUAV<int> r1;
+};
+// expected-warning@+1{{binding type 't' only applies to types containing SRV resources}}
+Eg14 e14 : register(t9);
+
+struct Eg15 {
+  float f[4];
+};
+// expected no error
+Eg15 e15 : register(c0);
diff --git a/clang/test/SemaObjC/aarch64-sve-types.m b/clang/test/SemaObjC/aarch64-sve-types.m
index b50f43cee76f6cfeced097c2020530e0cd7ecc96..a45e02217667fc55fae61fa6bb42ff8a094298ee 100644
--- a/clang/test/SemaObjC/aarch64-sve-types.m
+++ b/clang/test/SemaObjC/aarch64-sve-types.m
@@ -20,5 +20,7 @@
 @property(nullable) __SVBfloat16_t bf16; // expected-error {{cannot be applied to non-pointer type}}
 
+@property(nullable) __SVMfloat8_t mf8; // expected-error {{cannot be applied to non-pointer type}}
+
 @property(nullable) __SVBool_t b8; // expected-error {{cannot be applied to non-pointer type}}
 @end
diff --git a/clang/tools/scan-build/bin/scan-build.bat b/clang/tools/scan-build/bin/scan-build.bat
index 77be6746318f117ff8ebf205663b15f004a1655e..f765f205b8ec504ea596f22ac4699921bd8afa86 100644
--- a/clang/tools/scan-build/bin/scan-build.bat
+++ b/clang/tools/scan-build/bin/scan-build.bat
@@ -1 +1 @@
-perl -S scan-build %*
+perl -S scan-build %*
diff --git a/clang/tools/scan-build/libexec/c++-analyzer.bat b/clang/tools/scan-build/libexec/c++-analyzer.bat
index 69f048a91671f0f27244f1ddd984ca1bef1e087d..83c7172456a51a45f25f23733f81483ea19498e3 100644
--- 
a/clang/tools/scan-build/libexec/c++-analyzer.bat
+++ b/clang/tools/scan-build/libexec/c++-analyzer.bat
@@ -1 +1 @@
-perl -S c++-analyzer %*
+perl -S c++-analyzer %*
diff --git a/clang/tools/scan-build/libexec/ccc-analyzer.bat b/clang/tools/scan-build/libexec/ccc-analyzer.bat
index 2a85376eb82b168948757d1344d15a83b565c653..fdd36f3bdd0437f7be2a948f12385b42accc2c38 100644
--- a/clang/tools/scan-build/libexec/ccc-analyzer.bat
+++ b/clang/tools/scan-build/libexec/ccc-analyzer.bat
@@ -1 +1 @@
-perl -S ccc-analyzer %*
+perl -S ccc-analyzer %*
diff --git a/clang/unittests/Analysis/FlowSensitive/CMakeLists.txt b/clang/unittests/Analysis/FlowSensitive/CMakeLists.txt
index 12fee5dc2789ceb5e38a4d9722767ab808a95ed8..4e1819bfa166a8690279373c17e709e9c76d52c6 100644
--- a/clang/unittests/Analysis/FlowSensitive/CMakeLists.txt
+++ b/clang/unittests/Analysis/FlowSensitive/CMakeLists.txt
@@ -7,6 +7,7 @@ add_clang_unittest(ClangAnalysisFlowSensitiveTests
   ArenaTest.cpp
   ASTOpsTest.cpp
   CFGMatchSwitchTest.cpp
+  CachedConstAccessorsLatticeTest.cpp
   ChromiumCheckModelTest.cpp
   DataflowAnalysisContextTest.cpp
   DataflowEnvironmentTest.cpp
diff --git a/clang/unittests/Analysis/FlowSensitive/CachedConstAccessorsLatticeTest.cpp b/clang/unittests/Analysis/FlowSensitive/CachedConstAccessorsLatticeTest.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..6488833bd14cf2151e6cb2ca7fd6e029cbb33a61
--- /dev/null
+++ b/clang/unittests/Analysis/FlowSensitive/CachedConstAccessorsLatticeTest.cpp
@@ -0,0 +1,305 @@
+//===- unittests/Analysis/FlowSensitive/CachedConstAccessorsLatticeTest.cpp ==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang/Analysis/FlowSensitive/CachedConstAccessorsLattice.h"
+
+#include <cassert>
+#include <memory>
+
+#include "clang/AST/Decl.h"
+#include "clang/AST/DeclBase.h"
+#include "clang/AST/DeclCXX.h"
+#include "clang/AST/Expr.h"
+#include "clang/AST/Type.h"
+#include "clang/ASTMatchers/ASTMatchFinder.h"
+#include "clang/ASTMatchers/ASTMatchers.h"
+#include "clang/Analysis/FlowSensitive/DataflowAnalysisContext.h"
+#include "clang/Analysis/FlowSensitive/DataflowLattice.h"
+#include "clang/Analysis/FlowSensitive/NoopLattice.h"
+#include "clang/Analysis/FlowSensitive/StorageLocation.h"
+#include "clang/Analysis/FlowSensitive/Value.h"
+#include "clang/Analysis/FlowSensitive/WatchedLiteralsSolver.h"
+#include "clang/Basic/LLVM.h"
+#include "clang/Testing/TestAST.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+
+namespace clang::dataflow {
+namespace {
+
+using ast_matchers::BoundNodes;
+using ast_matchers::callee;
+using ast_matchers::cxxMemberCallExpr;
+using ast_matchers::functionDecl;
+using ast_matchers::hasName;
+using ast_matchers::match;
+using ast_matchers::selectFirst;
+
+using dataflow::DataflowAnalysisContext;
+using dataflow::Environment;
+using dataflow::LatticeJoinEffect;
+using dataflow::RecordStorageLocation;
+using dataflow::Value;
+using dataflow::WatchedLiteralsSolver;
+
+using testing::SizeIs;
+
+NamedDecl *lookup(StringRef Name, const DeclContext &DC) {
+  auto Result = DC.lookup(&DC.getParentASTContext().Idents.get(Name));
+  EXPECT_TRUE(Result.isSingleResult()) << Name;
+  return Result.front();
+}
+
+class CachedConstAccessorsLatticeTest : public ::testing::Test {
+protected:
+  using LatticeT = CachedConstAccessorsLattice<NoopLattice>;
+
+  DataflowAnalysisContext DACtx{std::make_unique<WatchedLiteralsSolver>()};
+  Environment Env{DACtx};
+};
+
+// Basic test AST with two const methods (return a value, and return a ref).
+struct CommonTestInputs {
+  CommonTestInputs()
+      : AST(R"cpp(
+    struct S {
+      int *valProperty() const;
+      int &refProperty() const;
+    };
+    void target() {
+      S s;
+      s.valProperty();
+      S s2;
+      s2.refProperty();
+    }
+  )cpp") {
+    auto *SDecl = cast<CXXRecordDecl>(
+        lookup("S", *AST.context().getTranslationUnitDecl()));
+    SType = AST.context().getRecordType(SDecl);
+    CallVal = selectFirst<CallExpr>(
+        "call",
+        match(cxxMemberCallExpr(callee(functionDecl(hasName("valProperty"))))
+                  .bind("call"),
+              AST.context()));
+    assert(CallVal != nullptr);
+
+    CallRef = selectFirst<CallExpr>(
+        "call",
+        match(cxxMemberCallExpr(callee(functionDecl(hasName("refProperty"))))
+                  .bind("call"),
+              AST.context()));
+    assert(CallRef != nullptr);
+  }
+
+  TestAST AST;
+  QualType SType;
+  const CallExpr *CallVal;
+  const CallExpr *CallRef;
+};
+
+TEST_F(CachedConstAccessorsLatticeTest,
+       SamePrimitiveValBeforeClearOrDiffAfterClear) {
+  CommonTestInputs Inputs;
+  auto *CE = Inputs.CallVal;
+  RecordStorageLocation Loc(Inputs.SType, RecordStorageLocation::FieldToLoc(),
+                            {});
+
+  LatticeT Lattice;
+  Value *Val1 = Lattice.getOrCreateConstMethodReturnValue(Loc, CE, Env);
+  Value *Val2 = Lattice.getOrCreateConstMethodReturnValue(Loc, CE, Env);
+
+  EXPECT_EQ(Val1, Val2);
+
+  Lattice.clearConstMethodReturnValues(Loc);
+  Value *Val3 = Lattice.getOrCreateConstMethodReturnValue(Loc, CE, Env);
+
+  EXPECT_NE(Val3, Val1);
+  EXPECT_NE(Val3, Val2);
+}
+
+TEST_F(CachedConstAccessorsLatticeTest, SameLocBeforeClearOrDiffAfterClear) {
+  CommonTestInputs Inputs;
+  auto *CE = Inputs.CallRef;
+  RecordStorageLocation Loc(Inputs.SType, RecordStorageLocation::FieldToLoc(),
+                            {});
+
+  LatticeT Lattice;
+  auto NopInit = [](StorageLocation &) {};
+  StorageLocation *Loc1 = Lattice.getOrCreateConstMethodReturnStorageLocation(
+      Loc, CE, Env, NopInit);
+  auto NotCalled = [](StorageLocation &) {
+    ASSERT_TRUE(false) << "Not reached";
+  };
+  StorageLocation *Loc2 = Lattice.getOrCreateConstMethodReturnStorageLocation(
+      Loc, CE, Env, NotCalled);
+
+  EXPECT_EQ(Loc1, Loc2);
+
+  Lattice.clearConstMethodReturnStorageLocations(Loc);
+  StorageLocation *Loc3 = Lattice.getOrCreateConstMethodReturnStorageLocation(
+      Loc, CE, Env, NopInit);
+
+  EXPECT_NE(Loc3, Loc1);
+  EXPECT_NE(Loc3, Loc2);
+}
+
+TEST_F(CachedConstAccessorsLatticeTest,
+       SameStructValBeforeClearOrDiffAfterClear) {
+  TestAST AST(R"cpp(
+    struct S {
+      S structValProperty() const;
+    };
+    void target() {
+      S s;
+      s.structValProperty();
+    }
+  )cpp");
+  auto *SDecl =
+      cast<CXXRecordDecl>(lookup("S", *AST.context().getTranslationUnitDecl()));
+  QualType SType = AST.context().getRecordType(SDecl);
+  const CallExpr *CE = selectFirst<CallExpr>(
+      "call", match(cxxMemberCallExpr(
+                        callee(functionDecl(hasName("structValProperty"))))
+                        .bind("call"),
+                    AST.context()));
+  ASSERT_NE(CE, nullptr);
+
+  RecordStorageLocation Loc(SType, RecordStorageLocation::FieldToLoc(), {});
+
+  LatticeT Lattice;
+  // Accessors that return a record by value are modeled by a record storage
+  // location (instead of a Value).
+  auto NopInit = [](StorageLocation &) {};
+  StorageLocation *Loc1 = Lattice.getOrCreateConstMethodReturnStorageLocation(
+      Loc, CE, Env, NopInit);
+  auto NotCalled = [](StorageLocation &) {
+    ASSERT_TRUE(false) << "Not reached";
+  };
+  StorageLocation *Loc2 = Lattice.getOrCreateConstMethodReturnStorageLocation(
+      Loc, CE, Env, NotCalled);
+
+  EXPECT_EQ(Loc1, Loc2);
+
+  Lattice.clearConstMethodReturnStorageLocations(Loc);
+  StorageLocation *Loc3 = Lattice.getOrCreateConstMethodReturnStorageLocation(
+      Loc, CE, Env, NopInit);
+
+  EXPECT_NE(Loc3, Loc1);
+  EXPECT_NE(Loc3, Loc2);
+}
+
+TEST_F(CachedConstAccessorsLatticeTest, ClearDifferentLocs) {
+  CommonTestInputs Inputs;
+  auto *CE = Inputs.CallRef;
+  RecordStorageLocation LocS1(Inputs.SType, RecordStorageLocation::FieldToLoc(),
+                              {});
+  RecordStorageLocation LocS2(Inputs.SType, RecordStorageLocation::FieldToLoc(),
+                              {});
+
+  LatticeT Lattice;
+  auto NopInit = [](StorageLocation &) {};
+  StorageLocation *RetLoc1 =
+      Lattice.getOrCreateConstMethodReturnStorageLocation(LocS1, CE, Env,
+                                                          NopInit);
+  Lattice.clearConstMethodReturnStorageLocations(LocS2);
+  auto NotCalled = [](StorageLocation &) {
+    ASSERT_TRUE(false) << "Not reached";
+  };
+  StorageLocation *RetLoc2 =
+      Lattice.getOrCreateConstMethodReturnStorageLocation(LocS1, CE, Env,
+                                                          NotCalled);
+
+  EXPECT_EQ(RetLoc1, RetLoc2);
+}
+
+TEST_F(CachedConstAccessorsLatticeTest, DifferentValsFromDifferentLocs) {
+  TestAST AST(R"cpp(
+    struct S {
+      int *valProperty() const;
+    };
+    void target() {
+      S s1;
+      s1.valProperty();
+      S s2;
+      s2.valProperty();
+    }
+  )cpp");
+  auto *SDecl =
+      cast<CXXRecordDecl>(lookup("S", *AST.context().getTranslationUnitDecl()));
+  QualType SType = AST.context().getRecordType(SDecl);
+  SmallVector<BoundNodes, 1> valPropertyCalls =
+      match(cxxMemberCallExpr(callee(functionDecl(hasName("valProperty"))))
+                .bind("call"),
+            AST.context());
+  ASSERT_THAT(valPropertyCalls, SizeIs(2));
+
+  const CallExpr *CE1 = selectFirst<CallExpr>("call", valPropertyCalls);
+  ASSERT_NE(CE1, nullptr);
+
+  valPropertyCalls.erase(valPropertyCalls.begin());
+  const CallExpr *CE2 = selectFirst<CallExpr>("call", valPropertyCalls);
+  ASSERT_NE(CE2, nullptr);
+  ASSERT_NE(CE1, CE2);
+
+  RecordStorageLocation LocS1(SType, RecordStorageLocation::FieldToLoc(), {});
+  RecordStorageLocation LocS2(SType, RecordStorageLocation::FieldToLoc(), {});
+
+  LatticeT Lattice;
+  Value *Val1 = Lattice.getOrCreateConstMethodReturnValue(LocS1, CE1, Env);
+  Value *Val2 = Lattice.getOrCreateConstMethodReturnValue(LocS2, CE2, Env);
+
+  EXPECT_NE(Val1, Val2);
+}
+
+TEST_F(CachedConstAccessorsLatticeTest, JoinSameNoop) {
+  CommonTestInputs Inputs;
+  auto *CE = Inputs.CallVal;
+  RecordStorageLocation Loc(Inputs.SType, RecordStorageLocation::FieldToLoc(),
+                            {});
+
+  LatticeT EmptyLattice;
+  LatticeT EmptyLattice2;
+  EXPECT_EQ(EmptyLattice.join(EmptyLattice2), LatticeJoinEffect::Unchanged);
+
+  LatticeT Lattice1;
+  Lattice1.getOrCreateConstMethodReturnValue(Loc, CE, Env);
+  EXPECT_EQ(Lattice1.join(Lattice1), LatticeJoinEffect::Unchanged);
+}
+
+TEST_F(CachedConstAccessorsLatticeTest, ProducesNewValueAfterJoinDistinct) {
+  CommonTestInputs Inputs;
+  auto *CE = Inputs.CallVal;
+  RecordStorageLocation Loc(Inputs.SType, RecordStorageLocation::FieldToLoc(),
+                            {});
+
+  // L1 w/ v vs L2 empty
+  LatticeT Lattice1;
+  Value *Val1 = Lattice1.getOrCreateConstMethodReturnValue(Loc, CE, Env);
+
+  LatticeT EmptyLattice;
+
+  EXPECT_EQ(Lattice1.join(EmptyLattice), LatticeJoinEffect::Changed);
+  Value *ValAfterJoin =
+      Lattice1.getOrCreateConstMethodReturnValue(Loc, CE, Env);
+
+  
EXPECT_NE(ValAfterJoin, Val1); + + // L1 w/ v1 vs L3 w/ v2 + LatticeT Lattice3; + Value *Val3 = Lattice3.getOrCreateConstMethodReturnValue(Loc, CE, Env); + + EXPECT_EQ(Lattice1.join(Lattice3), LatticeJoinEffect::Changed); + Value *ValAfterJoin2 = + Lattice1.getOrCreateConstMethodReturnValue(Loc, CE, Env); + + EXPECT_NE(ValAfterJoin2, ValAfterJoin); + EXPECT_NE(ValAfterJoin2, Val3); +} + +} // namespace +} // namespace clang::dataflow diff --git a/clang/unittests/Format/ConfigParseTest.cpp b/clang/unittests/Format/ConfigParseTest.cpp index 318f08c04759b9d9ab4bf4eee5a08c2899e5023a..9e8529050ed83d661470af2239b1c1bb20dfc9f2 100644 --- a/clang/unittests/Format/ConfigParseTest.cpp +++ b/clang/unittests/Format/ConfigParseTest.cpp @@ -184,6 +184,7 @@ TEST(ConfigParseTest, ParsesConfigurationBools) { CHECK_PARSE_BOOL(ObjCSpaceBeforeProtocolList); CHECK_PARSE_BOOL(Cpp11BracedListStyle); CHECK_PARSE_BOOL(RemoveBracesLLVM); + CHECK_PARSE_BOOL(RemoveEmptyLinesInUnwrappedLines); CHECK_PARSE_BOOL(RemoveSemicolon); CHECK_PARSE_BOOL(SkipMacroDefinitionBody); CHECK_PARSE_BOOL(SpacesInSquareBrackets); diff --git a/clang/unittests/Format/FormatTest.cpp b/clang/unittests/Format/FormatTest.cpp index 43513f18321bc08c68b6bd2172cb11da9ddefbcd..8f4c92148adae4accd5b14c0eddd090ad8907817 100644 --- a/clang/unittests/Format/FormatTest.cpp +++ b/clang/unittests/Format/FormatTest.cpp @@ -28135,6 +28135,83 @@ TEST_F(FormatTest, BreakBinaryOperations) { Style); } +TEST_F(FormatTest, RemovesEmptyLinesInUnwrappedLines) { + auto Style = getLLVMStyle(); + Style.RemoveEmptyLinesInUnwrappedLines = true; + + verifyFormat("int c = a + b;", + "int c\n" + "\n" + " = a + b;", + Style); + + verifyFormat("enum : unsigned { AA = 0, BB } myEnum;", + "enum : unsigned\n" + "\n" + "{\n" + " AA = 0,\n" + " BB\n" + "} myEnum;", + Style); + + verifyFormat("class B : public E {\n" + "private:\n" + "};", + "class B : public E\n" + "\n" + "{\n" + "private:\n" + "};", + Style); + + verifyFormat( + "struct AAAAAAAAAAAAAAA test[3] = {{56, 23, \"hello\"}, {7, 5, \"!!\"}};", + "struct AAAAAAAAAAAAAAA test[3] = {{56,\n" + "\n" + " 23, \"hello\"},\n" + " {7, 5, \"!!\"}};", + Style); + + verifyFormat("int myFunction(int aaaaaaaaaaaaa, int ccccccccccccc, int d);", + "int myFunction(\n" + "\n" + " int aaaaaaaaaaaaa,\n" + "\n" + " int ccccccccccccc, int d);", + Style); + + verifyFormat("switch (e) {\n" + "case 1:\n" + " return e;\n" + "case 2:\n" + " return 2;\n" + "}", + "switch (\n" + "\n" + " e) {\n" + "case 1:\n" + " return e;\n" + "case 2:\n" + " return 2;\n" + "}", + Style); + + verifyFormat("while (true) {\n" + "}", + "while (\n" + "\n" + " true) {\n" + "}", + Style); + + verifyFormat("void loooonFunctionIsVeryLongButNotAsLongAsJavaTypeNames(\n" + " std::map *outputMap);", + "void loooonFunctionIsVeryLongButNotAsLongAsJavaTypeNames\n" + "\n" + " (std::map *outputMap);", + Style); +} + } // namespace } // namespace test } // namespace format diff --git a/clang/unittests/Format/TokenAnnotatorTest.cpp b/clang/unittests/Format/TokenAnnotatorTest.cpp index 00776dac28a14b000e224ba0e92d2dd8b6a4c4c1..60deae0c9b1129a650c04727ef436ac9f9fa102b 100644 --- a/clang/unittests/Format/TokenAnnotatorTest.cpp +++ b/clang/unittests/Format/TokenAnnotatorTest.cpp @@ -3554,6 +3554,12 @@ TEST_F(TokenAnnotatorTest, TemplateInstantiation) { ASSERT_EQ(Tokens.size(), 21u) << Tokens; EXPECT_TOKEN(Tokens[4], tok::less, TT_TemplateOpener); EXPECT_TOKEN(Tokens[16], tok::greater, TT_TemplateCloser); + + Tokens = + annotate("auto x{std::conditional_t{}};"); + ASSERT_EQ(Tokens.size(), 
24u) << Tokens; + EXPECT_TOKEN(Tokens[6], tok::less, TT_TemplateOpener); + EXPECT_TOKEN(Tokens[18], tok::greater, TT_TemplateCloser); } } // namespace diff --git a/clang/utils/ClangVisualizers/clang.natvis b/clang/utils/ClangVisualizers/clang.natvis index a7c70186bc46deee883556b89f14d83e4968d8a8..611c20dacce176f4d7cf4b63d64cdcde6390f4a5 100644 --- a/clang/utils/ClangVisualizers/clang.natvis +++ b/clang/utils/ClangVisualizers/clang.natvis @@ -1,1089 +1,1089 @@ - - - - - - - LocInfoType - {(clang::Type::TypeClass)TypeBits.TC, en}Type - - {*(clang::BuiltinType *)this} - {*(clang::PointerType *)this} - {*(clang::ParenType *)this} - {(clang::BitIntType *)this} - {*(clang::LValueReferenceType *)this} - {*(clang::RValueReferenceType *)this} - {(clang::ConstantArrayType *)this,na} - {(clang::ConstantArrayType *)this,view(left)na} - {(clang::ConstantArrayType *)this,view(right)na} - {(clang::VariableArrayType *)this,na} - {(clang::VariableArrayType *)this,view(left)na} - {(clang::VariableArrayType *)this,view(right)na} - {(clang::IncompleteArrayType *)this,na} - {(clang::IncompleteArrayType *)this,view(left)na} - {(clang::IncompleteArrayType *)this,view(right)na} - {(clang::TypedefType *)this,na} - {(clang::TypedefType *)this,view(cpp)na} - {*(clang::AttributedType *)this} - {(clang::DecayedType *)this,na} - {(clang::DecayedType *)this,view(left)na} - {(clang::DecayedType *)this,view(right)na} - {(clang::ElaboratedType *)this,na} - {(clang::ElaboratedType *)this,view(left)na} - {(clang::ElaboratedType *)this,view(right)na} - {*(clang::TemplateTypeParmType *)this} - {*(clang::TemplateTypeParmType *)this,view(cpp)} - {*(clang::SubstTemplateTypeParmType *)this} - {*(clang::RecordType *)this} - {*(clang::RecordType *)this,view(cpp)} - {(clang::FunctionProtoType *)this,na} - {(clang::FunctionProtoType *)this,view(left)na} - {(clang::FunctionProtoType *)this,view(right)na} - {*(clang::TemplateSpecializationType *)this} - {*(clang::DeducedTemplateSpecializationType *)this} - {*(clang::DeducedTemplateSpecializationType *)this,view(cpp)} - {*(clang::InjectedClassNameType *)this} - {*(clang::DependentNameType *)this} - {*(clang::PackExpansionType *)this} - {(clang::LocInfoType *)this,na} - {(clang::LocInfoType *)this,view(cpp)na} - {this,view(poly)na} - {*this,view(cpp)} - - No visualizer yet for {(clang::Type::TypeClass)TypeBits.TC,en}Type - Dependence{" ",en} - - CachedLinkage: {(clang::Linkage)TypeBits.CachedLinkage,en} CachedLocalOrUnnamed - CachedLinkage: {(clang::Linkage)TypeBits.CachedLinkage,en}{" ",sb} - - FromAST - - - No TypeBits set beyond TypeClass - - {*this, view(Dependence)}{*this, view(Cache)}{*this, view(FromAST)} - {*this,view(cmn)} {{{*this,view(poly)}}} - - (clang::Type::TypeClass)TypeBits.TC - this,view(flags)na - CanonicalType - *(clang::BuiltinType *)this - *(clang::PointerType *)this - *(clang::ParenType*)this - *(clang::BitIntType*)this - *(clang::LValueReferenceType *)this - *(clang::RValueReferenceType *)this - (clang::ConstantArrayType *)this - (clang::VariableArrayType *)this - (clang::IncompleteArrayType *)this - *(clang::AttributedType *)this - (clang::DecayedType *)this - (clang::ElaboratedType *)this - (clang::TemplateTypeParmType *)this - (clang::SubstTemplateTypeParmType *)this - (clang::RecordType *)this - (clang::FunctionProtoType *)this - (clang::TemplateSpecializationType *)this - (clang::DeducedTemplateSpecializationType *)this - (clang::InjectedClassNameType *)this - (clang::DependentNameType *)this - (clang::PackExpansionType *)this - (clang::LocInfoType *)this - - 
- - - ElementType - - - - {ElementType,view(cpp)} - [{Size}] - {ElementType,view(cpp)}[{Size}] - - Size - (clang::ArrayType *)this - - - - {ElementType,view(cpp)} - [] - {ElementType,view(cpp)}[] - - (clang::ArrayType *)this - - - - {ElementType,view(cpp)} - [*] - {ElementType,view(cpp)}[*] - - (clang::Expr *)SizeExpr - (clang::ArrayType *)this - - - - {Decl,view(name)nd} - {Decl} - - Decl - *(clang::Type *)this, view(cmn) - - - - {PointeeType, view(cpp)} * - - PointeeType - *(clang::Type *)this, view(cmn) - - - - {Inner, view(cpp)} - - Inner - *(clang::Type *)this, view(cmn) - - - - signed _BitInt({NumBits}) - unsigned _BitInt({NumBits})( - - NumBits - (clang::Type *)this, view(cmn) - - - - - {((clang::ReferenceType *)this)->PointeeType,view(cpp)} & - - *(clang::Type *)this, view(cmn) - PointeeType - - - - {((clang::ReferenceType *)this)->PointeeType,view(cpp)} && - - *(clang::Type *)this, view(cmn) - PointeeType - - - - {ModifiedType} Attribute={(clang::AttributedType::Kind)AttributedTypeBits.AttrKind} - - - - - {(clang::Decl::Kind)DeclContextBits.DeclKind,en}Decl - - (clang::Decl::Kind)DeclContextBits.DeclKind,en - - - - - FirstDecl - (clang::Decl *)(*(intptr_t *)NextInContextAndBits.Value.Data & ~3) - *this - - - - - - - Field {{{*(clang::DeclaratorDecl *)this,view(cpp)nd}}} - - - {*(clang::FunctionDecl *)this,nd} - Method {{{*this,view(cpp)}}} - - - Constructor {{{Name,view(cpp)}({*(clang::FunctionDecl *)this,view(parm0)nd})}} - - - Destructor {{~{Name,view(cpp)}()}} - - - typename - class - (not yet known if parameter pack) - ... - - {(TypeSourceInfo *)(*(uintptr_t *)DefaultArgument.ValueOrInherited.Val.Value.Data&~3LL),view(cpp)} - {{InheritedInitializer}} - = {this,view(DefaultArg)na} - - {*this,view(TorC)} {*this,view(MaybeEllipses)}{Name,view(cpp)} {this,view(Initializer)na} - - - {*TemplatedDecl,view(cpp)} - template{TemplateParams,na} {*TemplatedDecl}; - - TemplateParams,na - TemplatedDecl,na - - - - - {(clang::TypeSourceInfo *)(*(uintptr_t *)MaybeModedTInfo.Value.Data & ~7LL),view(cpp)na} - {(clang::TypedefNameDecl::ModedTInfo *)(*(uintptr_t *)MaybeModedTInfo.Value.Data & ~7LL),view(cpp)na} - {(TypeDecl *)this,view(cpp)nand} - typedef {this,view(type)na} {this,view(name)na}; - - "Not yet calculated",sb - (bool)(*(uintptr_t *)MaybeModedTInfo.Value.Data & 2) - (clang::TypeSourceInfo *)(*(uintptr_t *)MaybeModedTInfo.Value.Data & ~7LL) - (clang::TypedefNameDecl::ModedTInfo *)(*(uintptr_t *)MaybeModedTInfo.Value.Data & ~7LL) - (TypeDecl *)this,nd - - - - {(TypedefNameDecl *)this,view(name)nand} - using {(TypedefNameDecl *)this,view(name)nand} = {(TypedefNameDecl *)this,view(type)nand} - - - {Name} - - - Kind={(UncommonTemplateNameStorage::Kind)Kind,en}, Size={Size} - - (UncommonTemplateNameStorage::Kind)Kind - Size - - - - {Bits}, - {this,view(cmn)na},{(OverloadedTemplateStorage*)this,na} - {this,view(cmn)na},{(AssumedTemplateStorage*)this,na} - {this,view(cmn)na},{(SubstTemplateTemplateParmStorage*)this,na} - {this,view(cmn)na},{(SubstTemplateTemplateParmPackStorage*)this,na} - {this,view(cmn)na} - - Bits - (OverloadedTemplateStorage*)this - (AssumedTemplateStorage*)this - (SubstTemplateTemplateParmStorage*)this - (SubstTemplateTemplateParmPackStorage*)this - - - - - - - {(clang::TemplateDecl *)(Val.Value & ~3LL),view(cpp)na} - - - {(clang::TemplateDecl *)(Val.Value & ~3LL),na} - - - {(clang::UncommonTemplateNameStorage *)(Val.Value & ~3LL),view(cpp)na} - - - {(clang::UncommonTemplateNameStorage *)(Val.Value & ~3LL),na} - - - {(clang::QualifiedTemplateName *)(Val.Value & 
~3LL),view(cpp)na} - - - {(clang::QualifiedTemplateName *)(Val.Value & ~3LL),na} - - - {(clang::DependentTemplateName *)(Val.Value & ~3LL),view(cpp)na} - - - {(clang::DependentTemplateName *)(Val.Value & ~3LL),na} - - - "TemplateDecl",s8b - - (clang::TemplateDecl *)(Val.Value & ~3LL) - - "UncommonTemplateNameStorage",s8b - - (clang::UncommonTemplateNameStorage *)(Val.Value & ~3LL) - - "QualifiedTemplateName",s8b - - (clang::QualifiedTemplateName *)(Val.Value & ~3LL) - - "DependentTemplateName",s8b - - (clang::DependentTemplateName *)(Val.Value & ~3LL) - - Val - - - - - {Storage,view(cpp)na} - {Storage,na} - - Storage - - - - {Name,view(cpp)} - {Name} - - - implicit{" ",sb} - - {*this,view(implicit)nd} - {*this,view(modifiers)}{Name,view(cpp)} - {*this,view(modifiers)nd}struct {Name,view(cpp)} - {*this,view(modifiers)nd}interface {Name,view(cpp)} - {*this,view(modifiers)nd}union {Name,view(cpp)} - {*this,view(modifiers)nd}class {Name,view(cpp)} - {*this,view(modifiers)nd}enum {Name,view(cpp)} - - (clang::DeclContext *)this - - - - {decl,view(cpp)na} - {*decl} - - *(clang::Type *)this, view(cmn) - decl - - - - {(clang::TagType *)this,view(cpp)na} - {(clang::TagType *)this,na} - - *(clang::TagType *)this - - - - {{{*Replaced,view(cpp)} <= {CanonicalType,view(cpp)}}} - - *(clang::Type *)this, view(cmn) - *Replaced - - - - - - {ResultType,view(cpp)} - - {*(clang::QualType *)(this+1),view(cpp)}{*this,view(parm1)} - - , {*((clang::QualType *)(this+1)+1),view(cpp)}{*this,view(parm2)} - - , {*((clang::QualType *)(this+1)+2),view(cpp)}{*this,view(parm3)} - - , {*((clang::QualType *)(this+1)+3),view(cpp)}{*this,view(parm4)} - - , {*((clang::QualType *)(this+1)+4),view(cpp)}{*this,view(parm5)} - - , /* expand for more params */ - ({*this,view(parm0)}) -> {ResultType,view(cpp)} - ({*this,view(parm0)}) - {this,view(left)na}{this,view(right)na} - - ResultType - - {*this,view(parm0)} - - - FunctionTypeBits.NumParams - (clang::QualType *)(this+1) - - - - *(clang::Type *)this, view(cmn) - - - - - {OriginalTy} adjusted to {AdjustedTy} - - OriginalTy - AdjustedTy - - - - {OriginalTy,view(left)} - {OriginalTy,view(right)} - {OriginalTy} - - (clang::AdjustedType *)this - - - - {NamedType,view(left)} - {NamedType,view(right)} - {NamedType} - - (clang::ElaboratedTypeKeyword)TypeWithKeywordBits.Keyword - NNS - NamedType,view(cmn) - - - - {TTPDecl->Name,view(cpp)} - Non-canonical: {*TTPDecl} - Canonical: {CanTTPTInfo} - - *(clang::Type *)this, view(cmn) - - - - {Decl,view(cpp)} - - Decl - InjectedType - *(clang::Type *)this, view(cmn) - - - - {NNS}{Name,view(cpp)na} - - NNS - Name - *(clang::Type *)this, view(cmn) - - - - - {(IdentifierInfo*)Specifier,view(cpp)na}:: - {(NamedDecl*)Specifier,view(cpp)na}:: - {(Type*)Specifier,view(cpp)na}:: - - (NestedNameSpecifier::StoredSpecifierKind)((*(uintptr_t *)Prefix.Value.Data>>1)&3) - - - - {Pattern} - - Pattern - NumExpansions - *(clang::Type *)this, view(cmn) - - - - {((clang::ExtQualsTypeCommonBase *)((*(uintptr_t *)Value.Value.Data) & ~(uintptr_t)((1U << clang::TypeAlignmentInBits) - 1U)))->BaseType,view(poly)}{*this,view(fastQuals)} - {((clang::ExtQualsTypeCommonBase *)((*(uintptr_t *)Value.Value.Data) & ~(uintptr_t)((1U << clang::TypeAlignmentInBits) - 1U)))->BaseType,view(cpp)}{*this,view(fastQuals)} - {((clang::ExtQualsTypeCommonBase *)((*(uintptr_t *)Value.Value.Data) & ~(uintptr_t)((1U << clang::TypeAlignmentInBits) - 1U)))->BaseType,view(left)}{*this,view(fastQuals)} - {((clang::ExtQualsTypeCommonBase *)((*(uintptr_t *)Value.Value.Data) & ~(uintptr_t)((1U << 
clang::TypeAlignmentInBits) - 1U)))->BaseType,view(right)}{*this,view(fastQuals)} - - - {" ",sb}const - {" ",sb}restrict - {" ",sb}const restrict - {" ",sb}volatile - {" ",sb}const volatile - {" ",sb}volatile restrict - {" ",sb}const volatile restrict - Cannot visualize non-fast qualifiers - Null - {((clang::ExtQualsTypeCommonBase *)((*(uintptr_t *)Value.Value.Data) & ~(uintptr_t)((1U << clang::TypeAlignmentInBits) - 1U)))->BaseType,na}{*this,view(fastQuals)} - - *this,view(fastQuals) - ((clang::ExtQualsTypeCommonBase *)((*(uintptr_t *)Value.Value.Data) & ~(uintptr_t)((1U << clang::TypeAlignmentInBits) - 1U)))->BaseType - - - - - {DeclInfo,view(cpp)na} - {DeclInfo,na} - - DeclInfo - *(clang::Type *)this, view(cmn) - - - - {Ty,view(cpp)} - {Ty} - - Ty - - - - {(QualType *)&Ty,na} - - (QualType *)&Ty - Data - - - - Not building anything - Building a {LastTy} - - - {Argument,view(cpp)} - {Argument} - - - {*(clang::QualType *)&TypeOrValue.V,view(cpp)} - {(clang::TemplateArgument::ArgKind)TypeOrValue.Kind,en} template argument: {*(clang::QualType *)&TypeOrValue.V} - - {Args.Args[0]}{*this,view(arg1)} - - , {Args.Args[1]}{*this,view(arg2)} - - , {Args.Args[2]}, ... - - {Args.Args[0],view(cpp)}{*this,view(arg1cpp)} - - , {Args.Args[1],view(cpp)}{*this,view(arg2cpp)} - - , {Args.Args[2],view(cpp)}, ... - {*this,view(arg0cpp)} - {*this,view(arg0)} - {(clang::Expr *)TypeOrValue.V,view(cpp)na} - {(clang::TemplateArgument::ArgKind)TypeOrValue.Kind,en} - - *(clang::QualType *)&TypeOrValue.V - (clang::Expr *)TypeOrValue.V - - Args.NumArgs - Args.Args - - - - - - - {((TemplateArgumentLoc*)Arguments.BeginX)[0],view(cpp)}{*this,view(elt1)} - - , {((TemplateArgumentLoc*)Arguments.BeginX)[1],view(cpp)}{*this,view(elt2)} - - , {((TemplateArgumentLoc*)Arguments.BeginX)[2],view(cpp)}{*this,view(elt3)} - - , {((TemplateArgumentLoc*)Arguments.BeginX)[3],view(cpp)}{*this,view(elt4)} - - , ... - empty - <{*this,view(elt0)}> - Uninitialized - - - - {Arguments[0],view(cpp)}{*this,view(arg1)} - - , {Arguments[1],view(cpp)}{*this,view(arg2)} - - , {Arguments[1],view(cpp)}, ... - <{*this,view(arg0)}> - - NumArguments - - NumArguments - Arguments - - - - - - {Data[0],view(cpp)}{*this,view(arg1)} - - , {Data[1],view(cpp)}{*this,view(arg2)} - - , {Data[2],view(cpp)}, ... - <{*this,view(arg0)}> - - Length - - - - Length - Data - - - - - - - - {((llvm::ArrayRef<clang::TemplateArgument> *)TemplateArgumentLists.BeginX)[0],view(cpp)}{*this,view(level1)} - - ::{((llvm::ArrayRef<clang::TemplateArgument> *)TemplateArgumentLists.BeginX)[1],view(cpp)}{*this,view(level2)} - - ::{((llvm::ArrayRef<clang::TemplateArgument> *)TemplateArgumentLists.BeginX)[2],view(cpp)}, ... 
- {*this,view(level0)} - - TemplateArgumentLists - - - - {(clang::QualType *)Arg,view(cpp)na} - Type template argument: {*(clang::QualType *)Arg} - Non-type template argument: {*(clang::Expr *)Arg} - Template template argument: {*(clang::TemplateName *)Arg - - Kind,en - (clang::QualType *)Arg - (clang::Expr *)Arg - (clang::TemplateName *)Arg - - - - - void - bool - char - unsigned char - wchar_t - char16_t - char32_t - unsigned short - unsigned int - unsigned long - unsigned long long - __uint128_t - char - signed char - wchar_t - short - int - long - long long - __int128_t - __fp16 - float - double - long double - nullptr_t - {(clang::BuiltinType::Kind)BuiltinTypeBits.Kind, en} - - (clang::BuiltinType::Kind)BuiltinTypeBits.Kind - - - - - - {((clang::TemplateArgument *)(this+1))[0],view(cpp)}{*this,view(arg1)} - - , {((clang::TemplateArgument *)(this+1))[1],view(cpp)}{*this,view(arg2)} - - , {((clang::TemplateArgument *)(this+1))[2],view(cpp)}{*this,view(arg3)} - - {*((clang::TemplateDecl *)(Template.Storage.Val.Value))->TemplatedDecl,view(cpp)}<{*this,view(arg0)}> - - Can't visualize this TemplateSpecializationType - - Template.Storage - - TemplateSpecializationTypeBits.NumArgs - (clang::TemplateArgument *)(this+1) - - *(clang::Type *)this, view(cmn) - - - - - (CanonicalType.Value.Value != this) || TypeBits.Dependent - *(clang::Type *)this,view(cmn) - - - - {CanonicalType,view(cpp)} - {Template,view(cpp)} - {Template} - - Template - CanonicalType,view(cpp) - (clang::DeducedType *)this - Template - - - - {*(CXXRecordDecl *)this,nd}{*TemplateArgs} - - (CXXRecordDecl *)this,nd - TemplateArgs - - - - {((llvm::StringMapEntry<clang::IdentifierInfo *>*)Entry)+1,sb} - - ((llvm::StringMapEntry<clang::IdentifierInfo *>*)Entry)+1,s - (clang::tok::TokenKind)TokenID - - - - - Empty - {*(clang::IdentifierInfo *)(Ptr & ~PtrMask)} - {{Identifier ({*(clang::IdentifierInfo *)(Ptr & ~PtrMask)})}} - {{ObjC Zero Arg Selector (*{(clang::IdentifierInfo *)(Ptr & ~PtrMask)})}} - {{ObjC One Arg Selector (*{(clang::IdentifierInfo *)(Ptr & ~PtrMask)})}} - {(clang::detail::CXXSpecialNameExtra *)(Ptr & ~PtrMask),view(cpp)na} - C++ Constructor {{{(clang::detail::CXXSpecialNameExtra *)(Ptr & ~PtrMask),view(cpp)na}}} - C++ Destructor {{*(clang::detail::CXXSpecialNameExtra *)(Ptr & ~PtrMask)}} - C++ Conversion function {{*(clang::detail::CXXSpecialNameExtra *)(Ptr & ~PtrMask)}} - C++ Operator {{*(clang::detail::CXXOperatorIdName *)(Ptr & ~PtrMask)}} - {*(clang::detail::DeclarationNameExtra *)(Ptr & ~PtrMask),view(cpp)} - {{Extra ({*(clang::detail::DeclarationNameExtra *)(Ptr & ~PtrMask)})}} - - StoredNameKind(Ptr & PtrMask),en - *(clang::IdentifierInfo *)(Ptr & ~PtrMask),na - *(clang::IdentifierInfo *)(Ptr & ~PtrMask),na - *(clang::IdentifierInfo *)(Ptr & ~PtrMask),na - *(clang::detail::CXXSpecialNameExtra *)(Ptr & ~PtrMask),na - *(clang::detail::CXXSpecialNameExtra *)(Ptr & ~PtrMask),na - *(clang::detail::CXXSpecialNameExtra *)(Ptr & ~PtrMask),na - *(clang::detail::CXXOperatorIdName *)(Ptr & ~PtrMask),na - (clang::detail::DeclarationNameExtra *)(Ptr & ~PtrMask),na - - - - - {(CXXDeductionGuideNameExtra *)this,view(cpp)nand} - - - {(CXXDeductionGuideNameExtra *)this,nand} - - C++ Literal operator - C++ Using directive - Objective-C MultiArg selector - {(clang::detail::DeclarationNameExtra::ExtraKind)ExtraKindOrNumArgs,en}{" ",sb}{*this,view(cpp)} - - (CXXDeductionGuideNameExtra *)this - ExtraKindOrNumArgs - - - - {Template->TemplatedDecl,view(cpp)} - C++ Deduction guide for {Template->TemplatedDecl,view(cpp)na} - - - 
{Type,view(cpp)} - {Type} - - - {Name} - - - - {(ParsedTemplateArgument *)(this+1),view(cpp)na}{this,view(arg1)na} - - , {((ParsedTemplateArgument *)(this+1))+1,view(cpp)na}{this,view(arg2)na} - - , ... - {Name,na}<{this,view(arg0)na}> - - Name - - {this,view(arg0)na} - - - NumArgs - (ParsedTemplateArgument *)(this+1) - - - - Operator - - - - {{annot_template_id ({(clang::TemplateIdAnnotation *)(PtrData),na})}} - {{Identifier ({(clang::IdentifierInfo *)(PtrData),na})}} - {(clang::tok::TokenKind)Kind,en} - - - {BufferPtr,nasb} - - - {TheLexer._Mypair._Myval2,na} - Expanding Macro: {TheTokenLexer._Mypair._Myval2,na} - - - - - [{(Token *)(CachedTokens.BeginX) + CachedLexPos,na}] {IncludeMacroStack._Mypair._Myval2._Mylast - 1,na} - - {IncludeMacroStack._Mypair._Myval2._Mylast - 1,na} - {CurLexer._Mypair._Myval2,na} - Expanding Macro: {CurTokenLexer._Mypair._Myval2,na} - - - {this,view(cached)} - - CLK_LexAfterModuleImport - - - [{Tok}] {PP,na} - - - this - *this - {Id} - &{Id} - No visualizer for {Kind} - - - - =, - &, - - {(LambdaCapture *)(Captures.BeginX),na}{this,view(capture1)na} - - ,{(LambdaCapture *)(Captures.BeginX)+1,na}{this,view(capture2)na} - - ,{(LambdaCapture *)(Captures.BeginX)+2,na}{this,view(capture3)na} - - ,... - [{this,view(default)na}{this,view(capture0)na}] - - - - , [{TypeRep}] - - - , [{ExprRep}] - - - , [{DeclRep}] - - - [{(clang::DeclSpec::SCS)StorageClassSpec,en}], [{(clang::TypeSpecifierType)TypeSpecType,en}]{this,view(extra)na} - - (clang::DeclSpec::SCS)StorageClassSpec - (clang::TypeSpecifierType)TypeSpecType - - TypeRep - - - ExprRep - - - DeclRep - - - - - - {Name,s} - - - {RealPathName,s} - - - {Name,s} - - - - (clang::StorageClass)SClass - (clang::ThreadStorageClassSpecifier)TSCSpec - (clang::VarDecl::InitializationStyle)InitStyle - - - - {DeclType,view(left)} {Name,view(cpp)}{DeclType,view(right)} - - Name - DeclType - - - - {(DeclaratorDecl*)this,nand} - - (DeclaratorDecl*)this,nd - Init - VarDeclBits - - - - {*(VarDecl*)this,nd} - - ParmVarDeclBits - *(VarDecl*)this,nd - - - - {"explicit ",sb} - - explicit({ExplicitSpec,view(ptr)na}) - {ExplicitSpec,view(int)en} - {ExplicitSpec,view(int)en} : {ExplicitSpec,view(ptr)na} - - - {ExplicitSpec,view(cpp)}{Name,view(cpp)nd}({(FunctionDecl*)this,view(parm0)nand}) -> {((clang::FunctionProtoType *)((clang::ExtQualsTypeCommonBase *)(((uintptr_t)DeclType.Value.Value) & ~15))->BaseType)->ResultType,view(cpp)} - - ExplicitSpec - (bool)FunctionDeclBits.IsCopyDeductionCandidate - (FunctionDecl*)this,nd - - - - {((clang::FunctionProtoType *)((clang::ExtQualsTypeCommonBase *)((*(uintptr_t *)DeclType.Value.Value.Data) & ~15))->BaseType)->ResultType,view(cpp)} - - {ParamInfo[0],na}{*this,view(parm1)nd} - - , {ParamInfo[1],na}{*this,view(parm2)nd} - - , {ParamInfo[2],na}{*this,view(parm3)nd} - - , {ParamInfo[3],na}{*this,view(parm4)nd} - - , {ParamInfo[4],na}{*this,view(parm5)nd} - - , /* expand for more params */ - - auto {Name,view(cpp)nd}({*this,view(parm0)nd}) -> {((clang::FunctionProtoType *)((clang::ExtQualsTypeCommonBase *)((*(uintptr_t *)DeclType.Value.Value.Data) & ~15))->BaseType)->ResultType,view(cpp)} - - {this,view(retType)nand} {Name,view(cpp)nd}({*this,view(parm0)nd}) - - (clang::DeclaratorDecl *)this,nd - ((clang::FunctionProtoType *)((clang::ExtQualsTypeCommonBase *)((*(uintptr_t *)DeclType.Value.Value.Data) & ~15))->BaseType)->ResultType - - {*this,view(parm0)nd} - - - ((clang::FunctionProtoType *)((clang::ExtQualsTypeCommonBase *)((*(uintptr_t *)DeclType.Value.Value.Data) & 
~15))->BaseType)->FunctionTypeBits.NumParams - ParamInfo - - - - TemplateOrSpecialization - - - - {*($T1*)&Ptr} - - ($T1*)&Ptr - - - - {($T1 *)Ptr} - - ($T1 *)Ptr - - - - - {*((NamedDecl **)(this+1))[0],view(cpp)}{*this,view(parm1)} - - , {*((NamedDecl **)(this+1))[1],view(cpp)}{*this,view(parm2)} - - , {*((NamedDecl **)(this+1))[2],view(cpp)}{*this,view(parm3)} - - , {*((NamedDecl **)(this+1))[3],view(cpp)}{*this,view(parm4)} - - , {*((NamedDecl **)(this+1))[4],view(cpp)}{*this,view(parm5)} - - , /* Expand for more params */ - <{*this,view(parm0)}> - - - NumParams - (NamedDecl **)(this+1) - - - - - {(clang::Stmt::StmtClass)StmtBits.sClass,en} - - (clang::Stmt::StmtClass)StmtBits.sClass,en - - - - {*(clang::StringLiteral *)this} - Expression of class {(clang::Stmt::StmtClass)StmtBits.sClass,en} and type {TR,view(cpp)} - - - - *(unsigned *)(((clang::StringLiteral *)this)+1) - (const char *)(((clang::StringLiteral *)this)+1)+4+4,[*(unsigned *)(((clang::StringLiteral *)this)+1)]s8 - - - - public - protected - private - - {*(clang::NamedDecl *)(Ptr&~Mask)} - {*this,view(access)} {*this,view(decl)} - - (clang::AccessSpecifier)(Ptr&Mask),en - *(clang::NamedDecl *)(Ptr&~Mask) - - - - [IK_Identifier] {*Identifier} - [IK_OperatorFunctionId] {OperatorFunctionId} - [IK_ConversionFunctionId] {ConversionFunctionId} - [IK_ConstructorName] {ConstructorName} - [IK_DestructorName] {DestructorName} - [IK_DeductionGuideName] {TemplateName} - [IK_TemplateId] {TemplateId} - [IK_ConstructorTemplateId] {TemplateId} - Kind - - Identifier - OperatorFunctionId - ConversionFunctionId - ConstructorName - DestructorName - TemplateName - TemplateId - TemplateId - - - - NumDecls={NumDecls} - - - NumDecls - (Decl **)(this+1) - - - - - {*D} - {*(DeclGroup *)((uintptr_t)D&~1)} - - D - (DeclGroup *)((uintptr_t)D&~1) - - - - {DS} {Name} - - - {Decls} - - Decls - - - - {Ambiguity,en}: {Decls} - {ResultKind,en}: {Decls} - - - Invalid - Unset - {Val} - - - Invalid - Unset - {($T1)(Value&~1)} - - (bool)(Value&1) - ($T1)(Value&~1) - - - + + + + + + + LocInfoType + {(clang::Type::TypeClass)TypeBits.TC, en}Type + + {*(clang::BuiltinType *)this} + {*(clang::PointerType *)this} + {*(clang::ParenType *)this} + {(clang::BitIntType *)this} + {*(clang::LValueReferenceType *)this} + {*(clang::RValueReferenceType *)this} + {(clang::ConstantArrayType *)this,na} + {(clang::ConstantArrayType *)this,view(left)na} + {(clang::ConstantArrayType *)this,view(right)na} + {(clang::VariableArrayType *)this,na} + {(clang::VariableArrayType *)this,view(left)na} + {(clang::VariableArrayType *)this,view(right)na} + {(clang::IncompleteArrayType *)this,na} + {(clang::IncompleteArrayType *)this,view(left)na} + {(clang::IncompleteArrayType *)this,view(right)na} + {(clang::TypedefType *)this,na} + {(clang::TypedefType *)this,view(cpp)na} + {*(clang::AttributedType *)this} + {(clang::DecayedType *)this,na} + {(clang::DecayedType *)this,view(left)na} + {(clang::DecayedType *)this,view(right)na} + {(clang::ElaboratedType *)this,na} + {(clang::ElaboratedType *)this,view(left)na} + {(clang::ElaboratedType *)this,view(right)na} + {*(clang::TemplateTypeParmType *)this} + {*(clang::TemplateTypeParmType *)this,view(cpp)} + {*(clang::SubstTemplateTypeParmType *)this} + {*(clang::RecordType *)this} + {*(clang::RecordType *)this,view(cpp)} + {(clang::FunctionProtoType *)this,na} + {(clang::FunctionProtoType *)this,view(left)na} + {(clang::FunctionProtoType *)this,view(right)na} + {*(clang::TemplateSpecializationType *)this} + {*(clang::DeducedTemplateSpecializationType 
*)this} + {*(clang::DeducedTemplateSpecializationType *)this,view(cpp)} + {*(clang::InjectedClassNameType *)this} + {*(clang::DependentNameType *)this} + {*(clang::PackExpansionType *)this} + {(clang::LocInfoType *)this,na} + {(clang::LocInfoType *)this,view(cpp)na} + {this,view(poly)na} + {*this,view(cpp)} + + No visualizer yet for {(clang::Type::TypeClass)TypeBits.TC,en}Type + Dependence{" ",en} + + CachedLinkage: {(clang::Linkage)TypeBits.CachedLinkage,en} CachedLocalOrUnnamed + CachedLinkage: {(clang::Linkage)TypeBits.CachedLinkage,en}{" ",sb} + + FromAST + + + No TypeBits set beyond TypeClass + + {*this, view(Dependence)}{*this, view(Cache)}{*this, view(FromAST)} + {*this,view(cmn)} {{{*this,view(poly)}}} + + (clang::Type::TypeClass)TypeBits.TC + this,view(flags)na + CanonicalType + *(clang::BuiltinType *)this + *(clang::PointerType *)this + *(clang::ParenType*)this + *(clang::BitIntType*)this + *(clang::LValueReferenceType *)this + *(clang::RValueReferenceType *)this + (clang::ConstantArrayType *)this + (clang::VariableArrayType *)this + (clang::IncompleteArrayType *)this + *(clang::AttributedType *)this + (clang::DecayedType *)this + (clang::ElaboratedType *)this + (clang::TemplateTypeParmType *)this + (clang::SubstTemplateTypeParmType *)this + (clang::RecordType *)this + (clang::FunctionProtoType *)this + (clang::TemplateSpecializationType *)this + (clang::DeducedTemplateSpecializationType *)this + (clang::InjectedClassNameType *)this + (clang::DependentNameType *)this + (clang::PackExpansionType *)this + (clang::LocInfoType *)this + + + + + ElementType + + + + {ElementType,view(cpp)} + [{Size}] + {ElementType,view(cpp)}[{Size}] + + Size + (clang::ArrayType *)this + + + + {ElementType,view(cpp)} + [] + {ElementType,view(cpp)}[] + + (clang::ArrayType *)this + + + + {ElementType,view(cpp)} + [*] + {ElementType,view(cpp)}[*] + + (clang::Expr *)SizeExpr + (clang::ArrayType *)this + + + + {Decl,view(name)nd} + {Decl} + + Decl + *(clang::Type *)this, view(cmn) + + + + {PointeeType, view(cpp)} * + + PointeeType + *(clang::Type *)this, view(cmn) + + + + {Inner, view(cpp)} + + Inner + *(clang::Type *)this, view(cmn) + + + + signed _BitInt({NumBits}) + unsigned _BitInt({NumBits})( + + NumBits + (clang::Type *)this, view(cmn) + + + + + {((clang::ReferenceType *)this)->PointeeType,view(cpp)} & + + *(clang::Type *)this, view(cmn) + PointeeType + + + + {((clang::ReferenceType *)this)->PointeeType,view(cpp)} && + + *(clang::Type *)this, view(cmn) + PointeeType + + + + {ModifiedType} Attribute={(clang::AttributedType::Kind)AttributedTypeBits.AttrKind} + + + + + {(clang::Decl::Kind)DeclContextBits.DeclKind,en}Decl + + (clang::Decl::Kind)DeclContextBits.DeclKind,en + + + + + FirstDecl + (clang::Decl *)(*(intptr_t *)NextInContextAndBits.Value.Data & ~3) + *this + + + + + + + Field {{{*(clang::DeclaratorDecl *)this,view(cpp)nd}}} + + + {*(clang::FunctionDecl *)this,nd} + Method {{{*this,view(cpp)}}} + + + Constructor {{{Name,view(cpp)}({*(clang::FunctionDecl *)this,view(parm0)nd})}} + + + Destructor {{~{Name,view(cpp)}()}} + + + typename + class + (not yet known if parameter pack) + ... 
+ + {(TypeSourceInfo *)(*(uintptr_t *)DefaultArgument.ValueOrInherited.Val.Value.Data&~3LL),view(cpp)} + {{InheritedInitializer}} + = {this,view(DefaultArg)na} + + {*this,view(TorC)} {*this,view(MaybeEllipses)}{Name,view(cpp)} {this,view(Initializer)na} + + + {*TemplatedDecl,view(cpp)} + template{TemplateParams,na} {*TemplatedDecl}; + + TemplateParams,na + TemplatedDecl,na + + + + + {(clang::TypeSourceInfo *)(*(uintptr_t *)MaybeModedTInfo.Value.Data & ~7LL),view(cpp)na} + {(clang::TypedefNameDecl::ModedTInfo *)(*(uintptr_t *)MaybeModedTInfo.Value.Data & ~7LL),view(cpp)na} + {(TypeDecl *)this,view(cpp)nand} + typedef {this,view(type)na} {this,view(name)na}; + + "Not yet calculated",sb + (bool)(*(uintptr_t *)MaybeModedTInfo.Value.Data & 2) + (clang::TypeSourceInfo *)(*(uintptr_t *)MaybeModedTInfo.Value.Data & ~7LL) + (clang::TypedefNameDecl::ModedTInfo *)(*(uintptr_t *)MaybeModedTInfo.Value.Data & ~7LL) + (TypeDecl *)this,nd + + + + {(TypedefNameDecl *)this,view(name)nand} + using {(TypedefNameDecl *)this,view(name)nand} = {(TypedefNameDecl *)this,view(type)nand} + + + {Name} + + + Kind={(UncommonTemplateNameStorage::Kind)Kind,en}, Size={Size} + + (UncommonTemplateNameStorage::Kind)Kind + Size + + + + {Bits}, + {this,view(cmn)na},{(OverloadedTemplateStorage*)this,na} + {this,view(cmn)na},{(AssumedTemplateStorage*)this,na} + {this,view(cmn)na},{(SubstTemplateTemplateParmStorage*)this,na} + {this,view(cmn)na},{(SubstTemplateTemplateParmPackStorage*)this,na} + {this,view(cmn)na} + + Bits + (OverloadedTemplateStorage*)this + (AssumedTemplateStorage*)this + (SubstTemplateTemplateParmStorage*)this + (SubstTemplateTemplateParmPackStorage*)this + + + + + + + {(clang::TemplateDecl *)(Val.Value & ~3LL),view(cpp)na} + + + {(clang::TemplateDecl *)(Val.Value & ~3LL),na} + + + {(clang::UncommonTemplateNameStorage *)(Val.Value & ~3LL),view(cpp)na} + + + {(clang::UncommonTemplateNameStorage *)(Val.Value & ~3LL),na} + + + {(clang::QualifiedTemplateName *)(Val.Value & ~3LL),view(cpp)na} + + + {(clang::QualifiedTemplateName *)(Val.Value & ~3LL),na} + + + {(clang::DependentTemplateName *)(Val.Value & ~3LL),view(cpp)na} + + + {(clang::DependentTemplateName *)(Val.Value & ~3LL),na} + + + "TemplateDecl",s8b + + (clang::TemplateDecl *)(Val.Value & ~3LL) + + "UncommonTemplateNameStorage",s8b + + (clang::UncommonTemplateNameStorage *)(Val.Value & ~3LL) + + "QualifiedTemplateName",s8b + + (clang::QualifiedTemplateName *)(Val.Value & ~3LL) + + "DependentTemplateName",s8b + + (clang::DependentTemplateName *)(Val.Value & ~3LL) + + Val + + + + + {Storage,view(cpp)na} + {Storage,na} + + Storage + + + + {Name,view(cpp)} + {Name} + + + implicit{" ",sb} + + {*this,view(implicit)nd} + {*this,view(modifiers)}{Name,view(cpp)} + {*this,view(modifiers)nd}struct {Name,view(cpp)} + {*this,view(modifiers)nd}interface {Name,view(cpp)} + {*this,view(modifiers)nd}union {Name,view(cpp)} + {*this,view(modifiers)nd}class {Name,view(cpp)} + {*this,view(modifiers)nd}enum {Name,view(cpp)} + + (clang::DeclContext *)this + + + + {decl,view(cpp)na} + {*decl} + + *(clang::Type *)this, view(cmn) + decl + + + + {(clang::TagType *)this,view(cpp)na} + {(clang::TagType *)this,na} + + *(clang::TagType *)this + + + + {{{*Replaced,view(cpp)} <= {CanonicalType,view(cpp)}}} + + *(clang::Type *)this, view(cmn) + *Replaced + + + + + + {ResultType,view(cpp)} + + {*(clang::QualType *)(this+1),view(cpp)}{*this,view(parm1)} + + , {*((clang::QualType *)(this+1)+1),view(cpp)}{*this,view(parm2)} + + , {*((clang::QualType *)(this+1)+2),view(cpp)}{*this,view(parm3)} 
+ + , {*((clang::QualType *)(this+1)+3),view(cpp)}{*this,view(parm4)} + + , {*((clang::QualType *)(this+1)+4),view(cpp)}{*this,view(parm5)} + + , /* expand for more params */ + ({*this,view(parm0)}) -> {ResultType,view(cpp)} + ({*this,view(parm0)}) + {this,view(left)na}{this,view(right)na} + + ResultType + + {*this,view(parm0)} + + + FunctionTypeBits.NumParams + (clang::QualType *)(this+1) + + + + *(clang::Type *)this, view(cmn) + + + + + {OriginalTy} adjusted to {AdjustedTy} + + OriginalTy + AdjustedTy + + + + {OriginalTy,view(left)} + {OriginalTy,view(right)} + {OriginalTy} + + (clang::AdjustedType *)this + + + + {NamedType,view(left)} + {NamedType,view(right)} + {NamedType} + + (clang::ElaboratedTypeKeyword)TypeWithKeywordBits.Keyword + NNS + NamedType,view(cmn) + + + + {TTPDecl->Name,view(cpp)} + Non-canonical: {*TTPDecl} + Canonical: {CanTTPTInfo} + + *(clang::Type *)this, view(cmn) + + + + {Decl,view(cpp)} + + Decl + InjectedType + *(clang::Type *)this, view(cmn) + + + + {NNS}{Name,view(cpp)na} + + NNS + Name + *(clang::Type *)this, view(cmn) + + + + + {(IdentifierInfo*)Specifier,view(cpp)na}:: + {(NamedDecl*)Specifier,view(cpp)na}:: + {(Type*)Specifier,view(cpp)na}:: + + (NestedNameSpecifier::StoredSpecifierKind)((*(uintptr_t *)Prefix.Value.Data>>1)&3) + + + + {Pattern} + + Pattern + NumExpansions + *(clang::Type *)this, view(cmn) + + + + {((clang::ExtQualsTypeCommonBase *)((*(uintptr_t *)Value.Value.Data) & ~(uintptr_t)((1U << clang::TypeAlignmentInBits) - 1U)))->BaseType,view(poly)}{*this,view(fastQuals)} + {((clang::ExtQualsTypeCommonBase *)((*(uintptr_t *)Value.Value.Data) & ~(uintptr_t)((1U << clang::TypeAlignmentInBits) - 1U)))->BaseType,view(cpp)}{*this,view(fastQuals)} + {((clang::ExtQualsTypeCommonBase *)((*(uintptr_t *)Value.Value.Data) & ~(uintptr_t)((1U << clang::TypeAlignmentInBits) - 1U)))->BaseType,view(left)}{*this,view(fastQuals)} + {((clang::ExtQualsTypeCommonBase *)((*(uintptr_t *)Value.Value.Data) & ~(uintptr_t)((1U << clang::TypeAlignmentInBits) - 1U)))->BaseType,view(right)}{*this,view(fastQuals)} + + + {" ",sb}const + {" ",sb}restrict + {" ",sb}const restrict + {" ",sb}volatile + {" ",sb}const volatile + {" ",sb}volatile restrict + {" ",sb}const volatile restrict + Cannot visualize non-fast qualifiers + Null + {((clang::ExtQualsTypeCommonBase *)((*(uintptr_t *)Value.Value.Data) & ~(uintptr_t)((1U << clang::TypeAlignmentInBits) - 1U)))->BaseType,na}{*this,view(fastQuals)} + + *this,view(fastQuals) + ((clang::ExtQualsTypeCommonBase *)((*(uintptr_t *)Value.Value.Data) & ~(uintptr_t)((1U << clang::TypeAlignmentInBits) - 1U)))->BaseType + + + + + {DeclInfo,view(cpp)na} + {DeclInfo,na} + + DeclInfo + *(clang::Type *)this, view(cmn) + + + + {Ty,view(cpp)} + {Ty} + + Ty + + + + {(QualType *)&Ty,na} + + (QualType *)&Ty + Data + + + + Not building anything + Building a {LastTy} + + + {Argument,view(cpp)} + {Argument} + + + {*(clang::QualType *)&TypeOrValue.V,view(cpp)} + {(clang::TemplateArgument::ArgKind)TypeOrValue.Kind,en} template argument: {*(clang::QualType *)&TypeOrValue.V} + + {Args.Args[0]}{*this,view(arg1)} + + , {Args.Args[1]}{*this,view(arg2)} + + , {Args.Args[2]}, ... + + {Args.Args[0],view(cpp)}{*this,view(arg1cpp)} + + , {Args.Args[1],view(cpp)}{*this,view(arg2cpp)} + + , {Args.Args[2],view(cpp)}, ... 
+ {*this,view(arg0cpp)} + {*this,view(arg0)} + {(clang::Expr *)TypeOrValue.V,view(cpp)na} + {(clang::TemplateArgument::ArgKind)TypeOrValue.Kind,en} + + *(clang::QualType *)&TypeOrValue.V + (clang::Expr *)TypeOrValue.V + + Args.NumArgs + Args.Args + + + + + + + {((TemplateArgumentLoc*)Arguments.BeginX)[0],view(cpp)}{*this,view(elt1)} + + , {((TemplateArgumentLoc*)Arguments.BeginX)[1],view(cpp)}{*this,view(elt2)} + + , {((TemplateArgumentLoc*)Arguments.BeginX)[2],view(cpp)}{*this,view(elt3)} + + , {((TemplateArgumentLoc*)Arguments.BeginX)[3],view(cpp)}{*this,view(elt4)} + + , ... + empty + <{*this,view(elt0)}> + Uninitialized + + + + {Arguments[0],view(cpp)}{*this,view(arg1)} + + , {Arguments[1],view(cpp)}{*this,view(arg2)} + + , {Arguments[1],view(cpp)}, ... + <{*this,view(arg0)}> + + NumArguments + + NumArguments + Arguments + + + + + + {Data[0],view(cpp)}{*this,view(arg1)} + + , {Data[1],view(cpp)}{*this,view(arg2)} + + , {Data[2],view(cpp)}, ... + <{*this,view(arg0)}> + + Length + + + + Length + Data + + + + + + + + {((llvm::ArrayRef<clang::TemplateArgument> *)TemplateArgumentLists.BeginX)[0],view(cpp)}{*this,view(level1)} + + ::{((llvm::ArrayRef<clang::TemplateArgument> *)TemplateArgumentLists.BeginX)[1],view(cpp)}{*this,view(level2)} + + ::{((llvm::ArrayRef<clang::TemplateArgument> *)TemplateArgumentLists.BeginX)[2],view(cpp)}, ... + {*this,view(level0)} + + TemplateArgumentLists + + + + {(clang::QualType *)Arg,view(cpp)na} + Type template argument: {*(clang::QualType *)Arg} + Non-type template argument: {*(clang::Expr *)Arg} + Template template argument: {*(clang::TemplateName *)Arg + + Kind,en + (clang::QualType *)Arg + (clang::Expr *)Arg + (clang::TemplateName *)Arg + + + + + void + bool + char + unsigned char + wchar_t + char16_t + char32_t + unsigned short + unsigned int + unsigned long + unsigned long long + __uint128_t + char + signed char + wchar_t + short + int + long + long long + __int128_t + __fp16 + float + double + long double + nullptr_t + {(clang::BuiltinType::Kind)BuiltinTypeBits.Kind, en} + + (clang::BuiltinType::Kind)BuiltinTypeBits.Kind + + + + + + {((clang::TemplateArgument *)(this+1))[0],view(cpp)}{*this,view(arg1)} + + , {((clang::TemplateArgument *)(this+1))[1],view(cpp)}{*this,view(arg2)} + + , {((clang::TemplateArgument *)(this+1))[2],view(cpp)}{*this,view(arg3)} + + {*((clang::TemplateDecl *)(Template.Storage.Val.Value))->TemplatedDecl,view(cpp)}<{*this,view(arg0)}> + + Can't visualize this TemplateSpecializationType + + Template.Storage + + TemplateSpecializationTypeBits.NumArgs + (clang::TemplateArgument *)(this+1) + + *(clang::Type *)this, view(cmn) + + + + + (CanonicalType.Value.Value != this) || TypeBits.Dependent + *(clang::Type *)this,view(cmn) + + + + {CanonicalType,view(cpp)} + {Template,view(cpp)} + {Template} + + Template + CanonicalType,view(cpp) + (clang::DeducedType *)this + Template + + + + {*(CXXRecordDecl *)this,nd}{*TemplateArgs} + + (CXXRecordDecl *)this,nd + TemplateArgs + + + + {((llvm::StringMapEntry<clang::IdentifierInfo *>*)Entry)+1,sb} + + ((llvm::StringMapEntry<clang::IdentifierInfo *>*)Entry)+1,s + (clang::tok::TokenKind)TokenID + + + + + Empty + {*(clang::IdentifierInfo *)(Ptr & ~PtrMask)} + {{Identifier ({*(clang::IdentifierInfo *)(Ptr & ~PtrMask)})}} + {{ObjC Zero Arg Selector (*{(clang::IdentifierInfo *)(Ptr & ~PtrMask)})}} + {{ObjC One Arg Selector (*{(clang::IdentifierInfo *)(Ptr & ~PtrMask)})}} + {(clang::detail::CXXSpecialNameExtra *)(Ptr & ~PtrMask),view(cpp)na} + C++ Constructor {{{(clang::detail::CXXSpecialNameExtra 
*)(Ptr & ~PtrMask),view(cpp)na}}} + C++ Destructor {{*(clang::detail::CXXSpecialNameExtra *)(Ptr & ~PtrMask)}} + C++ Conversion function {{*(clang::detail::CXXSpecialNameExtra *)(Ptr & ~PtrMask)}} + C++ Operator {{*(clang::detail::CXXOperatorIdName *)(Ptr & ~PtrMask)}} + {*(clang::detail::DeclarationNameExtra *)(Ptr & ~PtrMask),view(cpp)} + {{Extra ({*(clang::detail::DeclarationNameExtra *)(Ptr & ~PtrMask)})}} + + StoredNameKind(Ptr & PtrMask),en + *(clang::IdentifierInfo *)(Ptr & ~PtrMask),na + *(clang::IdentifierInfo *)(Ptr & ~PtrMask),na + *(clang::IdentifierInfo *)(Ptr & ~PtrMask),na + *(clang::detail::CXXSpecialNameExtra *)(Ptr & ~PtrMask),na + *(clang::detail::CXXSpecialNameExtra *)(Ptr & ~PtrMask),na + *(clang::detail::CXXSpecialNameExtra *)(Ptr & ~PtrMask),na + *(clang::detail::CXXOperatorIdName *)(Ptr & ~PtrMask),na + (clang::detail::DeclarationNameExtra *)(Ptr & ~PtrMask),na + + + + + {(CXXDeductionGuideNameExtra *)this,view(cpp)nand} + + + {(CXXDeductionGuideNameExtra *)this,nand} + + C++ Literal operator + C++ Using directive + Objective-C MultiArg selector + {(clang::detail::DeclarationNameExtra::ExtraKind)ExtraKindOrNumArgs,en}{" ",sb}{*this,view(cpp)} + + (CXXDeductionGuideNameExtra *)this + ExtraKindOrNumArgs + + + + {Template->TemplatedDecl,view(cpp)} + C++ Deduction guide for {Template->TemplatedDecl,view(cpp)na} + + + {Type,view(cpp)} + {Type} + + + {Name} + + + + {(ParsedTemplateArgument *)(this+1),view(cpp)na}{this,view(arg1)na} + + , {((ParsedTemplateArgument *)(this+1))+1,view(cpp)na}{this,view(arg2)na} + + , ... + {Name,na}<{this,view(arg0)na}> + + Name + + {this,view(arg0)na} + + + NumArgs + (ParsedTemplateArgument *)(this+1) + + + + Operator + + + + {{annot_template_id ({(clang::TemplateIdAnnotation *)(PtrData),na})}} + {{Identifier ({(clang::IdentifierInfo *)(PtrData),na})}} + {(clang::tok::TokenKind)Kind,en} + + + {BufferPtr,nasb} + + + {TheLexer._Mypair._Myval2,na} + Expanding Macro: {TheTokenLexer._Mypair._Myval2,na} + + + + + [{(Token *)(CachedTokens.BeginX) + CachedLexPos,na}] {IncludeMacroStack._Mypair._Myval2._Mylast - 1,na} + + {IncludeMacroStack._Mypair._Myval2._Mylast - 1,na} + {CurLexer._Mypair._Myval2,na} + Expanding Macro: {CurTokenLexer._Mypair._Myval2,na} + + + {this,view(cached)} + + CLK_LexAfterModuleImport + + + [{Tok}] {PP,na} + + + this + *this + {Id} + &{Id} + No visualizer for {Kind} + + + + =, + &, + + {(LambdaCapture *)(Captures.BeginX),na}{this,view(capture1)na} + + ,{(LambdaCapture *)(Captures.BeginX)+1,na}{this,view(capture2)na} + + ,{(LambdaCapture *)(Captures.BeginX)+2,na}{this,view(capture3)na} + + ,... 
+ [{this,view(default)na}{this,view(capture0)na}] + + + + , [{TypeRep}] + + + , [{ExprRep}] + + + , [{DeclRep}] + + + [{(clang::DeclSpec::SCS)StorageClassSpec,en}], [{(clang::TypeSpecifierType)TypeSpecType,en}]{this,view(extra)na} + + (clang::DeclSpec::SCS)StorageClassSpec + (clang::TypeSpecifierType)TypeSpecType + + TypeRep + + + ExprRep + + + DeclRep + + + + + + {Name,s} + + + {RealPathName,s} + + + {Name,s} + + + + (clang::StorageClass)SClass + (clang::ThreadStorageClassSpecifier)TSCSpec + (clang::VarDecl::InitializationStyle)InitStyle + + + + {DeclType,view(left)} {Name,view(cpp)}{DeclType,view(right)} + + Name + DeclType + + + + {(DeclaratorDecl*)this,nand} + + (DeclaratorDecl*)this,nd + Init + VarDeclBits + + + + {*(VarDecl*)this,nd} + + ParmVarDeclBits + *(VarDecl*)this,nd + + + + {"explicit ",sb} + + explicit({ExplicitSpec,view(ptr)na}) + {ExplicitSpec,view(int)en} + {ExplicitSpec,view(int)en} : {ExplicitSpec,view(ptr)na} + + + {ExplicitSpec,view(cpp)}{Name,view(cpp)nd}({(FunctionDecl*)this,view(parm0)nand}) -> {((clang::FunctionProtoType *)((clang::ExtQualsTypeCommonBase *)(((uintptr_t)DeclType.Value.Value) & ~15))->BaseType)->ResultType,view(cpp)} + + ExplicitSpec + (bool)FunctionDeclBits.IsCopyDeductionCandidate + (FunctionDecl*)this,nd + + + + {((clang::FunctionProtoType *)((clang::ExtQualsTypeCommonBase *)((*(uintptr_t *)DeclType.Value.Value.Data) & ~15))->BaseType)->ResultType,view(cpp)} + + {ParamInfo[0],na}{*this,view(parm1)nd} + + , {ParamInfo[1],na}{*this,view(parm2)nd} + + , {ParamInfo[2],na}{*this,view(parm3)nd} + + , {ParamInfo[3],na}{*this,view(parm4)nd} + + , {ParamInfo[4],na}{*this,view(parm5)nd} + + , /* expand for more params */ + + auto {Name,view(cpp)nd}({*this,view(parm0)nd}) -> {((clang::FunctionProtoType *)((clang::ExtQualsTypeCommonBase *)((*(uintptr_t *)DeclType.Value.Value.Data) & ~15))->BaseType)->ResultType,view(cpp)} + + {this,view(retType)nand} {Name,view(cpp)nd}({*this,view(parm0)nd}) + + (clang::DeclaratorDecl *)this,nd + ((clang::FunctionProtoType *)((clang::ExtQualsTypeCommonBase *)((*(uintptr_t *)DeclType.Value.Value.Data) & ~15))->BaseType)->ResultType + + {*this,view(parm0)nd} + + + ((clang::FunctionProtoType *)((clang::ExtQualsTypeCommonBase *)((*(uintptr_t *)DeclType.Value.Value.Data) & ~15))->BaseType)->FunctionTypeBits.NumParams + ParamInfo + + + + TemplateOrSpecialization + + + + {*($T1*)&Ptr} + + ($T1*)&Ptr + + + + {($T1 *)Ptr} + + ($T1 *)Ptr + + + + + {*((NamedDecl **)(this+1))[0],view(cpp)}{*this,view(parm1)} + + , {*((NamedDecl **)(this+1))[1],view(cpp)}{*this,view(parm2)} + + , {*((NamedDecl **)(this+1))[2],view(cpp)}{*this,view(parm3)} + + , {*((NamedDecl **)(this+1))[3],view(cpp)}{*this,view(parm4)} + + , {*((NamedDecl **)(this+1))[4],view(cpp)}{*this,view(parm5)} + + , /* Expand for more params */ + <{*this,view(parm0)}> + + + NumParams + (NamedDecl **)(this+1) + + + + + {(clang::Stmt::StmtClass)StmtBits.sClass,en} + + (clang::Stmt::StmtClass)StmtBits.sClass,en + + + + {*(clang::StringLiteral *)this} + Expression of class {(clang::Stmt::StmtClass)StmtBits.sClass,en} and type {TR,view(cpp)} + + + + *(unsigned *)(((clang::StringLiteral *)this)+1) + (const char *)(((clang::StringLiteral *)this)+1)+4+4,[*(unsigned *)(((clang::StringLiteral *)this)+1)]s8 + + + + public + protected + private + + {*(clang::NamedDecl *)(Ptr&~Mask)} + {*this,view(access)} {*this,view(decl)} + + (clang::AccessSpecifier)(Ptr&Mask),en + *(clang::NamedDecl *)(Ptr&~Mask) + + + + [IK_Identifier] {*Identifier} + [IK_OperatorFunctionId] {OperatorFunctionId} + 
[IK_ConversionFunctionId] {ConversionFunctionId} + [IK_ConstructorName] {ConstructorName} + [IK_DestructorName] {DestructorName} + [IK_DeductionGuideName] {TemplateName} + [IK_TemplateId] {TemplateId} + [IK_ConstructorTemplateId] {TemplateId} + Kind + + Identifier + OperatorFunctionId + ConversionFunctionId + ConstructorName + DestructorName + TemplateName + TemplateId + TemplateId + + + + NumDecls={NumDecls} + + + NumDecls + (Decl **)(this+1) + + + + + {*D} + {*(DeclGroup *)((uintptr_t)D&~1)} + + D + (DeclGroup *)((uintptr_t)D&~1) + + + + {DS} {Name} + + + {Decls} + + Decls + + + + {Ambiguity,en}: {Decls} + {ResultKind,en}: {Decls} + + + Invalid + Unset + {Val} + + + Invalid + Unset + {($T1)(Value&~1)} + + (bool)(Value&1) + ($T1)(Value&~1) + + + diff --git a/clang/utils/TableGen/ClangDiagnosticsEmitter.cpp b/clang/utils/TableGen/ClangDiagnosticsEmitter.cpp index 325d63de1563de2226c0e7622b499fdba46e349b..34e2e8f47ae71a0f949a55ba0506fa7688fe20c8 100644 --- a/clang/utils/TableGen/ClangDiagnosticsEmitter.cpp +++ b/clang/utils/TableGen/ClangDiagnosticsEmitter.cpp @@ -83,7 +83,7 @@ getCategoryFromDiagGroup(const Record *Group, static std::string getDiagnosticCategory(const Record *R, DiagGroupParentMap &DiagGroupParents) { // If the diagnostic is in a group, and that group has a category, use it. - if (DefInit *Group = dyn_cast(R->getValueInit("Group"))) { + if (const auto *Group = dyn_cast(R->getValueInit("Group"))) { // Check the diagnostic's diag group for a category. std::string CatName = getCategoryFromDiagGroup(Group->getDef(), DiagGroupParents); @@ -161,7 +161,7 @@ static void groupDiagnostics(ArrayRef Diags, for (unsigned i = 0, e = Diags.size(); i != e; ++i) { const Record *R = Diags[i]; - DefInit *DI = dyn_cast(R->getValueInit("Group")); + const auto *DI = dyn_cast(R->getValueInit("Group")); if (!DI) continue; assert(R->getValueAsDef("Class")->getName() != "CLASS_NOTE" && @@ -359,7 +359,7 @@ void InferPedantic::compute(VecOrSet DiagsInPedantic, const Record *R = Diags[i]; if (isExtension(R) && isOffByDefault(R)) { DiagsSet.insert(R); - if (DefInit *Group = dyn_cast(R->getValueInit("Group"))) { + if (const auto *Group = dyn_cast(R->getValueInit("Group"))) { const Record *GroupRec = Group->getDef(); if (!isSubGroupOfGroup(GroupRec, "pedantic")) { markGroup(GroupRec); @@ -378,13 +378,13 @@ void InferPedantic::compute(VecOrSet DiagsInPedantic, // Check if the group is implicitly in -Wpedantic. If so, // the diagnostic should not be directly included in the -Wpedantic // diagnostic group. - if (DefInit *Group = dyn_cast(R->getValueInit("Group"))) + if (const auto *Group = dyn_cast(R->getValueInit("Group"))) if (groupInPedantic(Group->getDef())) continue; // The diagnostic is not included in a group that is (transitively) in // -Wpedantic. Include it in -Wpedantic directly. - if (RecordVec *V = DiagsInPedantic.dyn_cast()) + if (auto *V = DiagsInPedantic.dyn_cast()) V->push_back(R); else { DiagsInPedantic.get()->insert(R); @@ -413,7 +413,7 @@ void InferPedantic::compute(VecOrSet DiagsInPedantic, if (Parents.size() > 0 && AllParentsInPedantic) continue; - if (RecordVec *V = GroupsInPedantic.dyn_cast()) + if (auto *V = GroupsInPedantic.dyn_cast()) V->push_back(Group); else { GroupsInPedantic.get()->insert(Group); @@ -1443,7 +1443,7 @@ void clang::EmitClangDiagsDefs(const RecordKeeper &Records, raw_ostream &OS, // Check if this is an error that is accidentally in a warning // group. 
     if (isError(R)) {
-      if (DefInit *Group = dyn_cast<DefInit>(R.getValueInit("Group"))) {
+      if (const auto *Group = dyn_cast<DefInit>(R.getValueInit("Group"))) {
         const Record *GroupRec = Group->getDef();
         const std::string &GroupName =
             std::string(GroupRec->getValueAsString("GroupName"));
@@ -1478,7 +1478,7 @@ void clang::EmitClangDiagsDefs(const RecordKeeper &Records, raw_ostream &OS,
   // Warning group associated with the diagnostic. This is stored as an index
   // into the alphabetically sorted warning group table.
-  if (DefInit *DI = dyn_cast<DefInit>(R.getValueInit("Group"))) {
+  if (const auto *DI = dyn_cast<DefInit>(R.getValueInit("Group"))) {
     std::map<std::string, GroupInfo>::iterator I = DiagsInGroup.find(
         std::string(DI->getDef()->getValueAsString("GroupName")));
     assert(I != DiagsInGroup.end());
diff --git a/clang/utils/TableGen/ClangSACheckersEmitter.cpp b/clang/utils/TableGen/ClangSACheckersEmitter.cpp
index bebdcac32126132bfd7d18c7bfdcc09d7cd4af5d..36012dbf70791b743479a58386358e00b78b9ce1 100644
--- a/clang/utils/TableGen/ClangSACheckersEmitter.cpp
+++ b/clang/utils/TableGen/ClangSACheckersEmitter.cpp
@@ -29,7 +29,7 @@ static std::string getPackageFullName(const Record *R, StringRef Sep = ".");
 static std::string getParentPackageFullName(const Record *R,
                                             StringRef Sep = ".") {
   std::string name;
-  if (DefInit *DI = dyn_cast<DefInit>(R->getValueInit("ParentPackage")))
+  if (const DefInit *DI = dyn_cast<DefInit>(R->getValueInit("ParentPackage")))
     name = getPackageFullName(DI->getDef(), Sep);
   return name;
 }
@@ -53,7 +53,7 @@ static std::string getCheckerFullName(const Record *R, StringRef Sep = ".") {
 }

 static std::string getStringValue(const Record &R, StringRef field) {
-  if (StringInit *SI = dyn_cast<StringInit>(R.getValueInit(field)))
+  if (const StringInit *SI = dyn_cast<StringInit>(R.getValueInit(field)))
     return std::string(SI->getValue());
   return std::string();
 }
@@ -94,7 +94,7 @@ static std::string getCheckerDocs(const Record &R) {
 /// the class itself has to be modified for adding a new option type in
 /// CheckerBase.td.
 static std::string getCheckerOptionType(const Record &R) {
-  if (BitsInit *BI = R.getValueAsBitsInit("Type")) {
+  if (const BitsInit *BI = R.getValueAsBitsInit("Type")) {
     switch(getValueFromBitsInit(BI, R)) {
     case 0:
       return "int";
@@ -111,7 +111,7 @@ static std::string getCheckerOptionType(const Record &R) {
 }

 static std::string getDevelopmentStage(const Record &R) {
-  if (BitsInit *BI = R.getValueAsBitsInit("DevelopmentStage")) {
+  if (const BitsInit *BI = R.getValueAsBitsInit("DevelopmentStage")) {
     switch(getValueFromBitsInit(BI, R)) {
     case 0:
       return "alpha";
@@ -131,7 +131,7 @@ static bool isHidden(const Record *R) {
     return true;

   // Not declared as hidden, check the parent package if it is hidden.
-  if (DefInit *DI = dyn_cast<DefInit>(R->getValueInit("ParentPackage")))
+  if (const DefInit *DI = dyn_cast<DefInit>(R->getValueInit("ParentPackage")))
     return isHidden(DI->getDef());

   return false;
diff --git a/clang/utils/TableGen/MveEmitter.cpp b/clang/utils/TableGen/MveEmitter.cpp
index 915e914d6b9287b8b3e06e7690c9c969b7b77c3a..51e570944b49b6a4f3de5307b756d12a2c2aa5c7 100644
--- a/clang/utils/TableGen/MveEmitter.cpp
+++ b/clang/utils/TableGen/MveEmitter.cpp
@@ -1033,15 +1033,15 @@ public:
   // to expand Tablegen classes like 'Vector' which mean something different in
   // each member of a parametric family.
   const Type *getType(const Record *R, const Type *Param);
-  const Type *getType(DagInit *D, const Type *Param);
-  const Type *getType(Init *I, const Type *Param);
+  const Type *getType(const DagInit *D, const Type *Param);
+  const Type *getType(const Init *I, const Type *Param);

   // Functions that translate the Tablegen representation of an intrinsic's
   // code generation into a collection of Value objects (which will then be
   // reprocessed to read out the actual C++ code included by CGBuiltin.cpp).
-  Result::Ptr getCodeForDag(DagInit *D, const Result::Scope &Scope,
+  Result::Ptr getCodeForDag(const DagInit *D, const Result::Scope &Scope,
                             const Type *Param);
-  Result::Ptr getCodeForDagArg(DagInit *D, unsigned ArgNum,
+  Result::Ptr getCodeForDagArg(const DagInit *D, unsigned ArgNum,
                                const Result::Scope &Scope, const Type *Param);
   Result::Ptr getCodeForArg(unsigned ArgNum, const Type *ArgType, bool Promote,
                             bool Immediate);
@@ -1060,10 +1060,10 @@ public:
   void EmitBuiltinAliases(raw_ostream &OS);
 };

-const Type *EmitterBase::getType(Init *I, const Type *Param) {
-  if (auto Dag = dyn_cast<DagInit>(I))
+const Type *EmitterBase::getType(const Init *I, const Type *Param) {
+  if (const auto *Dag = dyn_cast<DagInit>(I))
     return getType(Dag, Param);
-  if (auto Def = dyn_cast<DefInit>(I))
+  if (const auto *Def = dyn_cast<DefInit>(I))
     return getType(Def->getDef(), Param);

   PrintFatalError("Could not convert this value into a type");
@@ -1088,7 +1088,7 @@ const Type *EmitterBase::getType(const Record *R, const Type *Param) {
   PrintFatalError(R->getLoc(), "Could not convert this record into a type");
 }

-const Type *EmitterBase::getType(DagInit *D, const Type *Param) {
+const Type *EmitterBase::getType(const DagInit *D, const Type *Param) {
   // The meat of the getType system: types in the Tablegen are represented by a
   // dag whose operators select sub-cases of this function.

@@ -1156,7 +1156,8 @@ const Type *EmitterBase::getType(DagInit *D, const Type *Param) {
   PrintFatalError("Bad operator in type dag expression");
 }

-Result::Ptr EmitterBase::getCodeForDag(DagInit *D, const Result::Scope &Scope,
+Result::Ptr EmitterBase::getCodeForDag(const DagInit *D,
+                                       const Result::Scope &Scope,
                                        const Type *Param) {
   const Record *Op = cast<DefInit>(D->getOperator())->getDef();

@@ -1199,14 +1200,14 @@ Result::Ptr EmitterBase::getCodeForDag(DagInit *D, const Result::Scope &Scope,
     Result::Ptr Arg = getCodeForDagArg(D, 0, Scope, Param);
     const Type *Ty = nullptr;
-    if (auto *DI = dyn_cast<DagInit>(D->getArg(0)))
+    if (const auto *DI = dyn_cast<DagInit>(D->getArg(0)))
       if (auto *PTy = dyn_cast<PointerType>(getType(DI->getOperator(), Param)))
         Ty = PTy->getPointeeType();
     if (!Ty)
       PrintFatalError("'address' pointer argument should be a pointer");

     unsigned Alignment;
-    if (auto *II = dyn_cast<IntInit>(D->getArg(1))) {
+    if (const auto *II = dyn_cast<IntInit>(D->getArg(1))) {
       Alignment = II->getValue();
     } else {
       PrintFatalError("'address' alignment argument should be an integer");
@@ -1267,10 +1268,10 @@ Result::Ptr EmitterBase::getCodeForDag(DagInit *D, const Result::Scope &Scope,
   }
 }

-Result::Ptr EmitterBase::getCodeForDagArg(DagInit *D, unsigned ArgNum,
+Result::Ptr EmitterBase::getCodeForDagArg(const DagInit *D, unsigned ArgNum,
                                           const Result::Scope &Scope,
                                           const Type *Param) {
-  Init *Arg = D->getArg(ArgNum);
+  const Init *Arg = D->getArg(ArgNum);
   StringRef Name = D->getArgNameStr(ArgNum);

   if (!Name.empty()) {
@@ -1286,18 +1287,18 @@ Result::Ptr EmitterBase::getCodeForDagArg(DagInit *D, unsigned ArgNum,
   // Sometimes the Arg is a bit. Prior to multiclass template argument
   // checking, integers would sneak through the bit declaration,
   // but now they really are bits.
-  if (auto *BI = dyn_cast<BitInit>(Arg))
+  if (const auto *BI = dyn_cast<BitInit>(Arg))
     return std::make_shared<IntLiteralResult>(getScalarType("u32"),
                                               BI->getValue());

-  if (auto *II = dyn_cast<IntInit>(Arg))
+  if (const auto *II = dyn_cast<IntInit>(Arg))
     return std::make_shared<IntLiteralResult>(getScalarType("u32"),
                                               II->getValue());

-  if (auto *DI = dyn_cast<DagInit>(Arg))
+  if (const auto *DI = dyn_cast<DagInit>(Arg))
     return getCodeForDag(DI, Scope, Param);

-  if (auto *DI = dyn_cast<DefInit>(Arg)) {
+  if (const auto *DI = dyn_cast<DefInit>(Arg)) {
     const Record *Rec = DI->getDef();
     if (Rec->isSubClassOf("Type")) {
       const Type *T = getType(Rec, Param);
@@ -1307,7 +1308,7 @@
   PrintError("bad DAG argument type for code generation");
   PrintNote("DAG: " + D->getAsString());
-  if (TypedInit *Typed = dyn_cast<TypedInit>(Arg))
+  if (const auto *Typed = dyn_cast<TypedInit>(Arg))
     PrintNote("argument type: " + Typed->getType()->getAsString());
   PrintFatalNote("argument number " + Twine(ArgNum) + ": " + Arg->getAsString());
 }
@@ -1379,13 +1380,13 @@ ACLEIntrinsic::ACLEIntrinsic(EmitterBase &ME, const Record *R,
   HeaderOnly = R->getValueAsBit("headerOnly");

   // Process the intrinsic's argument list.
-  DagInit *ArgsDag = R->getValueAsDag("args");
+  const DagInit *ArgsDag = R->getValueAsDag("args");
   Result::Scope Scope;
   for (unsigned i = 0, e = ArgsDag->getNumArgs(); i < e; ++i) {
-    Init *TypeInit = ArgsDag->getArg(i);
+    const Init *TypeInit = ArgsDag->getArg(i);

     bool Promote = true;
-    if (auto TypeDI = dyn_cast<DefInit>(TypeInit))
+    if (const auto *TypeDI = dyn_cast<DefInit>(TypeInit))
       if (TypeDI->getDef()->isSubClassOf("unpromoted"))
         Promote = false;
@@ -1397,7 +1398,7 @@ ACLEIntrinsic::ACLEIntrinsic(EmitterBase &ME, const Record *R,
     // If the argument is a subclass of Immediate, record the details about
     // what values it can take, for Sema checking.
     bool Immediate = false;
-    if (auto TypeDI = dyn_cast<DefInit>(TypeInit)) {
+    if (const auto *TypeDI = dyn_cast<DefInit>(TypeInit)) {
       const Record *TypeRec = TypeDI->getDef();
       if (TypeRec->isSubClassOf("Immediate")) {
         Immediate = true;
@@ -1444,7 +1445,7 @@ ACLEIntrinsic::ACLEIntrinsic(EmitterBase &ME, const Record *R,
   // Finally, go through the codegen dag and translate it into a Result object
   // (with an arbitrary DAG of depended-on Results hanging off it).
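// [Editor's note] A minimal, self-contained sketch of the lowering pattern the
// comment above describes: each DAG node becomes a heap-allocated result that
// keeps shared_ptr edges to the results it depends on, so the emitter can later
// walk the graph dependencies-first and print one statement per node. All names
// here (Node, Result, lower) are illustrative, not MveEmitter's actual types.
//
//   #include <memory>
//   #include <string>
//   #include <vector>
//
//   struct Node {                  // stand-in for a TableGen DagInit
//     std::string Op;              // operator name, e.g. "add"
//     std::vector<Node> Args;      // operands, themselves DAGs or leaves
//   };
//
//   struct Result {                // stand-in for the emitter's Result
//     using Ptr = std::shared_ptr<Result>;
//     std::string Code;            // code fragment this node will emit
//     std::vector<Ptr> Deps;       // results this node consumes
//   };
//
//   inline Result::Ptr lower(const Node &N) {
//     auto R = std::make_shared<Result>();
//     R->Code = N.Op;
//     for (const Node &A : N.Args) // recurse: dependencies first
//       R->Deps.push_back(lower(A));
//     return R;
//   }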
-  DagInit *CodeDag = R->getValueAsDag("codegen");
+  const DagInit *CodeDag = R->getValueAsDag("codegen");
   const Record *MainOp = cast<DefInit>(CodeDag->getOperator())->getDef();
   if (MainOp->isSubClassOf("CustomCodegen")) {
     // Or, if it's the special case of CustomCodegen, just accumulate
@@ -1456,9 +1457,9 @@ ACLEIntrinsic::ACLEIntrinsic(EmitterBase &ME, const Record *R,
       StringRef Name = CodeDag->getArgNameStr(i);
       if (Name.empty()) {
         PrintFatalError("Operands to CustomCodegen should have names");
-      } else if (auto *II = dyn_cast<IntInit>(CodeDag->getArg(i))) {
+      } else if (const auto *II = dyn_cast<IntInit>(CodeDag->getArg(i))) {
         CustomCodeGenArgs[std::string(Name)] = itostr(II->getValue());
-      } else if (auto *SI = dyn_cast<StringInit>(CodeDag->getArg(i))) {
+      } else if (const auto *SI = dyn_cast<StringInit>(CodeDag->getArg(i))) {
         CustomCodeGenArgs[std::string(Name)] = std::string(SI->getValue());
       } else {
         PrintFatalError("Operands to CustomCodegen should be integers");
diff --git a/clang/utils/TableGen/NeonEmitter.cpp b/clang/utils/TableGen/NeonEmitter.cpp
index d4b42360e7fd32a07395995a33e66de3d2db1489..adff7c70219bbf0cf4f1d794c3428f68105582f7 100644
--- a/clang/utils/TableGen/NeonEmitter.cpp
+++ b/clang/utils/TableGen/NeonEmitter.cpp
@@ -321,7 +321,7 @@ class Intrinsic {
   ClassKind CK;
   /// The list of DAGs for the body. May be empty, in which case we should
   /// emit a builtin call.
-  ListInit *Body;
+  const ListInit *Body;
   /// The architectural ifdef guard.
   std::string ArchGuard;
   /// The architectural target() guard.
@@ -372,9 +372,9 @@ class Intrinsic {
 public:
   Intrinsic(const Record *R, StringRef Name, StringRef Proto, TypeSpec OutTS,
-            TypeSpec InTS, ClassKind CK, ListInit *Body, NeonEmitter &Emitter,
-            StringRef ArchGuard, StringRef TargetGuard, bool IsUnavailable,
-            bool BigEndianSafe)
+            TypeSpec InTS, ClassKind CK, const ListInit *Body,
+            NeonEmitter &Emitter, StringRef ArchGuard, StringRef TargetGuard,
+            bool IsUnavailable, bool BigEndianSafe)
       : R(R), Name(Name.str()), OutTS(OutTS), InTS(InTS), CK(CK), Body(Body),
         ArchGuard(ArchGuard.str()), TargetGuard(TargetGuard.str()),
         IsUnavailable(IsUnavailable), BigEndianSafe(BigEndianSafe),
@@ -554,19 +554,20 @@ private:
     DagEmitter(Intrinsic &Intr, StringRef CallPrefix) :
       Intr(Intr), CallPrefix(CallPrefix) {
     }
-    std::pair<Type, std::string> emitDagArg(Init *Arg, std::string ArgName);
-    std::pair<Type, std::string> emitDagSaveTemp(DagInit *DI);
-    std::pair<Type, std::string> emitDagSplat(DagInit *DI);
-    std::pair<Type, std::string> emitDagDup(DagInit *DI);
-    std::pair<Type, std::string> emitDagDupTyped(DagInit *DI);
-    std::pair<Type, std::string> emitDagShuffle(DagInit *DI);
-    std::pair<Type, std::string> emitDagCast(DagInit *DI, bool IsBitCast);
-    std::pair<Type, std::string> emitDagCall(DagInit *DI,
+    std::pair<Type, std::string> emitDagArg(const Init *Arg,
+                                            std::string ArgName);
+    std::pair<Type, std::string> emitDagSaveTemp(const DagInit *DI);
+    std::pair<Type, std::string> emitDagSplat(const DagInit *DI);
+    std::pair<Type, std::string> emitDagDup(const DagInit *DI);
+    std::pair<Type, std::string> emitDagDupTyped(const DagInit *DI);
+    std::pair<Type, std::string> emitDagShuffle(const DagInit *DI);
+    std::pair<Type, std::string> emitDagCast(const DagInit *DI, bool IsBitCast);
+    std::pair<Type, std::string> emitDagCall(const DagInit *DI,
                                              bool MatchMangledName);
-    std::pair<Type, std::string> emitDagNameReplace(DagInit *DI);
-    std::pair<Type, std::string> emitDagLiteral(DagInit *DI);
-    std::pair<Type, std::string> emitDagOp(DagInit *DI);
-    std::pair<Type, std::string> emitDag(DagInit *DI);
+    std::pair<Type, std::string> emitDagNameReplace(const DagInit *DI);
+    std::pair<Type, std::string> emitDagLiteral(const DagInit *DI);
+    std::pair<Type, std::string> emitDagOp(const DagInit *DI);
+    std::pair<Type, std::string> emitDag(const DagInit *DI);
   };
 };

@@ -1410,9 +1411,9 @@ void Intrinsic::emitBody(StringRef CallPrefix) {
   // We have a list of "things to output". The last should be returned.
   for (auto *I : Body->getValues()) {
-    if (StringInit *SI = dyn_cast<StringInit>(I)) {
+    if (const auto *SI = dyn_cast<StringInit>(I)) {
       Lines.push_back(replaceParamsIn(SI->getAsString()));
-    } else if (DagInit *DI = dyn_cast<DagInit>(I)) {
+    } else if (const auto *DI = dyn_cast<DagInit>(I)) {
       DagEmitter DE(*this, CallPrefix);
       Lines.push_back(DE.emitDag(DI).second + ";");
     }
@@ -1438,9 +1439,9 @@ void Intrinsic::emitReturn() {
   emitNewLine();
 }

-std::pair<Type, std::string> Intrinsic::DagEmitter::emitDag(DagInit *DI) {
+std::pair<Type, std::string> Intrinsic::DagEmitter::emitDag(const DagInit *DI) {
   // At this point we should only be seeing a def.
-  DefInit *DefI = cast<DefInit>(DI->getOperator());
+  const DefInit *DefI = cast<DefInit>(DI->getOperator());
   std::string Op = DefI->getAsString();

   if (Op == "cast" || Op == "bitcast")
@@ -1467,7 +1468,8 @@ std::pair<Type, std::string> Intrinsic::DagEmitter::emitDag(DagInit *DI) {
   return std::make_pair(Type::getVoid(), "");
 }

-std::pair<Type, std::string> Intrinsic::DagEmitter::emitDagOp(DagInit *DI) {
+std::pair<Type, std::string>
+Intrinsic::DagEmitter::emitDagOp(const DagInit *DI) {
   std::string Op = cast<StringInit>(DI->getArg(0))->getAsUnquotedString();
   if (DI->getNumArgs() == 2) {
     // Unary op.
@@ -1486,7 +1488,7 @@ std::pair<Type, std::string> Intrinsic::DagEmitter::emitDagOp(DagInit *DI) {
 }

 std::pair<Type, std::string>
-Intrinsic::DagEmitter::emitDagCall(DagInit *DI, bool MatchMangledName) {
+Intrinsic::DagEmitter::emitDagCall(const DagInit *DI, bool MatchMangledName) {
   std::vector<Type> Types;
   std::vector<std::string> Values;
   for (unsigned I = 0; I < DI->getNumArgs() - 1; ++I) {
@@ -1498,7 +1500,7 @@ Intrinsic::DagEmitter::emitDagCall(DagInit *DI, bool MatchMangledName) {

   // Look up the called intrinsic.
   std::string N;
-  if (StringInit *SI = dyn_cast<StringInit>(DI->getArg(0)))
+  if (const auto *SI = dyn_cast<StringInit>(DI->getArg(0)))
     N = SI->getAsUnquotedString();
   else
     N = emitDagArg(DI->getArg(0), "").second;
@@ -1529,8 +1531,8 @@ Intrinsic::DagEmitter::emitDagCall(DagInit *DI, bool MatchMangledName) {
   return std::make_pair(Callee.getReturnType(), S);
 }

-std::pair<Type, std::string> Intrinsic::DagEmitter::emitDagCast(DagInit *DI,
-                                                                bool IsBitCast){
+std::pair<Type, std::string>
+Intrinsic::DagEmitter::emitDagCast(const DagInit *DI, bool IsBitCast) {
   // (cast MOD* VAL) -> cast VAL to type given by MOD.
   std::pair<Type, std::string> R =
       emitDagArg(DI->getArg(DI->getNumArgs() - 1),
@@ -1552,7 +1554,7 @@ std::pair<Type, std::string> Intrinsic::DagEmitter::emitDagCast(DagInit *DI,
       castToType =
           Intr.Variables[std::string(DI->getArgNameStr(ArgIdx))].getType();
     } else {
-      StringInit *SI = dyn_cast<StringInit>(DI->getArg(ArgIdx));
+      const auto *SI = dyn_cast<StringInit>(DI->getArg(ArgIdx));
       assert_with_loc(SI, "Expected string type or $Name for cast type");

       if (SI->getAsUnquotedString() == "R") {
@@ -1599,7 +1601,8 @@ std::pair<Type, std::string> Intrinsic::DagEmitter::emitDagCast(DagInit *DI,
   return std::make_pair(castToType, S);
 }

-std::pair<Type, std::string> Intrinsic::DagEmitter::emitDagShuffle(DagInit *DI){
+std::pair<Type, std::string>
+Intrinsic::DagEmitter::emitDagShuffle(const DagInit *DI) {
   // See the documentation in arm_neon.td for a description of these operators.
   class LowHalf : public SetTheory::Operator {
   public:
@@ -1710,7 +1713,8 @@ std::pair<Type, std::string> Intrinsic::DagEmitter::emitDagShuffle(DagInit *DI){
   return std::make_pair(T, S);
 }

-std::pair<Type, std::string> Intrinsic::DagEmitter::emitDagDup(DagInit *DI) {
+std::pair<Type, std::string>
+Intrinsic::DagEmitter::emitDagDup(const DagInit *DI) {
   assert_with_loc(DI->getNumArgs() == 1, "dup() expects one argument");
   std::pair<Type, std::string> A =
       emitDagArg(DI->getArg(0), std::string(DI->getArgNameStr(0)));
@@ -1729,7 +1733,8 @@ std::pair<Type, std::string> Intrinsic::DagEmitter::emitDagDup(DagInit *DI) {
   return std::make_pair(T, S);
 }

-std::pair<Type, std::string> Intrinsic::DagEmitter::emitDagDupTyped(DagInit *DI) {
+std::pair<Type, std::string>
+Intrinsic::DagEmitter::emitDagDupTyped(const DagInit *DI) {
   assert_with_loc(DI->getNumArgs() == 2, "dup_typed() expects two arguments");
   std::pair<Type, std::string> B =
       emitDagArg(DI->getArg(1), std::string(DI->getArgNameStr(1)));
@@ -1737,7 +1742,7 @@ std::pair<Type, std::string> Intrinsic::DagEmitter::emitDagDupTyped(DagInit *DI)
                   "dup_typed() requires a scalar as the second argument");
   Type T;
   // If the type argument is a constant string, construct the type directly.
-  if (StringInit *SI = dyn_cast<StringInit>(DI->getArg(0))) {
+  if (const auto *SI = dyn_cast<StringInit>(DI->getArg(0))) {
     T = Type::fromTypedefName(SI->getAsUnquotedString());
     assert_with_loc(!T.isVoid(), "Unknown typedef");
   } else
@@ -1755,7 +1760,8 @@ std::pair<Type, std::string> Intrinsic::DagEmitter::emitDagDupTyped(DagInit *DI)
   return std::make_pair(T, S);
 }

-std::pair<Type, std::string> Intrinsic::DagEmitter::emitDagSplat(DagInit *DI) {
+std::pair<Type, std::string>
+Intrinsic::DagEmitter::emitDagSplat(const DagInit *DI) {
   assert_with_loc(DI->getNumArgs() == 2, "splat() expects two arguments");
   std::pair<Type, std::string> A =
       emitDagArg(DI->getArg(0), std::string(DI->getArgNameStr(0)));
@@ -1774,7 +1780,8 @@ std::pair<Type, std::string> Intrinsic::DagEmitter::emitDagSplat(DagInit *DI) {
   return std::make_pair(Intr.getBaseType(), S);
 }

-std::pair<Type, std::string> Intrinsic::DagEmitter::emitDagSaveTemp(DagInit *DI) {
+std::pair<Type, std::string>
+Intrinsic::DagEmitter::emitDagSaveTemp(const DagInit *DI) {
   assert_with_loc(DI->getNumArgs() == 2, "save_temp() expects two arguments");
   std::pair<Type, std::string> A =
       emitDagArg(DI->getArg(1), std::string(DI->getArgNameStr(1)));
@@ -1797,7 +1804,7 @@ std::pair<Type, std::string> Intrinsic::DagEmitter::emitDagSaveTemp(DagInit *DI)
 }

 std::pair<Type, std::string>
-Intrinsic::DagEmitter::emitDagNameReplace(DagInit *DI) {
+Intrinsic::DagEmitter::emitDagNameReplace(const DagInit *DI) {
   std::string S = Intr.Name;

   assert_with_loc(DI->getNumArgs() == 2, "name_replace requires 2 arguments!");
@@ -1812,14 +1819,15 @@ Intrinsic::DagEmitter::emitDagNameReplace(DagInit *DI) {
   return std::make_pair(Type::getVoid(), S);
 }

-std::pair<Type, std::string> Intrinsic::DagEmitter::emitDagLiteral(DagInit *DI){
+std::pair<Type, std::string>
+Intrinsic::DagEmitter::emitDagLiteral(const DagInit *DI) {
   std::string Ty = cast<StringInit>(DI->getArg(0))->getAsUnquotedString();
   std::string Value = cast<StringInit>(DI->getArg(1))->getAsUnquotedString();
   return std::make_pair(Type::fromTypedefName(Ty), Value);
 }

 std::pair<Type, std::string>
-Intrinsic::DagEmitter::emitDagArg(Init *Arg, std::string ArgName) {
+Intrinsic::DagEmitter::emitDagArg(const Init *Arg, std::string ArgName) {
   if (!ArgName.empty()) {
     assert_with_loc(!Arg->isComplete(),
                     "Arguments must either be DAGs or names, not both!");
@@ -1830,7 +1838,7 @@ Intrinsic::DagEmitter::emitDagArg(Init *Arg, std::string ArgName) {
   }

   assert(Arg && "Neither ArgName nor Arg?!");
-  DagInit *DI = dyn_cast<DagInit>(Arg);
+  const auto *DI = dyn_cast<DagInit>(Arg);
   assert_with_loc(DI, "Arguments must either be DAGs or names!");

   return emitDag(DI);
@@ -1994,7 +2002,7 @@ void NeonEmitter::createIntrinsic(const Record *R,
   // decent location information even when highly nested.
CurrentRecord = R; - ListInit *Body = OperationRec->getValueAsListInit("Ops"); + const ListInit *Body = OperationRec->getValueAsListInit("Ops"); std::vector TypeSpecs = TypeSpec::fromTypeSpecs(Types); diff --git a/clang/utils/TableGen/SveEmitter.cpp b/clang/utils/TableGen/SveEmitter.cpp index 82bbd04f97b4f9c633a75b16f3b76ad4508e8fbb..1d79cc71dd977b516561558a09d5f1050fab251e 100644 --- a/clang/utils/TableGen/SveEmitter.cpp +++ b/clang/utils/TableGen/SveEmitter.cpp @@ -51,7 +51,7 @@ using TypeSpec = std::string; namespace { class SVEType { - bool Float, Signed, Immediate, Void, Constant, Pointer, BFloat; + bool Float, Signed, Immediate, Void, Constant, Pointer, BFloat, MFloat; bool DefaultType, IsScalable, Predicate, PredicatePattern, PrefetchOp, Svcount; unsigned Bitwidth, ElementBitwidth, NumVectors; @@ -61,10 +61,10 @@ public: SVEType(StringRef TS, char CharMod, unsigned NumVectors = 1) : Float(false), Signed(true), Immediate(false), Void(false), - Constant(false), Pointer(false), BFloat(false), DefaultType(false), - IsScalable(true), Predicate(false), PredicatePattern(false), - PrefetchOp(false), Svcount(false), Bitwidth(128), ElementBitwidth(~0U), - NumVectors(NumVectors) { + Constant(false), Pointer(false), BFloat(false), MFloat(false), + DefaultType(false), IsScalable(true), Predicate(false), + PredicatePattern(false), PrefetchOp(false), Svcount(false), + Bitwidth(128), ElementBitwidth(~0U), NumVectors(NumVectors) { if (!TS.empty()) applyTypespec(TS); applyModifier(CharMod); @@ -82,11 +82,14 @@ public: bool isVector() const { return NumVectors > 0; } bool isScalableVector() const { return isVector() && IsScalable; } bool isFixedLengthVector() const { return isVector() && !IsScalable; } - bool isChar() const { return ElementBitwidth == 8; } + bool isChar() const { return ElementBitwidth == 8 && !MFloat; } bool isVoid() const { return Void && !Pointer; } bool isDefault() const { return DefaultType; } - bool isFloat() const { return Float && !BFloat; } - bool isBFloat() const { return BFloat && !Float; } + bool isFloat() const { return Float && !BFloat && !MFloat; } + bool isBFloat() const { return BFloat && !Float && !MFloat; } + bool isMFloat() const { + return MFloat && !BFloat && !Float; + } bool isFloatingPoint() const { return Float || BFloat; } bool isInteger() const { return !isFloatingPoint() && !Predicate && !Svcount; @@ -454,6 +457,9 @@ std::string SVEType::builtin_str() const { else if (isBFloat()) { assert(ElementBitwidth == 16 && "Not a valid BFloat."); S += "y"; + } else if (isMFloat()) { + assert(ElementBitwidth == 8 && "Not a valid MFloat."); + S += "m"; } if (!isFloatingPoint()) { @@ -509,6 +515,8 @@ std::string SVEType::str() const { S += "bool"; else if (isBFloat()) S += "bfloat"; + else if (isMFloat()) + S += "mfloat"; else S += "int"; @@ -572,8 +580,16 @@ void SVEType::applyTypespec(StringRef TS) { case 'b': BFloat = true; Float = false; + MFloat = false; ElementBitwidth = 16; break; + case 'm': + Signed = false; + MFloat = true; + Float = false; + BFloat = false; + ElementBitwidth = 8; + break; default: llvm_unreachable("Unhandled type code!"); } @@ -1037,6 +1053,8 @@ std::string Intrinsic::replaceTemplatedArgs(std::string Name, TypeSpec TS, TypeCode = 'b'; else if (T.isBFloat()) TypeCode = "bf"; + else if (T.isMFloat()) + TypeCode = "mfp"; else TypeCode = 'f'; Ret.replace(Pos, NumChars, TypeCode + utostr(T.getElementSizeInBits())); @@ -1130,6 +1148,11 @@ uint64_t SVEEmitter::encodeTypeFlags(const SVEType &T) { return encodeEltType("EltTyBFloat16"); } + if 
(T.isMFloat()) { + assert(T.getElementSizeInBits() == 8 && "Not a valid MFloat."); + return encodeEltType("EltTyMFloat8"); + } + if (T.isPredicateVector() || T.isSvcount()) { switch (T.getElementSizeInBits()) { case 8: @@ -1305,6 +1328,8 @@ void SVEEmitter::createHeader(raw_ostream &OS) { OS << "#include \n"; OS << "#include \n"; + OS << "typedef __SVMfloat8_t svmfloat8_t;\n\n"; + OS << "typedef __SVFloat32_t svfloat32_t;\n"; OS << "typedef __SVFloat64_t svfloat64_t;\n"; OS << "typedef __clang_svint8x2_t svint8x2_t;\n"; diff --git a/cmake/Modules/CMakePolicy.cmake b/cmake/Modules/CMakePolicy.cmake index 665af01d43bd2475064c654b4615387086a41d02..f19dfd7165717116573e06ca8a95d43d8fa0c0ed 100644 --- a/cmake/Modules/CMakePolicy.cmake +++ b/cmake/Modules/CMakePolicy.cmake @@ -1,10 +1,5 @@ # CMake policy settings shared between LLVM projects -# CMP0114: ExternalProject step targets fully adopt their steps. -# New in CMake 3.19: https://cmake.org/cmake/help/latest/policy/CMP0114.html -if(POLICY CMP0114) - cmake_policy(SET CMP0114 OLD) -endif() # CMP0116: Ninja generators transform `DEPFILE`s from `add_custom_command()` # New in CMake 3.20. https://cmake.org/cmake/help/latest/policy/CMP0116.html if(POLICY CMP0116) diff --git a/compiler-rt/include/profile/InstrProfData.inc b/compiler-rt/include/profile/InstrProfData.inc index b9df3266fbcf8fa188ec5748aefaa768f9803d60..c66b0465a0b54854c28dd92729bb812c49278135 100644 --- a/compiler-rt/include/profile/InstrProfData.inc +++ b/compiler-rt/include/profile/InstrProfData.inc @@ -303,6 +303,18 @@ COVMAP_HEADER(uint32_t, Int32Ty, Version, \ #undef COVMAP_HEADER /* COVMAP_HEADER end. */ +/* COVINIT_FUNC start */ +#ifndef COVINIT_FUNC +#define COVINIT_FUNC(Type, LLVMType, Name, Initializer) +#else +#define INSTR_PROF_DATA_DEFINED +#endif +COVINIT_FUNC(IntPtrT, llvm::PointerType::getUnqual(Ctx), WriteoutFunction, \ + WriteoutF) +COVINIT_FUNC(IntPtrT, llvm::PointerType::getUnqual(Ctx), ResetFunction, \ + ResetF) +#undef COVINIT_FUNC +/* COVINIT_FUNC end */ #ifdef INSTR_PROF_SECT_ENTRY #define INSTR_PROF_DATA_DEFINED @@ -345,6 +357,9 @@ INSTR_PROF_SECT_ENTRY(IPSK_covdata, \ INSTR_PROF_SECT_ENTRY(IPSK_covname, \ INSTR_PROF_QUOTE(INSTR_PROF_COVNAME_COMMON), \ INSTR_PROF_COVNAME_COFF, "__LLVM_COV,") +INSTR_PROF_SECT_ENTRY(IPSK_covinit, \ + INSTR_PROF_QUOTE(INSTR_PROF_COVINIT_COMMON), \ + INSTR_PROF_COVINIT_COFF, "__LLVM_COV,") #undef INSTR_PROF_SECT_ENTRY #endif @@ -761,6 +776,8 @@ serializeValueProfDataFrom(ValueProfRecordClosure *Closure, #define INSTR_PROF_COVDATA_COMMON __llvm_covdata #define INSTR_PROF_COVNAME_COMMON __llvm_covnames #define INSTR_PROF_ORDERFILE_COMMON __llvm_orderfile +#define INSTR_PROF_COVINIT_COMMON __llvm_covinit + /* Windows section names. Because these section names contain dollar characters, * they must be quoted. */ @@ -781,6 +798,10 @@ serializeValueProfDataFrom(ValueProfRecordClosure *Closure, #define INSTR_PROF_COVNAME_COFF ".lcovn" #define INSTR_PROF_ORDERFILE_COFF ".lorderfile$M" +// FIXME: Placeholder for Windows. Windows currently does not initialize +// the GCOV functions in the runtime. +#define INSTR_PROF_COVINIT_COFF ".lcovd$M" + #ifdef _WIN32 /* Runtime section names and name strings. 
*/ #define INSTR_PROF_DATA_SECT_NAME INSTR_PROF_DATA_COFF @@ -800,6 +821,7 @@ serializeValueProfDataFrom(ValueProfRecordClosure *Closure, #define INSTR_PROF_COVDATA_SECT_NAME INSTR_PROF_COVDATA_COFF #define INSTR_PROF_COVNAME_SECT_NAME INSTR_PROF_COVNAME_COFF #define INSTR_PROF_ORDERFILE_SECT_NAME INSTR_PROF_ORDERFILE_COFF +#define INSTR_PROF_COVINIT_SECT_NAME INSTR_PROF_COVINIT_COFF #else /* Runtime section names and name strings. */ #define INSTR_PROF_DATA_SECT_NAME INSTR_PROF_QUOTE(INSTR_PROF_DATA_COMMON) @@ -821,6 +843,7 @@ serializeValueProfDataFrom(ValueProfRecordClosure *Closure, /* Order file instrumentation. */ #define INSTR_PROF_ORDERFILE_SECT_NAME \ INSTR_PROF_QUOTE(INSTR_PROF_ORDERFILE_COMMON) +#define INSTR_PROF_COVINIT_SECT_NAME INSTR_PROF_QUOTE(INSTR_PROF_COVINIT_COMMON) #endif #define INSTR_PROF_ORDERFILE_BUFFER_NAME _llvm_order_file_buffer diff --git a/compiler-rt/lib/fuzzer/FuzzerExtFunctionsWindows.cpp b/compiler-rt/lib/fuzzer/FuzzerExtFunctionsWindows.cpp index 688bad1d51ca5bc3454b7b7a6d8bc0c8f52b676d..dfc32ac9db2979b9f8f686a250c4013e84db4c26 100644 --- a/compiler-rt/lib/fuzzer/FuzzerExtFunctionsWindows.cpp +++ b/compiler-rt/lib/fuzzer/FuzzerExtFunctionsWindows.cpp @@ -22,6 +22,11 @@ using namespace fuzzer; #define STRINGIFY(A) STRINGIFY_(A) #if LIBFUZZER_MSVC +#define GET_FUNCTION_ADDRESS(fn) &fn +#else +#define GET_FUNCTION_ADDRESS(fn) __builtin_function_start(fn) +#endif // LIBFUZER_MSVC + // Copied from compiler-rt/lib/sanitizer_common/sanitizer_win_defs.h #if defined(_M_IX86) || defined(__i386__) #define WIN_SYM_PREFIX "_" @@ -31,17 +36,9 @@ using namespace fuzzer; // Declare external functions as having alternativenames, so that we can // determine if they are not defined. -#define EXTERNAL_FUNC(Name, Default) \ - __pragma(comment(linker, "/alternatename:" WIN_SYM_PREFIX STRINGIFY( \ +#define EXTERNAL_FUNC(Name, Default) \ + __pragma(comment(linker, "/alternatename:" WIN_SYM_PREFIX STRINGIFY( \ Name) "=" WIN_SYM_PREFIX STRINGIFY(Default))) -#else -// Declare external functions as weak to allow them to default to a specified -// function if not defined explicitly. We must use weak symbols because clang's -// support for alternatename is not 100%, see -// https://bugs.llvm.org/show_bug.cgi?id=40218 for more details. 
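// [Editor's note] A condensed, self-contained sketch of the "default stub"
// idiom the comment above discusses; the hook names are illustrative, not
// libFuzzer's. Each optional user hook gets a fallback definition, the real
// symbol is bound to the fallback when the user supplies nothing (a weak
// alias here; /alternatename under MSVC, as in the macro below), and "is the
// hook present?" reduces to an address comparison. The patch switches the
// non-MSVC address computation to Clang's __builtin_function_start() because,
// with identical-code folding, &fn may not name the function's entry point.
//
//   extern "C" void UserHookDef() {}  // fallback when no user definition
//   extern "C" void UserHook() __attribute__((weak, alias("UserHookDef")));
//
//   static bool UserProvidedHook() {
//     return __builtin_function_start(UserHook) !=
//            __builtin_function_start(UserHookDef);
//   }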
-#define EXTERNAL_FUNC(Name, Default) \
-  __attribute__((weak, alias(STRINGIFY(Default))))
-#endif  // LIBFUZZER_MSVC

 extern "C" {
 #define EXT_FUNC(NAME, RETURN_TYPE, FUNC_SIG, WARN) \
@@ -57,20 +54,23 @@ extern "C" {
 }

 template <typename T>
-static T *GetFnPtr(T *Fun, T *FunDef, const char *FnName, bool WarnIfMissing) {
+static T *GetFnPtr(void *Fun, void *FunDef, const char *FnName,
+                   bool WarnIfMissing) {
   if (Fun == FunDef) {
     if (WarnIfMissing)
       Printf("WARNING: Failed to find function \"%s\".\n", FnName);
     return nullptr;
   }
-  return Fun;
+  return (T *)Fun;
 }

 namespace fuzzer {

 ExternalFunctions::ExternalFunctions() {
-#define EXT_FUNC(NAME, RETURN_TYPE, FUNC_SIG, WARN) \
-  this->NAME = GetFnPtr(::NAME, ::NAME##Def, #NAME, WARN);
+#define EXT_FUNC(NAME, RETURN_TYPE, FUNC_SIG, WARN) \
+  this->NAME = GetFnPtr<decltype(::NAME)>(GET_FUNCTION_ADDRESS(::NAME), \
+                                          GET_FUNCTION_ADDRESS(::NAME##Def), \
+                                          #NAME, WARN);

 #include "FuzzerExtFunctions.def"
diff --git a/compiler-rt/lib/lsan/lsan_common.cpp b/compiler-rt/lib/lsan/lsan_common.cpp
index c05e0dd0a9332da405de368ca19fcdb416278c48..721db7872cce8312ebce0ae0b4fae8dc2ba52758 100644
--- a/compiler-rt/lib/lsan/lsan_common.cpp
+++ b/compiler-rt/lib/lsan/lsan_common.cpp
@@ -399,26 +399,123 @@ static void ProcessThreadRegistry(Frontier *frontier) {
 }

 // Scans thread data (stacks and TLS) for heap pointers.
+static void ProcessThread(tid_t os_id, uptr sp,
+                          const InternalMmapVector<uptr> &registers,
+                          InternalMmapVector<Range> &extra_ranges,
+                          Frontier *frontier) {
+  // `extra_ranges` is outside of the function and the loop to reuse mapped
+  // memory.
+  CHECK(extra_ranges.empty());
+  LOG_THREADS("Processing thread %llu.\n", os_id);
+  uptr stack_begin, stack_end, tls_begin, tls_end, cache_begin, cache_end;
+  DTLS *dtls;
+  bool thread_found =
+      GetThreadRangesLocked(os_id, &stack_begin, &stack_end, &tls_begin,
+                            &tls_end, &cache_begin, &cache_end, &dtls);
+  if (!thread_found) {
+    // If a thread can't be found in the thread registry, it's probably in the
+    // process of destruction. Log this event and move on.
+    LOG_THREADS("Thread %llu not found in registry.\n", os_id);
+    return;
+  }
+
+  if (!sp)
+    sp = stack_begin;
+
+  if (flags()->use_registers) {
+    uptr registers_begin = reinterpret_cast<uptr>(registers.data());
+    uptr registers_end =
+        reinterpret_cast<uptr>(registers.data() + registers.size());
+    ScanRangeForPointers(registers_begin, registers_end, frontier, "REGISTERS",
+                         kReachable);
+  }
+
+  if (flags()->use_stacks) {
+    LOG_THREADS("Stack at %p-%p (SP = %p).\n", (void *)stack_begin,
+                (void *)stack_end, (void *)sp);
+    if (sp < stack_begin || sp >= stack_end) {
+      // SP is outside the recorded stack range (e.g. the thread is running a
+      // signal handler on alternate stack, or swapcontext was used).
+      // Again, consider the entire stack range to be reachable.
+      LOG_THREADS("WARNING: stack pointer not in stack range.\n");
+      uptr page_size = GetPageSizeCached();
+      int skipped = 0;
+      while (stack_begin < stack_end &&
+             !IsAccessibleMemoryRange(stack_begin, 1)) {
+        skipped++;
+        stack_begin += page_size;
+      }
+      LOG_THREADS("Skipped %d guard page(s) to obtain stack %p-%p.\n", skipped,
+                  (void *)stack_begin, (void *)stack_end);
+    } else {
+      // Shrink the stack range to ignore out-of-scope values.
+      stack_begin = sp;
+    }
+    ScanRangeForPointers(stack_begin, stack_end, frontier, "STACK", kReachable);
+    GetThreadExtraStackRangesLocked(os_id, &extra_ranges);
+    ScanExtraStackRanges(extra_ranges, frontier);
+  }
+
+  if (flags()->use_tls) {
+    if (tls_begin) {
+      LOG_THREADS("TLS at %p-%p.\n", (void *)tls_begin, (void *)tls_end);
+      // If the tls and cache ranges don't overlap, scan full tls range,
+      // otherwise, only scan the non-overlapping portions
+      if (cache_begin == cache_end || tls_end < cache_begin ||
+          tls_begin > cache_end) {
+        ScanRangeForPointers(tls_begin, tls_end, frontier, "TLS", kReachable);
+      } else {
+        if (tls_begin < cache_begin)
+          ScanRangeForPointers(tls_begin, cache_begin, frontier, "TLS",
+                               kReachable);
+        if (tls_end > cache_end)
+          ScanRangeForPointers(cache_end, tls_end, frontier, "TLS", kReachable);
+      }
+    }
+# if SANITIZER_ANDROID
+    auto *cb = +[](void *dtls_begin, void *dtls_end, uptr /*dso_idd*/,
+                   void *arg) -> void {
+      ScanRangeForPointers(
+          reinterpret_cast<uptr>(dtls_begin), reinterpret_cast<uptr>(dtls_end),
+          reinterpret_cast<Frontier *>(arg), "DTLS", kReachable);
+    };
+
+    // FIXME: There might be a race-condition here (and in Bionic) if the
+    // thread is suspended in the middle of updating its DTLS. IOWs, we
+    // could scan already freed memory. (probably fine for now)
+    __libc_iterate_dynamic_tls(os_id, cb, frontier);
+# else
+    if (dtls && !DTLSInDestruction(dtls)) {
+      ForEachDVT(dtls, [&](const DTLS::DTV &dtv, int id) {
+        uptr dtls_beg = dtv.beg;
+        uptr dtls_end = dtls_beg + dtv.size;
+        if (dtls_beg < dtls_end) {
+          LOG_THREADS("DTLS %d at %p-%p.\n", id, (void *)dtls_beg,
+                      (void *)dtls_end);
+          ScanRangeForPointers(dtls_beg, dtls_end, frontier, "DTLS",
+                               kReachable);
+        }
+      });
+    } else {
+      // We are handling a thread with DTLS under destruction. Log about
+      // this and continue.
+      LOG_THREADS("Thread %llu has DTLS under destruction.\n", os_id);
+    }
+# endif
+  }
+}
+
 static void ProcessThreads(SuspendedThreadsList const &suspended_threads,
                            Frontier *frontier, tid_t caller_tid,
                            uptr caller_sp) {
   InternalMmapVector<uptr> registers;
   InternalMmapVector<Range> extra_ranges;
   for (uptr i = 0; i < suspended_threads.ThreadCount(); i++) {
-    tid_t os_id = static_cast<tid_t>(suspended_threads.GetThreadID(i));
-    LOG_THREADS("Processing thread %llu.\n", os_id);
-    uptr stack_begin, stack_end, tls_begin, tls_end, cache_begin, cache_end;
-    DTLS *dtls;
-    bool thread_found =
-        GetThreadRangesLocked(os_id, &stack_begin, &stack_end, &tls_begin,
-                              &tls_end, &cache_begin, &cache_end, &dtls);
-    if (!thread_found) {
-      // If a thread can't be found in the thread registry, it's probably in the
-      // process of destruction. Log this event and move on.
-      LOG_THREADS("Thread %llu not found in registry.\n", os_id);
-      continue;
-    }
-    uptr sp;
+    registers.clear();
+    extra_ranges.clear();
+
+    const tid_t os_id = suspended_threads.GetThreadID(i);
+    uptr sp = 0;
     PtraceRegistersStatus have_registers =
         suspended_threads.GetRegistersAndSP(i, &registers, &sp);
     if (have_registers != REGISTERS_AVAILABLE) {
@@ -427,97 +524,13 @@ static void ProcessThreads(SuspendedThreadsList const &suspended_threads,
       // GetRegistersAndSP failed with ESRCH.
if (have_registers == REGISTERS_UNAVAILABLE_FATAL) continue; - sp = stack_begin; - } - if (suspended_threads.GetThreadID(i) == caller_tid) { - sp = caller_sp; + sp = 0; } - if (flags()->use_registers && have_registers) { - uptr registers_begin = reinterpret_cast(registers.data()); - uptr registers_end = - reinterpret_cast(registers.data() + registers.size()); - ScanRangeForPointers(registers_begin, registers_end, frontier, - "REGISTERS", kReachable); - } - - if (flags()->use_stacks) { - LOG_THREADS("Stack at %p-%p (SP = %p).\n", (void *)stack_begin, - (void *)stack_end, (void *)sp); - if (sp < stack_begin || sp >= stack_end) { - // SP is outside the recorded stack range (e.g. the thread is running a - // signal handler on alternate stack, or swapcontext was used). - // Again, consider the entire stack range to be reachable. - LOG_THREADS("WARNING: stack pointer not in stack range.\n"); - uptr page_size = GetPageSizeCached(); - int skipped = 0; - while (stack_begin < stack_end && - !IsAccessibleMemoryRange(stack_begin, 1)) { - skipped++; - stack_begin += page_size; - } - LOG_THREADS("Skipped %d guard page(s) to obtain stack %p-%p.\n", - skipped, (void *)stack_begin, (void *)stack_end); - } else { - // Shrink the stack range to ignore out-of-scope values. - stack_begin = sp; - } - ScanRangeForPointers(stack_begin, stack_end, frontier, "STACK", - kReachable); - extra_ranges.clear(); - GetThreadExtraStackRangesLocked(os_id, &extra_ranges); - ScanExtraStackRanges(extra_ranges, frontier); - } + if (os_id == caller_tid) + sp = caller_sp; - if (flags()->use_tls) { - if (tls_begin) { - LOG_THREADS("TLS at %p-%p.\n", (void *)tls_begin, (void *)tls_end); - // If the tls and cache ranges don't overlap, scan full tls range, - // otherwise, only scan the non-overlapping portions - if (cache_begin == cache_end || tls_end < cache_begin || - tls_begin > cache_end) { - ScanRangeForPointers(tls_begin, tls_end, frontier, "TLS", kReachable); - } else { - if (tls_begin < cache_begin) - ScanRangeForPointers(tls_begin, cache_begin, frontier, "TLS", - kReachable); - if (tls_end > cache_end) - ScanRangeForPointers(cache_end, tls_end, frontier, "TLS", - kReachable); - } - } -# if SANITIZER_ANDROID - auto *cb = +[](void *dtls_begin, void *dtls_end, uptr /*dso_idd*/, - void *arg) -> void { - ScanRangeForPointers(reinterpret_cast(dtls_begin), - reinterpret_cast(dtls_end), - reinterpret_cast(arg), "DTLS", - kReachable); - }; - - // FIXME: There might be a race-condition here (and in Bionic) if the - // thread is suspended in the middle of updating its DTLS. IOWs, we - // could scan already freed memory. (probably fine for now) - __libc_iterate_dynamic_tls(os_id, cb, frontier); -# else - if (dtls && !DTLSInDestruction(dtls)) { - ForEachDVT(dtls, [&](const DTLS::DTV &dtv, int id) { - uptr dtls_beg = dtv.beg; - uptr dtls_end = dtls_beg + dtv.size; - if (dtls_beg < dtls_end) { - LOG_THREADS("DTLS %d at %p-%p.\n", id, (void *)dtls_beg, - (void *)dtls_end); - ScanRangeForPointers(dtls_beg, dtls_end, frontier, "DTLS", - kReachable); - } - }); - } else { - // We are handling a thread with DTLS under destruction. Log about - // this and continue. 
-      LOG_THREADS("Thread %llu has DTLS under destruction.\n", os_id);
-    }
-# endif
-    }
+    ProcessThread(os_id, sp, registers, extra_ranges, frontier);
   }

   // Add pointers reachable from ThreadContexts
@@ -712,11 +725,11 @@ static bool ReportUnsuspendedThreads(

   Sort(threads.data(), threads.size());

-  InternalMmapVector<tid_t> unsuspended;
-  GetRunningThreadsLocked(&unsuspended);
+  InternalMmapVector<tid_t> known_threads;
+  GetRunningThreadsLocked(&known_threads);

   bool succeded = true;
-  for (auto os_id : unsuspended) {
+  for (auto os_id : known_threads) {
     uptr i = InternalLowerBound(threads, os_id);
     if (i >= threads.size() || threads[i] != os_id) {
       succeded = false;
diff --git a/compiler-rt/lib/orc/dlfcn_wrapper.cpp b/compiler-rt/lib/orc/dlfcn_wrapper.cpp
index bbbc79f607f27053e8cd3b8cb88d762b41d2dc3d..dec8d1e5bbc31e8457ffd630ad97ffcf43e431e6 100644
--- a/compiler-rt/lib/orc/dlfcn_wrapper.cpp
+++ b/compiler-rt/lib/orc/dlfcn_wrapper.cpp
@@ -20,7 +20,7 @@ using namespace orc_rt;

 extern "C" const char *__orc_rt_jit_dlerror();
 extern "C" void *__orc_rt_jit_dlopen(const char *path, int mode);
-extern "C" int __orc_rt_jit_dlupdate(void *dso_handle, int mode);
+extern "C" int __orc_rt_jit_dlupdate(void *dso_handle);
 extern "C" int __orc_rt_jit_dlclose(void *dso_handle);

 ORC_RT_INTERFACE orc_rt_CWrapperFunctionResult
@@ -45,10 +45,10 @@ __orc_rt_jit_dlopen_wrapper(const char *ArgData, size_t ArgSize) {
 #ifdef __APPLE__
 ORC_RT_INTERFACE orc_rt_CWrapperFunctionResult
 __orc_rt_jit_dlupdate_wrapper(const char *ArgData, size_t ArgSize) {
-  return WrapperFunction<int32_t(SPSExecutorAddr, int32_t)>::handle(
+  return WrapperFunction<int32_t(SPSExecutorAddr)>::handle(
              ArgData, ArgSize,
-             [](ExecutorAddr &DSOHandle, int32_t mode) {
-               return __orc_rt_jit_dlupdate(DSOHandle.toPtr<void *>(), mode);
+             [](ExecutorAddr &DSOHandle) {
+               return __orc_rt_jit_dlupdate(DSOHandle.toPtr<void *>());
             })
       .release();
 }
diff --git a/compiler-rt/lib/orc/macho_platform.cpp b/compiler-rt/lib/orc/macho_platform.cpp
index afd90c791ae1357111f635ffc32ae9da205bf05f..8ca68587aeb363f057d8388b095364ce9aac1dc6 100644
--- a/compiler-rt/lib/orc/macho_platform.cpp
+++ b/compiler-rt/lib/orc/macho_platform.cpp
@@ -245,7 +245,7 @@ public:

   const char *dlerror();
   void *dlopen(std::string_view Name, int Mode);
-  int dlupdate(void *DSOHandle, int Mode);
+  int dlupdate(void *DSOHandle);
   int dlclose(void *DSOHandle);
   void *dlsym(void *DSOHandle, const char *Symbol);

@@ -295,7 +295,7 @@ private:
   Error dlopenInitialize(std::unique_lock<std::mutex> &JDStatesLock,
                          JITDylibState &JDS, MachOJITDylibDepInfoMap &DepInfo);
-  Error dlupdateImpl(void *DSOHandle, int Mode);
+  Error dlupdateImpl(void *DSOHandle);
   Error dlupdateFull(std::unique_lock<std::mutex> &JDStatesLock,
                      JITDylibState &JDS);
   Error dlupdateInitialize(std::unique_lock<std::mutex> &JDStatesLock,
@@ -710,13 +710,13 @@ void *MachOPlatformRuntimeState::dlopen(std::string_view Path, int Mode) {
   }
 }

-int MachOPlatformRuntimeState::dlupdate(void *DSOHandle, int Mode) {
+int MachOPlatformRuntimeState::dlupdate(void *DSOHandle) {
   ORC_RT_DEBUG({
     std::string S;
     printdbg("MachOPlatform::dlupdate(%p) (%s)\n", DSOHandle, S.c_str());
   });
   std::lock_guard<std::mutex> Lock(DyldAPIMutex);
-  if (auto Err = dlupdateImpl(DSOHandle, Mode)) {
+  if (auto Err = dlupdateImpl(DSOHandle)) {
     // FIXME: Make dlerror thread safe.
DLFcnError = toString(std::move(Err)); return -1; @@ -1179,7 +1179,7 @@ Error MachOPlatformRuntimeState::dlopenInitialize( return Error::success(); } -Error MachOPlatformRuntimeState::dlupdateImpl(void *DSOHandle, int Mode) { +Error MachOPlatformRuntimeState::dlupdateImpl(void *DSOHandle) { std::unique_lock Lock(JDStatesMutex); // Try to find JITDylib state by DSOHandle. @@ -1513,8 +1513,8 @@ void *__orc_rt_macho_jit_dlopen(const char *path, int mode) { return MachOPlatformRuntimeState::get().dlopen(path, mode); } -int __orc_rt_macho_jit_dlupdate(void *dso_handle, int mode) { - return MachOPlatformRuntimeState::get().dlupdate(dso_handle, mode); +int __orc_rt_macho_jit_dlupdate(void *dso_handle) { + return MachOPlatformRuntimeState::get().dlupdate(dso_handle); } int __orc_rt_macho_jit_dlclose(void *dso_handle) { diff --git a/compiler-rt/lib/orc/macho_platform.h b/compiler-rt/lib/orc/macho_platform.h index ad70c97809d2f6939766a406c3c584d82f5c101c..aeab248f7f8ae461e09416d68f4c86ae36cc233f 100644 --- a/compiler-rt/lib/orc/macho_platform.h +++ b/compiler-rt/lib/orc/macho_platform.h @@ -24,7 +24,7 @@ ORC_RT_INTERFACE void __orc_rt_macho_cxa_finalize(void *dso_handle); // dlfcn functions. ORC_RT_INTERFACE const char *__orc_rt_macho_jit_dlerror(); ORC_RT_INTERFACE void *__orc_rt_macho_jit_dlopen(const char *path, int mode); -ORC_RT_INTERFACE int __orc_rt_macho_jit_dlupdate(void *dso_handle, int mode); +ORC_RT_INTERFACE int __orc_rt_macho_jit_dlupdate(void *dso_handle); ORC_RT_INTERFACE int __orc_rt_macho_jit_dlclose(void *dso_handle); ORC_RT_INTERFACE void *__orc_rt_macho_jit_dlsym(void *dso_handle, const char *symbol); diff --git a/compiler-rt/lib/profile/GCDAProfiling.c b/compiler-rt/lib/profile/GCDAProfiling.c index f67d95d21a7b541c1b709fd4a944ef10b94a11a7..ac01805e70adc4711bbf37b9ec79ea8462a8b8d7 100644 --- a/compiler-rt/lib/profile/GCDAProfiling.c +++ b/compiler-rt/lib/profile/GCDAProfiling.c @@ -624,6 +624,25 @@ void llvm_gcov_init(fn_ptr wfn, fn_ptr rfn) { } } +#if defined(_AIX) +COMPILER_RT_VISIBILITY __attribute__((constructor)) void +__llvm_profile_gcov_initialize() { + const __llvm_gcov_init_func_struct *InitFuncStart = + __llvm_profile_begin_covinit(); + const __llvm_gcov_init_func_struct *InitFuncEnd = + __llvm_profile_end_covinit(); + + for (const __llvm_gcov_init_func_struct *Ptr = InitFuncStart; + Ptr != InitFuncEnd; ++Ptr) { + fn_ptr wfn = (fn_ptr)Ptr->WriteoutFunction; + fn_ptr rfn = (fn_ptr)Ptr->ResetFunction; + if (!(wfn && rfn)) + continue; + llvm_gcov_init(wfn, rfn); + } +} +#endif + void __gcov_dump(void) { for (struct fn_node *f = writeout_fn_list.head; f; f = f->next) f->fn(); diff --git a/compiler-rt/lib/profile/InstrProfiling.h b/compiler-rt/lib/profile/InstrProfiling.h index 9e43fd7c4789d8199430842811d08cfc54b3724d..7f0c0c194dc91950832320aa4fefe1dbd199c0de 100644 --- a/compiler-rt/lib/profile/InstrProfiling.h +++ b/compiler-rt/lib/profile/InstrProfiling.h @@ -54,6 +54,12 @@ typedef struct COMPILER_RT_ALIGNAS(INSTR_PROF_DATA_ALIGNMENT) VTableProfData { #include "profile/InstrProfData.inc" } VTableProfData; +typedef struct COMPILER_RT_ALIGNAS(INSTR_PROF_DATA_ALIGNMENT) + __llvm_gcov_init_func_struct { +#define COVINIT_FUNC(Type, LLVMType, Name, Initializer) Type Name; +#include "profile/InstrProfData.inc" +} __llvm_gcov_init_func_struct; + /*! * \brief Return 1 if profile counters are continuously synced to the raw * profile via an mmap(). This is in contrast to the default mode, in which @@ -208,6 +214,9 @@ void __llvm_profile_initialize_file(void); /*! 
\brief Initialize the profile runtime. */ void __llvm_profile_initialize(void); +/*! \brief Initialize the gcov profile runtime. */ +void __llvm_profile_gcov_initialize(void); + /*! * \brief Return path prefix (excluding the base filename) of the profile data. * This is useful for users using \c -fprofile-generate=./path_prefix who do @@ -324,4 +333,6 @@ COMPILER_RT_VISIBILITY extern uint64_t */ extern char INSTR_PROF_PROFILE_NAME_VAR[1]; /* __llvm_profile_filename. */ +const __llvm_gcov_init_func_struct *__llvm_profile_begin_covinit(); +const __llvm_gcov_init_func_struct *__llvm_profile_end_covinit(); #endif /* PROFILE_INSTRPROFILING_H_ */ diff --git a/compiler-rt/lib/profile/InstrProfilingMerge.c b/compiler-rt/lib/profile/InstrProfilingMerge.c index c0706b73e16687e209f1046597dbcc7319a20ed5..7cf1679811eb0701fede961784283382d8ccf1c0 100644 --- a/compiler-rt/lib/profile/InstrProfilingMerge.c +++ b/compiler-rt/lib/profile/InstrProfilingMerge.c @@ -154,7 +154,8 @@ int __llvm_profile_merge_from_buffer(const char *ProfileData, SrcCountersStart = (char *)SrcDataEnd; SrcCountersEnd = SrcCountersStart + Header->NumCounters * __llvm_profile_counter_entry_size(); - SrcBitmapStart = SrcCountersEnd; + SrcBitmapStart = SrcCountersEnd + __llvm_profile_get_num_padding_bytes( + SrcCountersEnd - SrcCountersStart); SrcNameStart = SrcBitmapStart + Header->NumBitmapBytes; SrcValueProfDataStart = SrcNameStart + getDistanceFromCounterToValueProf(Header); diff --git a/compiler-rt/lib/profile/InstrProfilingPlatformAIX.c b/compiler-rt/lib/profile/InstrProfilingPlatformAIX.c index b9d51b698b414fc820ba512300ca52417dc74554..651f8785d0b940b896732851becec034fd271efc 100644 --- a/compiler-rt/lib/profile/InstrProfilingPlatformAIX.c +++ b/compiler-rt/lib/profile/InstrProfilingPlatformAIX.c @@ -202,6 +202,8 @@ static int dummy_vname[0] COMPILER_RT_SECTION( COMPILER_RT_SEG INSTR_PROF_VNAME_SECT_NAME); static int dummy_vtab[0] COMPILER_RT_SECTION( COMPILER_RT_SEG INSTR_PROF_VTAB_SECT_NAME); +static int dummy_covinit_funcs[0] COMPILER_RT_SECTION( + COMPILER_RT_SEG INSTR_PROF_COVINIT_SECT_NAME); // To avoid GC'ing of the dummy variables by the linker, reference them in an // array and reference the array in the runtime registration code @@ -214,7 +216,8 @@ COMPILER_RT_VISIBILITY void *__llvm_profile_keep[] = {(void *)&dummy_cnts, (void *)&dummy_bits, (void *)&dummy_data, (void *)&dummy_name, (void *)&dummy_vnds, (void *)&dummy_orderfile, - (void *)&dummy_vname, (void *)&dummy_vtab}; + (void *)&dummy_vname, (void *)&dummy_vtab, + (void *)&dummy_covinit_funcs}; #ifdef __GNUC__ #pragma GCC diagnostic pop #endif diff --git a/compiler-rt/lib/profile/InstrProfilingPlatformLinux.c b/compiler-rt/lib/profile/InstrProfilingPlatformLinux.c index 02f23379ce98bfeacd2f408e3435fd79c30f1e49..e2c06d51e0c67cb4fc2b18047bc7dda698be9b61 100644 --- a/compiler-rt/lib/profile/InstrProfilingPlatformLinux.c +++ b/compiler-rt/lib/profile/InstrProfilingPlatformLinux.c @@ -35,6 +35,8 @@ #define PROF_ORDERFILE_START INSTR_PROF_SECT_START(INSTR_PROF_ORDERFILE_COMMON) #define PROF_VNODES_START INSTR_PROF_SECT_START(INSTR_PROF_VNODES_COMMON) #define PROF_VNODES_STOP INSTR_PROF_SECT_STOP(INSTR_PROF_VNODES_COMMON) +#define PROF_COVINIT_START INSTR_PROF_SECT_START(INSTR_PROF_COVINIT_COMMON) +#define PROF_COVINIT_STOP INSTR_PROF_SECT_STOP(INSTR_PROF_COVINIT_COMMON) /* Declare section start and stop symbols for various sections * generated by compiler instrumentation. 
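The PROF_*_START/STOP symbols used in this file rely on a linker convention: for each named section, GNU-style ELF linkers synthesize __start_&lt;section&gt; and __stop_&lt;section&gt; symbols that bracket everything placed in it, which is what lets the runtime walk records the compiler emitted into a section. A standalone sketch of that registration pattern, assuming a GNU toolchain on ELF; the section and type names are invented for illustration:

```cpp
#include <cstdio>

struct InitFunc {
  void (*writeout)();
  void (*reset)();
};

static void W() { std::puts("writeout"); }
static void R() { std::puts("reset"); }

// The compiler places these records in a dedicated section...
__attribute__((section("covinit"), used)) static InitFunc F1{W, R};
__attribute__((section("covinit"), used)) static InitFunc F2{W, R};

// ...and the linker brackets that section with start/stop symbols.
extern "C" InitFunc __start_covinit[];
extern "C" InitFunc __stop_covinit[];

int main() {
  for (InitFunc *P = __start_covinit; P != __stop_covinit; ++P)
    P->writeout(); // visit every registered record, wherever it was defined
}
```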
@@ -56,6 +58,10 @@ extern char PROF_NAME_START COMPILER_RT_VISIBILITY COMPILER_RT_WEAK; extern char PROF_NAME_STOP COMPILER_RT_VISIBILITY COMPILER_RT_WEAK; extern ValueProfNode PROF_VNODES_START COMPILER_RT_VISIBILITY COMPILER_RT_WEAK; extern ValueProfNode PROF_VNODES_STOP COMPILER_RT_VISIBILITY COMPILER_RT_WEAK; +extern __llvm_gcov_init_func_struct PROF_COVINIT_START COMPILER_RT_VISIBILITY + COMPILER_RT_WEAK; +extern __llvm_gcov_init_func_struct PROF_COVINIT_STOP COMPILER_RT_VISIBILITY + COMPILER_RT_WEAK; COMPILER_RT_VISIBILITY const __llvm_profile_data * __llvm_profile_begin_data(void) { @@ -110,6 +116,16 @@ COMPILER_RT_VISIBILITY ValueProfNode *__llvm_profile_end_vnodes(void) { COMPILER_RT_VISIBILITY ValueProfNode *CurrentVNode = &PROF_VNODES_START; COMPILER_RT_VISIBILITY ValueProfNode *EndVNode = &PROF_VNODES_STOP; +COMPILER_RT_VISIBILITY const __llvm_gcov_init_func_struct * +__llvm_profile_begin_covinit() { + return &PROF_COVINIT_START; +} + +COMPILER_RT_VISIBILITY const __llvm_gcov_init_func_struct * +__llvm_profile_end_covinit() { + return &PROF_COVINIT_STOP; +} + #ifdef NT_GNU_BUILD_ID static size_t RoundUp(size_t size, size_t align) { return (size + align - 1) & ~(align - 1); diff --git a/compiler-rt/lib/rtsan/rtsan_assertions.h b/compiler-rt/lib/rtsan/rtsan_assertions.h index 8183a8202478ffcb99735e753d99b41526d7f18c..28a272b646237290b5abb343a68eb9721b72329c 100644 --- a/compiler-rt/lib/rtsan/rtsan_assertions.h +++ b/compiler-rt/lib/rtsan/rtsan_assertions.h @@ -15,6 +15,7 @@ #include "rtsan/rtsan.h" #include "rtsan/rtsan_context.h" #include "rtsan/rtsan_diagnostics.h" +#include "rtsan/rtsan_stats.h" #include "rtsan/rtsan_suppressions.h" #include "sanitizer_common/sanitizer_stacktrace.h" @@ -28,6 +29,11 @@ void ExpectNotRealtime(Context &context, const DiagnosticsInfo &info, if (context.InRealtimeContext() && !context.IsBypassed()) { ScopedBypass sb{context}; + if (IsFunctionSuppressed(info.func_name)) { + IncrementSuppressedCount(); + return; + } + __sanitizer::BufferedStackTrace stack; // We use the unwind_on_fatal flag here because of precedent with other @@ -35,8 +41,10 @@ void ExpectNotRealtime(Context &context, const DiagnosticsInfo &info, stack.Unwind(info.pc, info.bp, nullptr, __sanitizer::common_flags()->fast_unwind_on_fatal); - if (IsStackTraceSuppressed(stack)) + if (IsStackTraceSuppressed(stack)) { + IncrementSuppressedCount(); return; + } OnViolation(stack, info); } diff --git a/compiler-rt/lib/rtsan/rtsan_checks.inc b/compiler-rt/lib/rtsan/rtsan_checks.inc index f5f23e044bd5d7d74da84d9cce8619fbba0032fb..676b6a5791941eef8d5292345032bee147c44a88 100644 --- a/compiler-rt/lib/rtsan/rtsan_checks.inc +++ b/compiler-rt/lib/rtsan/rtsan_checks.inc @@ -17,3 +17,4 @@ // SummaryKind should be a string literal. 
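rtsan_checks.inc below works like the INSTR_PROF_DATA/COVINIT_FUNC lists earlier in this patch: an X-macro list that each include site expands with its own definition of the macro. A self-contained sketch of the idiom (the real code keeps the list in a separate .inc file; these names are illustrative):

```cpp
#include <cstdio>

// One list, written once...
#define RTSAN_CHECKS(X)                                                        \
  X(CallStackContains, "call-stack-contains")                                  \
  X(FunctionNameMatches, "function-name-matches")

// ...expanded once into an enum of check kinds...
enum class ErrorType {
#define DECLARE_ENUM(Name, Flag) Name,
  RTSAN_CHECKS(DECLARE_ENUM)
#undef DECLARE_ENUM
};

// ...and again into the matching flag-name strings.
static const char *kFlagNames[] = {
#define DECLARE_NAME(Name, Flag) Flag,
    RTSAN_CHECKS(DECLARE_NAME)
#undef DECLARE_NAME
};

int main() {
  for (const char *Name : kFlagNames)
    std::printf("%s\n", Name);
}
```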
RTSAN_CHECK(CallStackContains, "call-stack-contains") +RTSAN_CHECK(FunctionNameMatches, "function-name-matches") diff --git a/compiler-rt/lib/rtsan/rtsan_flags.h b/compiler-rt/lib/rtsan/rtsan_flags.h index 29025c29b6fc2a76f95e4233779e7fcb29ca0a38..f46e04933fa5286fc2b6728cc8b4b7566143cda9 100644 --- a/compiler-rt/lib/rtsan/rtsan_flags.h +++ b/compiler-rt/lib/rtsan/rtsan_flags.h @@ -18,6 +18,8 @@ struct Flags { Type Name{DefaultValue}; #include "rtsan_flags.inc" #undef RTSAN_FLAG + + bool ContainsSuppressionFile() { return suppressions[0] != '\0'; } }; extern Flags flags_data; diff --git a/compiler-rt/lib/rtsan/rtsan_stats.cpp b/compiler-rt/lib/rtsan/rtsan_stats.cpp index dac7b23c3ef520714998b734ff15108888d617d0..277182a7abc84047919d47e39671977395a238ed 100644 --- a/compiler-rt/lib/rtsan/rtsan_stats.cpp +++ b/compiler-rt/lib/rtsan/rtsan_stats.cpp @@ -11,6 +11,7 @@ //===----------------------------------------------------------------------===// #include "rtsan/rtsan_stats.h" +#include "rtsan/rtsan_flags.h" #include "sanitizer_common/sanitizer_atomic.h" #include "sanitizer_common/sanitizer_common.h" @@ -18,23 +19,32 @@ using namespace __sanitizer; using namespace __rtsan; -static atomic_uint32_t rtsan_total_error_count{0}; -static atomic_uint32_t rtsan_unique_error_count{0}; +static atomic_uint32_t total_error_count{0}; +static atomic_uint32_t unique_error_count{0}; +static atomic_uint32_t suppressed_count{0}; void __rtsan::IncrementTotalErrorCount() { - atomic_fetch_add(&rtsan_total_error_count, 1, memory_order_relaxed); + atomic_fetch_add(&total_error_count, 1, memory_order_relaxed); } void __rtsan::IncrementUniqueErrorCount() { - atomic_fetch_add(&rtsan_unique_error_count, 1, memory_order_relaxed); + atomic_fetch_add(&unique_error_count, 1, memory_order_relaxed); } static u32 GetTotalErrorCount() { - return atomic_load(&rtsan_total_error_count, memory_order_relaxed); + return atomic_load(&total_error_count, memory_order_relaxed); } static u32 GetUniqueErrorCount() { - return atomic_load(&rtsan_unique_error_count, memory_order_relaxed); + return atomic_load(&unique_error_count, memory_order_relaxed); +} + +void __rtsan::IncrementSuppressedCount() { + atomic_fetch_add(&suppressed_count, 1, memory_order_relaxed); +} + +static u32 GetSuppressedCount() { + return atomic_load(&suppressed_count, memory_order_relaxed); } void __rtsan::PrintStatisticsSummary() { @@ -42,4 +52,7 @@ void __rtsan::PrintStatisticsSummary() { Printf("RealtimeSanitizer exit stats:\n"); Printf(" Total error count: %u\n", GetTotalErrorCount()); Printf(" Unique error count: %u\n", GetUniqueErrorCount()); + + if (flags().ContainsSuppressionFile()) + Printf(" Suppression count: %u\n", GetSuppressedCount()); } diff --git a/compiler-rt/lib/rtsan/rtsan_stats.h b/compiler-rt/lib/rtsan/rtsan_stats.h index a72098792c89c9d7b9c7c94d6116226f5098c7ec..a8a67ea2a44b6d63e8441e70b180b6ac9e1928cd 100644 --- a/compiler-rt/lib/rtsan/rtsan_stats.h +++ b/compiler-rt/lib/rtsan/rtsan_stats.h @@ -16,6 +16,7 @@ namespace __rtsan { void IncrementTotalErrorCount(); void IncrementUniqueErrorCount(); +void IncrementSuppressedCount(); void PrintStatisticsSummary(); diff --git a/compiler-rt/lib/rtsan/rtsan_suppressions.cpp b/compiler-rt/lib/rtsan/rtsan_suppressions.cpp index c5051dd19102362da80f859f3b2965921529322b..2bcfbeed4195bca04923232009985a8b6ac95651 100644 --- a/compiler-rt/lib/rtsan/rtsan_suppressions.cpp +++ b/compiler-rt/lib/rtsan/rtsan_suppressions.cpp @@ -56,7 +56,7 @@ void __rtsan::InitializeSuppressions() { CHECK_EQ(nullptr, suppression_ctx);
// We will use suppression_ctx == nullptr as an early out - if (flags().suppressions[0] == '\0') + if (!flags().ContainsSuppressionFile()) return; suppression_ctx = new (suppression_placeholder) @@ -92,3 +92,16 @@ bool __rtsan::IsStackTraceSuppressed(const StackTrace &stack) { } return false; } + +bool __rtsan::IsFunctionSuppressed(const char *function_name) { + if (suppression_ctx == nullptr) + return false; + + const char *flag_name = ConvertTypeToFlagName(ErrorType::FunctionNameMatches); + + if (!suppression_ctx->HasSuppressionType(flag_name)) + return false; + + Suppression *s; + return suppression_ctx->Match(function_name, flag_name, &s); +} diff --git a/compiler-rt/lib/rtsan/rtsan_suppressions.h b/compiler-rt/lib/rtsan/rtsan_suppressions.h index 45545f8c0e0b65c71817ca52960616cfada7e57a..9990b99f3b52cd56974acb21b5e8c8c3bc31d5d6 100644 --- a/compiler-rt/lib/rtsan/rtsan_suppressions.h +++ b/compiler-rt/lib/rtsan/rtsan_suppressions.h @@ -18,5 +18,6 @@ namespace __rtsan { void InitializeSuppressions(); bool IsStackTraceSuppressed(const __sanitizer::StackTrace &stack); +bool IsFunctionSuppressed(const char *function_name); } // namespace __rtsan diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_common.h b/compiler-rt/lib/sanitizer_common/sanitizer_common.h index 082d2158e579bd6a100741284641a54f9ec26bb7..0b5e68c5fd7978779215a3a73035fecedb043d29 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_common.h +++ b/compiler-rt/lib/sanitizer_common/sanitizer_common.h @@ -268,7 +268,15 @@ class ScopedErrorReportLock { extern uptr stoptheworld_tracer_pid; extern uptr stoptheworld_tracer_ppid; +// Returns `true` if the entire range can be read. bool IsAccessibleMemoryRange(uptr beg, uptr size); +// Attempts to copy `n` bytes from the memory range starting at `src` to +// `dest`. Returns `true` if the entire range was copied. Returns `false` if +// any part of the source range cannot be read, in which case the contents of +// `dest` are undefined. +bool TryMemCpy(void *dest, const void *src, uptr n); +// Copies accessible memory, and zero-fills inaccessible ranges. +void MemCpyAccessible(void *dest, const void *src, uptr n); // Error report formatting.
const char *StripPathPrefix(const char *filepath, diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_common_libcdep.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_common_libcdep.cpp index 684720963a8dcb6836d8f9c7bad33df7904b438a..f275e81ff04169d0d0587c9531bb56ada0df8496 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_common_libcdep.cpp +++ b/compiler-rt/lib/sanitizer_common/sanitizer_common_libcdep.cpp @@ -219,6 +219,32 @@ static void StopStackDepotBackgroundThread() { static void StopStackDepotBackgroundThread() {} #endif +void MemCpyAccessible(void *dest, const void *src, uptr n) { + if (TryMemCpy(dest, src, n)) + return; + + const uptr page_size = GetPageSize(); + uptr b = reinterpret_cast<uptr>(src); + uptr b_up = RoundUpTo(b, page_size); + + uptr e = reinterpret_cast<uptr>(src) + n; + uptr e_down = RoundDownTo(e, page_size); + + auto copy_or_zero = [dest, src](uptr beg, uptr end) { + const uptr udest = reinterpret_cast<uptr>(dest); + const uptr usrc = reinterpret_cast<uptr>(src); + void *d = reinterpret_cast<void *>(udest + (beg - usrc)); + const uptr size = end - beg; + if (!TryMemCpy(d, reinterpret_cast<void *>(beg), size)) + internal_memset(d, 0, size); + }; + + copy_or_zero(b, b_up); + for (uptr p = b_up; p < e_down; p += page_size) + copy_or_zero(p, p + page_size); + copy_or_zero(e_down, e); +} + } // namespace __sanitizer SANITIZER_INTERFACE_WEAK_DEF(void, __sanitizer_sandbox_on_notify, diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_fuchsia.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_fuchsia.cpp index 75dcf546729f6ea1124ff078a880408246fd08d7..c2ace46c946587c1cca59557f7a5e9bd47a0a976 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_fuchsia.cpp +++ b/compiler-rt/lib/sanitizer_common/sanitizer_fuchsia.cpp @@ -444,6 +444,11 @@ bool IsAccessibleMemoryRange(uptr beg, uptr size) { return status == ZX_OK; } +bool TryMemCpy(void *dest, const void *src, uptr n) { + // TODO: implement. + return false; +} + // FIXME implement on this platform. void GetMemoryProfile(fill_profile_f cb, uptr *stats) {} diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_posix_libcdep.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_posix_libcdep.cpp index 9ffb36f812c45d178202b3edd8d1ec12e382250e..7ee2319456d23ef22c0248dc04b0afb7db5338eb 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_posix_libcdep.cpp +++ b/compiler-rt/lib/sanitizer_common/sanitizer_posix_libcdep.cpp @@ -288,26 +288,86 @@ bool SignalContext::IsStackOverflow() const { #endif // SANITIZER_GO +static void SetNonBlock(int fd) { + int res = fcntl(fd, F_GETFL, 0); + CHECK(!internal_iserror(res, nullptr)); + + res |= O_NONBLOCK; + res = fcntl(fd, F_SETFL, res); + CHECK(!internal_iserror(res, nullptr)); +} + bool IsAccessibleMemoryRange(uptr beg, uptr size) { - uptr page_size = GetPageSizeCached(); - // Checking too large memory ranges is slow. - CHECK_LT(size, page_size * 10); - int sock_pair[2]; - if (pipe(sock_pair)) - return false; - uptr bytes_written = - internal_write(sock_pair[1], reinterpret_cast<char *>(beg), size); - int write_errno; - bool result; - if (internal_iserror(bytes_written, &write_errno)) { - CHECK_EQ(EFAULT, write_errno); - result = false; - } else { - result = (bytes_written == size); + while (size) { + // Recreate the pipe on each iteration: draining it with `read` from + // `fds[0]` into a dummy buffer to free up space for more `write` calls is + // slower than just creating a new pipe.
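Both the rewritten IsAccessibleMemoryRange and TryMemCpy below depend on one kernel behavior: a write(2) whose source buffer is unmapped fails with EFAULT instead of raising SIGSEGV. A minimal standalone demonstration of that probe, assuming Linux and a probe size below the default pipe capacity:

```cpp
#include <cstddef>
#include <cstdio>
#include <sys/mman.h>
#include <unistd.h>

// Probe readability of [p, p+n) without risking SIGSEGV.
static bool CanRead(const void *p, size_t n) {
  int fds[2];
  if (pipe(fds) != 0)
    return false;
  // The kernel validates the source buffer and returns EFAULT on bad memory.
  ssize_t w = write(fds[1], p, n);
  bool ok = w == static_cast<ssize_t>(n);
  close(fds[0]);
  close(fds[1]);
  return ok;
}

int main() {
  char ok[16] = {0};
  void *bad =
      mmap(nullptr, 4096, PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
  std::printf("readable buffer: %d\n", CanRead(ok, sizeof ok)); // 1
  std::printf("PROT_NONE page:  %d\n", CanRead(bad, 16));       // 0
}
```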
+ int fds[2]; + CHECK_EQ(0, pipe(fds)); + + auto cleanup = at_scope_exit([&]() { + internal_close(fds[0]); + internal_close(fds[1]); + }); + + SetNonBlock(fds[1]); + + int write_errno; + uptr w = internal_write(fds[1], reinterpret_cast<void *>(beg), size); + if (internal_iserror(w, &write_errno)) { + if (write_errno == EINTR) + continue; + CHECK_EQ(EFAULT, write_errno); + return false; + } + size -= w; + beg += w; + } + + return true; +} + +bool TryMemCpy(void *dest, const void *src, uptr n) { + if (!n) + return true; + int fds[2]; + CHECK_EQ(0, pipe(fds)); + + auto cleanup = at_scope_exit([&]() { + internal_close(fds[0]); + internal_close(fds[1]); + }); + + SetNonBlock(fds[0]); + SetNonBlock(fds[1]); + + char *d = static_cast<char *>(dest); + const char *s = static_cast<const char *>(src); + + while (n) { + int e; + uptr w = internal_write(fds[1], s, n); + if (internal_iserror(w, &e)) { + if (e == EINTR) + continue; + CHECK_EQ(EFAULT, e); + return false; + } + s += w; + n -= w; + + while (w) { + uptr r = internal_read(fds[0], d, w); + if (internal_iserror(r, &e)) { + CHECK_EQ(EINTR, e); + continue; + } + + d += r; + w -= r; + } } - internal_close(sock_pair[0]); - internal_close(sock_pair[1]); - return result; + + return true; } void PlatformPrepareForSandboxing(void *args) { diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_win.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_win.cpp index 6fb947aa6d6c26f98ad15438ac9cdfceb4b06e31..ea513d5f263fe2786034b5be02be93c5334b7aaa 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_win.cpp +++ b/compiler-rt/lib/sanitizer_common/sanitizer_win.cpp @@ -968,6 +968,11 @@ bool IsAccessibleMemoryRange(uptr beg, uptr size) { return true; } +bool TryMemCpy(void *dest, const void *src, uptr n) { + // TODO: implement. + return false; +} + bool SignalContext::IsStackOverflow() const { return (DWORD)GetType() == EXCEPTION_STACK_OVERFLOW; } diff --git a/compiler-rt/lib/sanitizer_common/tests/sanitizer_chained_origin_depot_test.cpp b/compiler-rt/lib/sanitizer_common/tests/sanitizer_chained_origin_depot_test.cpp index a557c4645ba0c5d78a66be4feea6a97c3c4a1fbc..61171019a5706d0d885add8eb965a47148f61479 100644 --- a/compiler-rt/lib/sanitizer_common/tests/sanitizer_chained_origin_depot_test.cpp +++ b/compiler-rt/lib/sanitizer_common/tests/sanitizer_chained_origin_depot_test.cpp @@ -31,7 +31,7 @@ TEST(SanitizerCommon, ChainedOriginDepotBasic) { TEST(SanitizerCommon, ChainedOriginDepotAbsent) { u32 prev_id; - EXPECT_EQ(0U, chainedOriginDepot.Get(99, &prev_id)); + EXPECT_EQ(0U, chainedOriginDepot.Get(123456, &prev_id)); EXPECT_EQ(0U, prev_id); } diff --git a/compiler-rt/lib/sanitizer_common/tests/sanitizer_linux_test.cpp b/compiler-rt/lib/sanitizer_common/tests/sanitizer_linux_test.cpp index ce4a40444cd4964d1becd201765aca44ce4d1ad5..70669ab81691b835801ede1ca4a008d7df7bf6ae 100644 --- a/compiler-rt/lib/sanitizer_common/tests/sanitizer_linux_test.cpp +++ b/compiler-rt/lib/sanitizer_common/tests/sanitizer_linux_test.cpp @@ -205,7 +205,6 @@ TEST(SanitizerLinux, ThreadDescriptorSize) { void *result; ASSERT_EQ(0, pthread_create(&tid, 0, thread_descriptor_size_test_func, 0)); ASSERT_EQ(0, pthread_join(tid, &result)); - EXPECT_EQ(0u, ThreadDescriptorSize()); InitTlsSize(); EXPECT_EQ((uptr)result, ThreadDescriptorSize()); } diff --git a/compiler-rt/lib/sanitizer_common/tests/sanitizer_posix_test.cpp b/compiler-rt/lib/sanitizer_common/tests/sanitizer_posix_test.cpp index be577c3364049cb22614e6664b40fc856a3c5002..5016b09c15307fd5cbafc66fbeec7860eb58a983 100644 ---
a/compiler-rt/lib/sanitizer_common/tests/sanitizer_posix_test.cpp +++ b/compiler-rt/lib/sanitizer_common/tests/sanitizer_posix_test.cpp @@ -13,11 +13,14 @@ #include "sanitizer_common/sanitizer_platform.h" #if SANITIZER_POSIX -#include "sanitizer_common/sanitizer_common.h" -#include "gtest/gtest.h" +# include <pthread.h> +# include <sys/mman.h> -#include <pthread.h> -#include <sys/mman.h> +# include <numeric> +# include <vector> + +# include "gtest/gtest.h" +# include "sanitizer_common/sanitizer_common.h" namespace __sanitizer { @@ -65,8 +68,8 @@ TEST(SanitizerCommon, PthreadDestructorIterations) { TEST(SanitizerCommon, IsAccessibleMemoryRange) { const int page_size = GetPageSize(); - uptr mem = (uptr)mmap(0, 3 * page_size, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANON, -1, 0); + InternalMmapVector<char> buffer(3 * page_size); + uptr mem = reinterpret_cast<uptr>(buffer.data()); // Protect the middle page. mprotect((void *)(mem + page_size), page_size, PROT_NONE); EXPECT_TRUE(IsAccessibleMemoryRange(mem, page_size - 1)); @@ -80,6 +83,87 @@ TEST(SanitizerCommon, IsAccessibleMemoryRange) { EXPECT_FALSE(IsAccessibleMemoryRange(0x0, 2)); } +TEST(SanitizerCommon, IsAccessibleMemoryRangeLarge) { + InternalMmapVector<char> buffer(10000 * GetPageSize()); + EXPECT_TRUE(IsAccessibleMemoryRange(reinterpret_cast<uptr>(buffer.data()), + buffer.size())); +} + +TEST(SanitizerCommon, TryMemCpy) { + std::vector<char> src(10000000); + std::iota(src.begin(), src.end(), 123); + std::vector<char> dst; + + // Don't use ::testing::ElementsAreArray or similar, as the huge output on an + // error is not helpful. + + dst.assign(1, 0); + EXPECT_TRUE(TryMemCpy(dst.data(), src.data(), dst.size())); + EXPECT_TRUE(std::equal(dst.begin(), dst.end(), src.begin())); + + dst.assign(100, 0); + EXPECT_TRUE(TryMemCpy(dst.data(), src.data(), dst.size())); + EXPECT_TRUE(std::equal(dst.begin(), dst.end(), src.begin())); + + dst.assign(534, 0); + EXPECT_TRUE(TryMemCpy(dst.data(), src.data(), dst.size())); + EXPECT_TRUE(std::equal(dst.begin(), dst.end(), src.begin())); + + dst.assign(GetPageSize(), 0); + EXPECT_TRUE(TryMemCpy(dst.data(), src.data(), dst.size())); + EXPECT_TRUE(std::equal(dst.begin(), dst.end(), src.begin())); + + dst.assign(src.size(), 0); + EXPECT_TRUE(TryMemCpy(dst.data(), src.data(), dst.size())); + EXPECT_TRUE(std::equal(dst.begin(), dst.end(), src.begin())); + + dst.assign(src.size() - 1, 0); + EXPECT_TRUE(TryMemCpy(dst.data(), src.data(), dst.size())); + EXPECT_TRUE(std::equal(dst.begin(), dst.end(), src.begin())); +} + +TEST(SanitizerCommon, TryMemCpyNull) { + std::vector<char> dst(100); + EXPECT_FALSE(TryMemCpy(dst.data(), nullptr, dst.size())); +} + +TEST(SanitizerCommon, MemCpyAccessible) { + const int page_num = 1000; + const int page_size = GetPageSize(); + InternalMmapVector<char> src(page_num * page_size); + std::iota(src.begin(), src.end(), 123); + std::vector<char> dst; + std::vector<char> exp = {src.begin(), src.end()}; + + // Protect some pages. + for (int i = 7; i < page_num; i *= 2) { + mprotect(src.data() + i * page_size, page_size, PROT_NONE); + std::fill(exp.data() + i * page_size, exp.data() + (i + 1) * page_size, 0); + } + + dst.assign(src.size(), 0); + EXPECT_FALSE(TryMemCpy(dst.data(), src.data(), dst.size())); + + // Full page aligned range with mprotect pages. + dst.assign(src.size(), 0); + MemCpyAccessible(dst.data(), src.data(), dst.size()); + EXPECT_TRUE(std::equal(dst.begin(), dst.end(), exp.begin())); + + // Misaligned range with mprotect pages.
+ size_t offb = 3; + size_t offe = 7; + dst.assign(src.size() - offb - offe, 0); + MemCpyAccessible(dst.data(), src.data() + offb, dst.size()); + EXPECT_TRUE(std::equal(dst.begin(), dst.end(), exp.begin() + offb)); + + // Misaligned range with ends in mprotect pages. + offb = 3 + 7 * page_size; + offe = 7 + 14 * page_size; + dst.assign(src.size() - offb - offe, 0); + MemCpyAccessible(dst.data(), src.data() + offb, dst.size()); + EXPECT_TRUE(std::equal(dst.begin(), dst.end(), exp.begin() + offb)); +} + } // namespace __sanitizer #endif // SANITIZER_POSIX diff --git a/compiler-rt/lib/scudo/standalone/combined.h b/compiler-rt/lib/scudo/standalone/combined.h index 323a8b9d76c994bdf87b8a33908d6a7dbf7565cf..5deb8c97f1c86e1ce1a704c66bce266e6d6ca82c 100644 --- a/compiler-rt/lib/scudo/standalone/combined.h +++ b/compiler-rt/lib/scudo/standalone/combined.h @@ -1255,22 +1255,26 @@ private: else Header->State = Chunk::State::Quarantined; - void *BlockBegin; - if (LIKELY(!useMemoryTagging(Options))) { + if (LIKELY(!useMemoryTagging(Options))) Header->OriginOrWasZeroed = 0U; - if (BypassQuarantine && allocatorSupportsMemoryTagging()) - Ptr = untagPointer(Ptr); - BlockBegin = getBlockBegin(Ptr, Header); - } else { + else { Header->OriginOrWasZeroed = Header->ClassId && !TSDRegistry.getDisableMemInit(); - BlockBegin = - retagBlock(Options, TaggedPtr, Ptr, Header, Size, BypassQuarantine); } Chunk::storeHeader(Cookie, Ptr, Header); if (BypassQuarantine) { + void *BlockBegin; + if (LIKELY(!useMemoryTagging(Options))) { + // Must do this after storeHeader because loadHeader uses a tagged ptr. + if (allocatorSupportsMemoryTagging()) + Ptr = untagPointer(Ptr); + BlockBegin = getBlockBegin(Ptr, Header); + } else { + BlockBegin = retagBlock(Options, TaggedPtr, Ptr, Header, Size, true); + } + const uptr ClassId = Header->ClassId; if (LIKELY(ClassId)) { bool CacheDrained; @@ -1288,6 +1292,8 @@ private: Secondary.deallocate(Options, BlockBegin); } } else { + if (UNLIKELY(useMemoryTagging(Options))) + retagBlock(Options, TaggedPtr, Ptr, Header, Size, false); typename TSDRegistryT::ScopedTSD TSD(TSDRegistry); Quarantine.put(&TSD->getQuarantineCache(), QuarantineCallback(*this, TSD->getCache()), Ptr, Size); diff --git a/compiler-rt/lib/scudo/standalone/tests/combined_test.cpp b/compiler-rt/lib/scudo/standalone/tests/combined_test.cpp index 16b19e807e11bc2b07afe815b90f362c549ab424..ff98eb3397ee0e305696737f0a70fa6d53e173d3 100644 --- a/compiler-rt/lib/scudo/standalone/tests/combined_test.cpp +++ b/compiler-rt/lib/scudo/standalone/tests/combined_test.cpp @@ -534,6 +534,27 @@ SCUDO_TYPED_TEST(ScudoCombinedDeathTest, UseAfterFree) { } } +SCUDO_TYPED_TEST(ScudoCombinedDeathTest, DoubleFreeFromPrimary) { + auto *Allocator = this->Allocator.get(); + + for (scudo::uptr SizeLog = 0U; SizeLog <= 20U; SizeLog++) { + const scudo::uptr Size = 1U << SizeLog; + if (!isPrimaryAllocation>(Size, 0)) + break; + + // Verify that a double free results in a chunk state error. 
+ EXPECT_DEATH( + { + // Allocate from primary + void *P = Allocator->allocate(Size, Origin); + ASSERT_TRUE(P != nullptr); + Allocator->deallocate(P, Origin); + Allocator->deallocate(P, Origin); + }, + "invalid chunk state"); + } +} + SCUDO_TYPED_TEST(ScudoCombinedDeathTest, DisableMemoryTagging) { auto *Allocator = this->Allocator.get(); diff --git a/compiler-rt/test/lsan/TestCases/print_threads.c b/compiler-rt/test/lsan/TestCases/print_threads.c index b3072da93fab621259ebf4737fa378e7490f2e39..a9389412af1ccfaa59b993cb64417929eaa47cc1 100644 --- a/compiler-rt/test/lsan/TestCases/print_threads.c +++ b/compiler-rt/test/lsan/TestCases/print_threads.c @@ -2,6 +2,9 @@ // XFAIL: hwasan +// No pthread barriers on Darwin. +// UNSUPPORTED: darwin + #include #include #include diff --git a/compiler-rt/test/orc/TestCases/Darwin/Generic/Inputs/EmptyClassFoo.m b/compiler-rt/test/orc/TestCases/Darwin/Generic/Inputs/EmptyClassFoo.m new file mode 100644 index 0000000000000000000000000000000000000000..12e3dea5af3784d7a9dff69f5acc2bd9b2de5e4d --- /dev/null +++ b/compiler-rt/test/orc/TestCases/Darwin/Generic/Inputs/EmptyClassFoo.m @@ -0,0 +1,7 @@ +#include <Foundation/Foundation.h> + +@interface Foo : NSObject +@end + +@implementation Foo +@end diff --git a/compiler-rt/test/orc/TestCases/Darwin/Generic/llvm-jitlink-force-link-objc.m b/compiler-rt/test/orc/TestCases/Darwin/Generic/llvm-jitlink-force-link-objc.m new file mode 100644 index 0000000000000000000000000000000000000000..89cd6bd01671c5ee135f957cb4e61f1ca5d492da --- /dev/null +++ b/compiler-rt/test/orc/TestCases/Darwin/Generic/llvm-jitlink-force-link-objc.m @@ -0,0 +1,14 @@ +// RUN: rm -rf %t && mkdir -p %t +// RUN: %clang -c -o %t/EmptyClassFoo.o %S/Inputs/EmptyClassFoo.m +// RUN: ar r %t/libFooClass.a %t/EmptyClassFoo.o +// RUN: %clang -c -o %t/force-objc.o %s +// RUN: %llvm_jitlink -ObjC %t/force-objc.o -L%t -lFooClass +// +// REQUIRES: system-darwin && host-arch-compatible + +id objc_getClass(const char *name); + +int main(int argc, char *argv[]) { + // Return success if we find Foo, error otherwise. + return objc_getClass("Foo") ? 0 : 1; +} diff --git a/compiler-rt/test/orc/TestCases/Linux/Generic/Inputs/SetGlobalIntXInConstructor.cpp b/compiler-rt/test/orc/TestCases/Linux/Generic/Inputs/SetGlobalIntXInConstructor.cpp new file mode 100644 index 0000000000000000000000000000000000000000..8c67a34fe243f77034360b1f2973bc22fa3115af --- /dev/null +++ b/compiler-rt/test/orc/TestCases/Linux/Generic/Inputs/SetGlobalIntXInConstructor.cpp @@ -0,0 +1,12 @@ +extern "C" int x; + +namespace { + +struct Init { +public: + Init() { x = 1; } +}; + +Init SetX; + +} // namespace diff --git a/compiler-rt/test/orc/TestCases/Linux/Generic/llvm-jitlink-all-load.c b/compiler-rt/test/orc/TestCases/Linux/Generic/llvm-jitlink-all-load.c new file mode 100644 index 0000000000000000000000000000000000000000..dde7dacd92b32e4f9d49f0a7908a7afbbc245833 --- /dev/null +++ b/compiler-rt/test/orc/TestCases/Linux/Generic/llvm-jitlink-all-load.c @@ -0,0 +1,14 @@ +// Check that the -all_load flag to llvm-jitlink causes all objects from +// archives to be loaded, regardless of whether or not they're referenced. +// +// RUN: rm -rf %t && mkdir -p %t +// RUN: %clangxx -c -o %t/SetX.o %S/Inputs/SetGlobalIntXInConstructor.cpp +// RUN: ar r %t/libSetX.a %t/SetX.o +// RUN: %clang -c -o %t/all_load.o %s +// RUN: %llvm_jitlink -all_load %t/all_load.o -L%t -lSetX +// +// REQUIRES: system-linux && host-arch-compatible + +int x = 0; + +int main(int argc, char *argv[]) { return x == 1 ?
0 : 1; } diff --git a/compiler-rt/test/orc/lit.cfg.py b/compiler-rt/test/orc/lit.cfg.py index 897cefb3d1930dea7542d4e2d5d7691863ffffdb..6dfa94b11cc9d942561e28fa6ba7590a6f5f9b0f 100644 --- a/compiler-rt/test/orc/lit.cfg.py +++ b/compiler-rt/test/orc/lit.cfg.py @@ -14,6 +14,8 @@ host_arch_compatible = config.target_arch == config.host_arch if config.host_arch == "x86_64h" and config.target_arch == "x86_64": host_arch_compatible = True +if host_arch_compatible: + config.available_features.add("host-arch-compatible") config.test_target_is_host_executable = ( config.target_os == config.host_os and host_arch_compatible ) @@ -71,9 +73,10 @@ config.substitutions.append( (lli + " -jit-kind=orc -jit-linker=jitlink -orc-runtime=" + orc_rt_path), ) ) +config.substitutions.append(("%ar", "ar")) # Default test suffixes. -config.suffixes = [".c", ".cpp", ".S", ".ll", ".test"] +config.suffixes = [".c", ".cpp", ".m", ".S", ".ll", ".test"] # Exclude Inputs directories. config.excludes = ["Inputs"] diff --git a/compiler-rt/test/profile/AIX/gcov-undef-sym.test b/compiler-rt/test/profile/AIX/gcov-undef-sym.test new file mode 100644 index 0000000000000000000000000000000000000000..db9053952d95b7022016acdfdec08d4b776f4fad --- /dev/null +++ b/compiler-rt/test/profile/AIX/gcov-undef-sym.test @@ -0,0 +1,52 @@ +// The undefined symbol should not cause link errors, and we should +// obtain the expected coverage report. + +// Test the --coverage option. +RUN: rm -rf %t0 && split-file %s %t0 && cd %t0 +RUN: %clang bar.c main.c undef.c --coverage -c +RUN: ar -X32_64 -rv libfoo.a undef.o bar.o +RUN: %clang main.o -L. -lfoo --coverage -o main.exe +RUN: %run ./main.exe +RUN: llvm-cov gcov -t main.gcda | FileCheck --check-prefix=MAIN %s +RUN: llvm-cov gcov -t bar.gcda | FileCheck --check-prefix=BAR %s + +// Test the pgogen -fprofile-arcs -ftest-coverage option combination. +RUN: rm -rf %t1 && split-file %s %t1 && cd %t1 +RUN: %clang_pgogen bar.c main.c undef.c -fprofile-arcs -ftest-coverage -c +RUN: ar -X32_64 -rv libfoo.a undef.o bar.o +RUN: %clang_pgogen main.o -L. -lfoo -fprofile-generate -fprofile-arcs -ftest-coverage -o main.exe +RUN: %run ./main.exe +RUN: llvm-cov gcov -t main.gcda | FileCheck --check-prefix=MAIN %s +RUN: llvm-cov gcov -t bar.gcda | FileCheck --check-prefix=BAR %s + +// Test the pgogen -Wl,-bcdtors:mbr option combination. +RUN: rm -rf %t2 && split-file %s %t2 && cd %t2 +RUN: %clang_pgogen bar.c main.c undef.c -fprofile-arcs -ftest-coverage -c +RUN: ar -X32_64 -rv libfoo.a undef.o bar.o +RUN: %clang_pgogen main.o -L. 
-lfoo -fprofile-generate -fprofile-arcs -ftest-coverage -Wl,-bcdtors:mbr -o main.exe +RUN: %run ./main.exe +RUN: llvm-cov gcov -t main.gcda | FileCheck --check-prefix=MAIN %s +RUN: llvm-cov gcov -t bar.gcda | FileCheck --check-prefix=BAR %s + +MAIN: 1: 2:int main() { +MAIN: 1: 3: return bar(); +BAR: 1: 1:int bar() { +BAR: 1: 2: return 0; + +//--- main.c +int bar(); +int main() { + return bar(); +} + + +//--- bar.c +int bar() { + return 0; +} + +//--- undef.c +void undef_func(); +void foo() { + undef_func(); +} diff --git a/compiler-rt/test/profile/Posix/instrprof-visibility.cpp b/compiler-rt/test/profile/Posix/instrprof-visibility.cpp index bb533050e0592634998b52385ef16157227032cf..016aaed57e151b959bddf2b3a9d00bfd3e9640bd 100644 --- a/compiler-rt/test/profile/Posix/instrprof-visibility.cpp +++ b/compiler-rt/test/profile/Posix/instrprof-visibility.cpp @@ -1,3 +1,4 @@ +// XFAIL: target={{.*}}-aix{{.*}} // RUN: %clangxx_profgen -fcoverage-mapping %S/Inputs/instrprof-visibility-helper.cpp -o %t %s // RUN: env LLVM_PROFILE_FILE=%t.profraw %run %t // RUN: llvm-profdata merge %t.profraw -o %t.profdata diff --git a/compiler-rt/test/profile/coverage-inline.cpp b/compiler-rt/test/profile/coverage-inline.cpp index e362e566fb4b416e7c72006be80cd214634bb361..a4114363007a3e5da5e86e1e48f9aca25d6e4041 100644 --- a/compiler-rt/test/profile/coverage-inline.cpp +++ b/compiler-rt/test/profile/coverage-inline.cpp @@ -1,3 +1,4 @@ +// XFAIL: target={{.*}}-aix{{.*}} // Test that the instrumentation puts the right linkage on the profile data for // inline functions. // RUN: %clang_profgen -g -fcoverage-mapping -c -o %t1.o %s -DOBJECT_1 diff --git a/compiler-rt/test/profile/coverage_comments.cpp b/compiler-rt/test/profile/coverage_comments.cpp index d206fb608792090d85ef9e29e6362046c6d9ffbc..8a99d646f5818d7bbe2cf891dc88a15169557543 100644 --- a/compiler-rt/test/profile/coverage_comments.cpp +++ b/compiler-rt/test/profile/coverage_comments.cpp @@ -1,3 +1,4 @@ +// XFAIL: target={{.*}}-aix{{.*}} // RUN: %clangxx_profgen -fcoverage-mapping -Wno-comment -o %t %s // RUN: env LLVM_PROFILE_FILE=%t.profraw %run %t // RUN: llvm-profdata merge -o %t.profdata %t.profraw diff --git a/compiler-rt/test/profile/coverage_emptylines.cpp b/compiler-rt/test/profile/coverage_emptylines.cpp index 8610d70f3e1b7a1afb7bf8add3021385e27f3bc5..8006cdee6ec1923cfb8efd30d1fd5efc24ddaa78 100644 --- a/compiler-rt/test/profile/coverage_emptylines.cpp +++ b/compiler-rt/test/profile/coverage_emptylines.cpp @@ -1,3 +1,4 @@ +// XFAIL: target={{.*}}-aix{{.*}} // Remove comments first. // RUN: sed 's/[ \t]*\/\/.*//' %s > %t.stripped.cpp // RUN: %clangxx_profgen -fcoverage-mapping -o %t %t.stripped.cpp diff --git a/compiler-rt/test/profile/instrprof-merging.cpp b/compiler-rt/test/profile/instrprof-merging.cpp index 6212feb19c2a838a4b39c8f31a34d61b2fc8f748..4a3f14b044a5c91d35c4038a94edd0b4164e1ec5 100644 --- a/compiler-rt/test/profile/instrprof-merging.cpp +++ b/compiler-rt/test/profile/instrprof-merging.cpp @@ -1,4 +1,5 @@ // UNSUPPORTED: target={{.*windows.*}} +// XFAIL: target={{.*}}-aix{{.*}} // 1) Compile shared code into different object files and into an executable. 
// RUN: %clangxx_profgen -std=c++14 -fcoverage-mapping %s -c -o %t.v1.o \ diff --git a/compiler-rt/test/profile/instrprof-set-file-object-merging.c b/compiler-rt/test/profile/instrprof-set-file-object-merging.c index 92f5f92e27720f2f4ff1f796ca536eab8dcd3710..baabb21cd672ce0a0e3739980d951e20fadf6849 100644 --- a/compiler-rt/test/profile/instrprof-set-file-object-merging.c +++ b/compiler-rt/test/profile/instrprof-set-file-object-merging.c @@ -24,6 +24,7 @@ int main(int argc, const char *argv[]) { return 0; } +// XFAIL: target={{.*}}-aix{{.*}} // CHECK: 10| |#include // CHECK: 11| | // CHECK: 12| |extern void __llvm_profile_set_file_object(FILE *, int); diff --git a/compiler-rt/test/profile/instrprof-set-file-object.c b/compiler-rt/test/profile/instrprof-set-file-object.c index 280374acb55d309b7e2abab7904c8453d6820e21..0d1f96d5d826ab0a668ba48c894ba5dfc2884620 100644 --- a/compiler-rt/test/profile/instrprof-set-file-object.c +++ b/compiler-rt/test/profile/instrprof-set-file-object.c @@ -17,6 +17,7 @@ int main(int argc, const char *argv[]) { __llvm_profile_set_file_object(F, 0); return 0; } +// XFAIL: target={{.*}}-aix{{.*}} // CHECK: 8| |#include // CHECK: 9| | // CHECK: 10| |extern void __llvm_profile_set_file_object(FILE *, int); diff --git a/compiler-rt/test/profile/instrprof-without-libc.c b/compiler-rt/test/profile/instrprof-without-libc.c index 3142138cdffc0d6db175689309ebd0cff07c2ad7..d0d213b07ba2dadf030bb4521468d76a8235f115 100644 --- a/compiler-rt/test/profile/instrprof-without-libc.c +++ b/compiler-rt/test/profile/instrprof-without-libc.c @@ -1,3 +1,4 @@ +// XFAIL: target={{.*}}-aix{{.*}} // RUN: %clang_profgen -DCHECK_SYMBOLS -O3 -o %t.symbols %s // RUN: llvm-nm %t.symbols | FileCheck %s --check-prefix=CHECK-SYMBOLS // RUN: %clang_profgen -O3 -o %t %s diff --git a/compiler-rt/test/profile/instrprof-write-file-only.c b/compiler-rt/test/profile/instrprof-write-file-only.c index f505cf64a5c760acf70863f3558bb20a72cd4c43..5edad271f8698ad10d1025b5272f34ac990ffaeb 100644 --- a/compiler-rt/test/profile/instrprof-write-file-only.c +++ b/compiler-rt/test/profile/instrprof-write-file-only.c @@ -1,3 +1,4 @@ +// XFAIL: target={{.*}}-aix{{.*}} // RUN: %clang_profgen -o %t -O3 %s // RUN: env LLVM_PROFILE_FILE=%t.profraw %run %t // RUN: llvm-profdata merge -o %t.profdata %t.profraw diff --git a/compiler-rt/test/profile/lit.cfg.py b/compiler-rt/test/profile/lit.cfg.py index 3b3019a07c30a5d2843eb4ee59f26e959cca89f6..c8c78a746b4c9b66ebf209e3a9c82b038193fac1 100644 --- a/compiler-rt/test/profile/lit.cfg.py +++ b/compiler-rt/test/profile/lit.cfg.py @@ -77,12 +77,8 @@ def exclude_unsupported_files_for_aix(dirname): f = open(source_path, "r") try: data = f.read() - # -fprofile-instr-generate and rpath are not supported on AIX, exclude all tests with them. - if ( - "%clang_profgen" in data - or "%clangxx_profgen" in data - or "-rpath" in data - ): + # rpath is not supported on AIX, exclude all tests with them. 
+ if ( "-rpath" in data ): config.excludes += [filename] finally: f.close() diff --git a/compiler-rt/test/rtsan/exit_stats.cpp b/compiler-rt/test/rtsan/exit_stats.cpp index d4d19ace778ba5ad78d77d4f8b41922a5d344edb..92ca58f1edde869327424f4e360d8330bf43c978 100644 --- a/compiler-rt/test/rtsan/exit_stats.cpp +++ b/compiler-rt/test/rtsan/exit_stats.cpp @@ -1,6 +1,7 @@ // RUN: %clangxx -fsanitize=realtime %s -o %t // RUN: %env_rtsan_opts="halt_on_error=false,print_stats_on_exit=true" %run %t 2>&1 | FileCheck %s // RUN: %env_rtsan_opts="halt_on_error=true,print_stats_on_exit=true" not %run %t 2>&1 | FileCheck %s --check-prefix=CHECK-HALT +// RUN: %env_rtsan_opts="suppressions=%s.supp,print_stats_on_exit=true" not %run %t 2>&1 | FileCheck %s --check-prefix=CHECK-SUPPRESSIONS // UNSUPPORTED: ios @@ -23,7 +24,17 @@ int main() { // CHECK: RealtimeSanitizer exit stats: // CHECK-NEXT: Total error count: 10 // CHECK-NEXT: Unique error count: 1 +// CHECK-NOT: Suppression count // CHECK-HALT: RealtimeSanitizer exit stats: // CHECK-HALT-NEXT: Total error count: 1 // CHECK-HALT-NEXT: Unique error count: 1 +// CHECK-HALT-NOT: Suppression count + +// We pass in intentionally_non_existant_function in the suppressions file +// This is just to ensure we only get the "Suppression count" metric if this +// file is passed at runtime, otherwise that statistic is omitted +// CHECK-SUPPRESSIONS: RealtimeSanitizer exit stats: +// CHECK-SUPPRESSIONS-NEXT: Total error count: 1 +// CHECK-SUPPRESSIONS-NEXT: Unique error count: 1 +// CHECK-SUPPRESSIONS-NEXT: Suppression count: 0 diff --git a/compiler-rt/test/rtsan/exit_stats.cpp.supp b/compiler-rt/test/rtsan/exit_stats.cpp.supp new file mode 100644 index 0000000000000000000000000000000000000000..b720bdb770808be021d89bab1c70e8e34e24e9c7 --- /dev/null +++ b/compiler-rt/test/rtsan/exit_stats.cpp.supp @@ -0,0 +1 @@ +function-name-matches:intentionally_non_existant_function diff --git a/compiler-rt/test/rtsan/stack_suppressions.cpp b/compiler-rt/test/rtsan/stack_suppressions.cpp index 2aceedbb313b11df282e2f1ea8534cc1d601af93..be1cf4963c7f80fac0f76145bb3ead4210d2fd82 100644 --- a/compiler-rt/test/rtsan/stack_suppressions.cpp +++ b/compiler-rt/test/rtsan/stack_suppressions.cpp @@ -1,5 +1,6 @@ // RUN: %clangxx -fsanitize=realtime %s -o %t -// RUN: %env_rtsan_opts=suppressions='%s.supp' not %run %t 2>&1 | FileCheck %s +// RUN: %env_rtsan_opts=halt_on_error=false %run %t 2>&1 | FileCheck %s --check-prefix=CHECK-NOSUPPRESSIONS +// RUN: %env_rtsan_opts=suppressions='%s.supp':print_stats_on_exit=true not %run %t 2>&1 | FileCheck %s // UNSUPPORTED: ios // Intent: Ensure that suppressions work as intended @@ -8,8 +9,11 @@ #include #include +#include #include +std::atomic cas_atomic{0}; + void *MallocViolation() { return malloc(10); } void VectorViolations() { @@ -22,13 +26,18 @@ void VectorViolations() { v.reserve(10); } -void BlockFunc() [[clang::blocking]] { usleep(1); } +void BlockFunc() [[clang::blocking]] { + int expected = 0; + while (!cas_atomic.compare_exchange_weak(expected, 1)) { + expected = cas_atomic.load(); + } +} void *process() [[clang::nonblocking]] { - void *ptr = MallocViolation(); - VectorViolations(); - BlockFunc(); - free(ptr); + void *ptr = MallocViolation(); // Suppressed call-stack-contains + VectorViolations(); // Suppressed call-stack-contains with regex + BlockFunc(); // Suppressed function-name-matches + free(ptr); // Suppressed function-name-matches // This is the one that should abort the program // Everything else is suppressed @@ -51,3 +60,12 @@ int 
main() { // CHECK-NOT: vector // CHECK-NOT: free // CHECK-NOT: BlockFunc + +// CHECK: RealtimeSanitizer exit stats: +// CHECK: Suppression count: 7 + +// CHECK-NOSUPPRESSIONS: malloc +// CHECK-NOSUPPRESSIONS: vector +// CHECK-NOSUPPRESSIONS: free +// CHECK-NOSUPPRESSIONS: BlockFunc +// CHECK-NOSUPPRESSIONS: usleep diff --git a/compiler-rt/test/rtsan/stack_suppressions.cpp.supp b/compiler-rt/test/rtsan/stack_suppressions.cpp.supp index bec4db259a3e0eabf20211b2658daaa8bf567932..9aaa5a5f089091e1811b91ccb346653e2eef9a1a 100644 --- a/compiler-rt/test/rtsan/stack_suppressions.cpp.supp +++ b/compiler-rt/test/rtsan/stack_suppressions.cpp.supp @@ -1,4 +1,5 @@ call-stack-contains:MallocViolation call-stack-contains:std::*vector -call-stack-contains:free -call-stack-contains:BlockFunc + +function-name-matches:free +function-name-matches:Block* diff --git a/compiler-rt/test/ubsan/TestCases/Misc/Posix/ubsan_options.cpp b/compiler-rt/test/ubsan/TestCases/Misc/Posix/ubsan_options.cpp index 284b4ba0abe47bc94bf6c8d6e2bf5dcc177a2d8e..515102715a660532f1cd445a90f8fc3fbcc0a7dc 100644 --- a/compiler-rt/test/ubsan/TestCases/Misc/Posix/ubsan_options.cpp +++ b/compiler-rt/test/ubsan/TestCases/Misc/Posix/ubsan_options.cpp @@ -1,9 +1,6 @@ // RUN: %clangxx -fsanitize=integer -fsanitize-recover=integer %s -o %t // RUN: not %run %t 2>&1 | FileCheck %s -// __ubsan_default_options() doesn't work on Darwin. -// XFAIL: darwin - #include extern "C" const char *__ubsan_default_options() { diff --git a/flang/docs/Extensions.md b/flang/docs/Extensions.md index 3ffd2949e45bf40c5acebaf2bc699940c1ffb6be..f85a3eb39ed191603d74f9657dc116530fbf9ff4 100644 --- a/flang/docs/Extensions.md +++ b/flang/docs/Extensions.md @@ -389,6 +389,8 @@ end * A local data object may appear in a specification expression, even when it is not a dummy argument or in COMMON, so long as it is has the SAVE attribute and was initialized. +* `PRINT namelistname` is accepted and interpreted as + `WRITE(*,NML=namelistname)`, a near-universal extension. ### Extensions supported when enabled by options diff --git a/flang/docs/GettingInvolved.md b/flang/docs/GettingInvolved.md index a8bd93517709ddc33d51bb454224e4e9b128cf25..e2220f36946699b8987feaa2793ad3b40fad81c3 100644 --- a/flang/docs/GettingInvolved.md +++ b/flang/docs/GettingInvolved.md @@ -17,6 +17,11 @@ The Flang Project welcomes contributions of all kinds. Please feel free to join the mailing list or the slack channel for discussions related to development of Flang. To understand the status of various developments in Flang please join the respective call. +## Contributing + +Contributions to Flang are done using GitHub Pull Requests and follow the +[LLVM contribution process](https://llvm.org/docs/Contributing.html). + ## Forum and Mailing Lists [Forum](https://discourse.llvm.org/c/subprojects/flang) @@ -27,8 +32,7 @@ To understand the status of various developments in Flang please join the respec [Commits Archive (flang-commits)](http://lists.llvm.org/pipermail/flang-commits) This list contains all commit messages that are made when Flang developers - commit code changes to the repository. It also serves as a forum for - patch review (i.e. send patches here). It is useful for those who want to + commit code changes to the repository. It is useful for those who want to stay on the bleeding edge of Flang development. This list is high volume. 
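Returning to the rtsan test changes above: stack_suppressions.cpp swapped usleep(1) for an atomic compare-and-swap spin so that BlockFunc blocks without calling an intercepted syscall. One subtlety worth noting is that compare_exchange_weak already writes the observed value back into `expected` on failure, so the test's explicit reload is redundant but harmless. A minimal illustration of the semantics:

```cpp
#include <atomic>
#include <cstdio>

int main() {
  std::atomic<int> a{0};
  int expected = 0;
  // Weak CAS may fail spuriously; on failure `expected` is updated to the
  // value actually observed, so no separate load is required in the loop.
  while (!a.compare_exchange_weak(expected, 1)) {
  }
  std::printf("a = %d, expected = %d\n", a.load(), expected);
}
```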
diff --git a/flang/include/flang/Common/Fortran-features.h b/flang/include/flang/Common/Fortran-features.h index 3942a79262864564361ba490ec8d036c996c51a9..2b57c7ae50642c8aa88200790e16925f6754f0e4 100644 --- a/flang/include/flang/Common/Fortran-features.h +++ b/flang/include/flang/Common/Fortran-features.h @@ -53,7 +53,7 @@ ENUM_CLASS(LanguageFeature, BackslashEscapes, OldDebugLines, NonBindCInteroperability, CudaManaged, CudaUnified, PolymorphicActualAllocatableOrPointerToMonomorphicDummy, RelaxedPureDummy, UndefinableAsynchronousOrVolatileActual, AutomaticInMainProgram, PrintCptr, - SavedLocalInSpecExpr) + SavedLocalInSpecExpr, PrintNamelist) // Portability and suspicious usage warnings ENUM_CLASS(UsageWarning, Portability, PointerToUndefinable, @@ -63,9 +63,9 @@ ENUM_CLASS(UsageWarning, Portability, PointerToUndefinable, F202XAllocatableBreakingChange, OptionalMustBePresent, CommonBlockPadding, LogicalVsCBool, BindCCharLength, ProcDummyArgShapes, ExternalNameConflict, FoldingException, FoldingAvoidsRuntimeCrash, FoldingValueChecks, - FoldingFailure, FoldingLimit, Interoperability, Bounds, Preprocessing, - Scanning, OpenAccUsage, ProcPointerCompatibility, VoidMold, - KnownBadImplicitInterface, EmptyCase, CaseOverflow, CUDAUsage, + FoldingFailure, FoldingLimit, Interoperability, CharacterInteroperability, + Bounds, Preprocessing, Scanning, OpenAccUsage, ProcPointerCompatibility, + VoidMold, KnownBadImplicitInterface, EmptyCase, CaseOverflow, CUDAUsage, IgnoreTKRUsage, ExternalInterfaceMismatch, DefinedOperatorArgs, Final, ZeroDoStep, UnusedForallIndex, OpenMPUsage, ModuleFile, DataLength, IgnoredDirective, HomonymousSpecific, HomonymousResult, diff --git a/flang/include/flang/Common/erfc-scaled.h b/flang/include/flang/Common/erfc-scaled.h new file mode 100644 index 0000000000000000000000000000000000000000..a1bf3ea0f09251d660777a2ef47496dd3f30250d --- /dev/null +++ b/flang/include/flang/Common/erfc-scaled.h @@ -0,0 +1,119 @@ +//===-- include/flang/Common/erfc-scaled.h ----------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef FORTRAN_COMMON_ERFC_SCALED_H_ +#define FORTRAN_COMMON_ERFC_SCALED_H_ + +#include <cmath> +#include <limits> + +namespace Fortran::common { +template <typename T> inline T ErfcScaled(T arg) { + // Coefficients for approximation to erfc in the first interval. + static const T a[5] = {3.16112374387056560e00, 1.13864154151050156e02, + 3.77485237685302021e02, 3.20937758913846947e03, 1.85777706184603153e-1}; + static const T b[4] = {2.36012909523441209e01, 2.44024637934444173e02, + 1.28261652607737228e03, 2.84423683343917062e03}; + + // Coefficients for approximation to erfc in the second interval. + static const T c[9] = {5.64188496988670089e-1, 8.88314979438837594e00, + 6.61191906371416295e01, 2.98635138197400131e02, 8.81952221241769090e02, + 1.71204761263407058e03, 2.05107837782607147e03, 1.23033935479799725e03, + 2.15311535474403846e-8}; + static const T d[8] = {1.57449261107098347e01, 1.17693950891312499e02, + 5.37181101862009858e02, 1.62138957456669019e03, 3.29079923573345963e03, + 4.36261909014324716e03, 3.43936767414372164e03, 1.23033935480374942e03}; + + // Coefficients for approximation to erfc in the third interval.
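The quantity this header approximates is the scaled complementary error function, erfc_scaled(x) = exp(x*x) * erfc(x). A quick standalone check against the C++ standard library (a test sketch, not part of flang) shows both the identity and why the scaled form exists, since the naive product overflows as soon as exp(x*x) does:

```cpp
#include <cmath>
#include <cstdio>

int main() {
  // For moderate x, exp(x*x) * erfc(x) is still representable in double.
  for (double x : {0.1, 0.5, 2.0, 8.0})
    std::printf("x=%4.1f  exp(x*x)*erfc(x) = %.15g\n", x,
                std::exp(x * x) * std::erfc(x));
  // By x ~ 27 the unscaled factor overflows, while erfc_scaled(27) is an
  // ordinary number near 1/(x*sqrt(pi)); hence the rational approximations.
  std::printf("exp(27*27) = %g\n", std::exp(27.0 * 27.0)); // inf
}
```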
+ static const T p[6] = {3.05326634961232344e-1, 3.60344899949804439e-1, + 1.25781726111229246e-1, 1.60837851487422766e-2, 6.58749161529837803e-4, + 1.63153871373020978e-2}; + static const T q[5] = {2.56852019228982242e00, 1.87295284992346047e00, + 5.27905102951428412e-1, 6.05183413124413191e-2, 2.33520497626869185e-3}; + + constexpr T sqrtpi{1.7724538509078120380404576221783883301349L}; + constexpr T rsqrtpi{0.5641895835477562869480794515607725858440L}; + constexpr T epsilonby2{std::numeric_limits<T>::epsilon() * 0.5}; + constexpr T xneg{-26.628e0}; + constexpr T xhuge{6.71e7}; + constexpr T thresh{0.46875e0}; + constexpr T zero{0.0}; + constexpr T one{1.0}; + constexpr T four{4.0}; + constexpr T sixteen{16.0}; + constexpr T xmax{1.0 / (sqrtpi * std::numeric_limits<T>::min())}; + static_assert(xmax > xhuge, "xmax must be greater than xhuge"); + + T ysq; + T xnum; + T xden; + T del; + T result; + + auto x{arg}; + auto y{std::fabs(x)}; + + if (y <= thresh) { + // evaluate erf for |x| <= 0.46875 + ysq = zero; + if (y > epsilonby2) { + ysq = y * y; + } + xnum = a[4] * ysq; + xden = ysq; + for (int i{0}; i < 3; i++) { + xnum = (xnum + a[i]) * ysq; + xden = (xden + b[i]) * ysq; + } + result = x * (xnum + a[3]) / (xden + b[3]); + result = one - result; + result = std::exp(ysq) * result; + return result; + } else if (y <= four) { + // evaluate erfc for 0.46875 < |x| <= 4.0 + xnum = c[8] * y; + xden = y; + for (int i{0}; i < 7; ++i) { + xnum = (xnum + c[i]) * y; + xden = (xden + d[i]) * y; + } + result = (xnum + c[7]) / (xden + d[7]); + } else { + // evaluate erfc for |x| > 4.0 + result = zero; + if (y >= xhuge) { + if (y < xmax) { + result = rsqrtpi / y; + } + } else { + ysq = one / (y * y); + xnum = p[5] * ysq; + xden = ysq; + for (int i{0}; i < 4; ++i) { + xnum = (xnum + p[i]) * ysq; + xden = (xden + q[i]) * ysq; + } + result = ysq * (xnum + p[4]) / (xden + q[4]); + result = (rsqrtpi - result) / y; + } + } + // fix up for negative argument, erf, etc. + if (x < zero) { + if (x < xneg) { + result = std::numeric_limits<T>::max(); + } else { + ysq = trunc(x * sixteen) / sixteen; + del = (x - ysq) * (x + ysq); + y = std::exp((ysq * ysq)) * std::exp((del)); + result = (y + y) - result; + } + } + return result; +} +} // namespace Fortran::common +#endif // FORTRAN_COMMON_ERFC_SCALED_H_ diff --git a/flang/include/flang/Evaluate/tools.h b/flang/include/flang/Evaluate/tools.h index d2887b69cc6de1c405ada68e29e6ed3b5b46aba4..f547138f5a116c52821a2cbd3a2af98455dc0809 100644 --- a/flang/include/flang/Evaluate/tools.h +++ b/flang/include/flang/Evaluate/tools.h @@ -1252,8 +1252,12 @@ private: // Predicate: should two expressions be considered identical for the purposes // of determining whether two procedure interfaces are compatible, modulo // naming of corresponding dummy arguments?
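The declarations that follow use the extern-template idiom: the header promises that explicit instantiations for the supported Expr kinds exist, and tools.cpp provides them, so translation units that include the header never instantiate the body themselves. A minimal sketch of the mechanism; the definition line would normally live in a single .cpp file and is kept in one file here only so the sketch runs:

```cpp
#include <cstdio>

template <typename T> T Twice(T v) { return v + v; }

// Header side: promise that an instantiation for int exists somewhere,
// suppressing implicit instantiation in every includer.
extern template int Twice<int>(int);

// Source side: exactly one translation unit provides the definition.
template int Twice<int>(int);

int main() { std::printf("%d\n", Twice(21)); } // links against that one copy
```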
-std::optional<bool> AreEquivalentInInterface(
+template <typename T>
+std::optional<bool> AreEquivalentInInterface(const Expr<T> &, const Expr<T> &);
+extern template std::optional<bool> AreEquivalentInInterface(
     const Expr<SomeInteger> &, const Expr<SomeInteger> &);
+extern template std::optional<bool> AreEquivalentInInterface(
+    const Expr<SomeType> &, const Expr<SomeType> &);
 
 bool CheckForCoindexedObject(parser::ContextualMessages &,
     const std::optional<ActualArgument> &, const std::string &procName,
diff --git a/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td b/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td
index f643674f1d5d6bc58ea242f0cfabab56801fdd61..98d1ef529738c761049fad4806b6b2595923b840 100644
--- a/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td
+++ b/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td
@@ -288,4 +288,23 @@ def cuf_KernelOp : cuf_Op<"kernel", [AttrSizedOperandSegments,
   let hasVerifier = 1;
 }
 
+def cuf_RegisterKernelOp : cuf_Op<"register_kernel", []> {
+  let summary = "Register a CUDA kernel";
+
+  let arguments = (ins
+    SymbolRefAttr:$name
+  );
+
+  let assemblyFormat = [{
+    $name attr-dict
+  }];
+
+  let hasVerifier = 1;
+
+  let extraClassDeclaration = [{
+    mlir::StringAttr getKernelName();
+    mlir::StringAttr getKernelModuleName();
+  }];
+}
+
 #endif // FORTRAN_DIALECT_CUF_CUF_OPS
diff --git a/flang/include/flang/Optimizer/Transforms/Passes.td b/flang/include/flang/Optimizer/Transforms/Passes.td
index bf75123e853779fc953c0af1bbd304f425e5056e..af6bd41cbb71daec6fa78e0773d3a57bb7e72a8b 100644
--- a/flang/include/flang/Optimizer/Transforms/Passes.td
+++ b/flang/include/flang/Optimizer/Transforms/Passes.td
@@ -439,7 +439,7 @@ def CufImplicitDeviceGlobal :
 def CUFAddConstructor : Pass<"cuf-add-constructor", "mlir::ModuleOp"> {
   let summary = "Add constructor to register CUDA Fortran allocators";
   let dependentDialects = [
-    "mlir::func::FuncDialect"
+    "cuf::CUFDialect", "mlir::func::FuncDialect"
   ];
 }
 
diff --git a/flang/include/flang/Parser/dump-parse-tree.h b/flang/include/flang/Parser/dump-parse-tree.h
index 5d243b4e5d3e9a522835f6809a8b41f0dfcb3070..ccbe5475d051e03905087eb5d1e7f8d617adf5e2 100644
--- a/flang/include/flang/Parser/dump-parse-tree.h
+++ b/flang/include/flang/Parser/dump-parse-tree.h
@@ -884,8 +884,10 @@ protected:
     } else if constexpr (HasSource<T>::value) {
       return x.source.ToString();
 #endif
-    } else if constexpr (std::is_same_v<T, std::string>) {
-      return x;
+    } else if constexpr (std::is_same_v<T, int>) {
+      return std::to_string(x);
+    } else if constexpr (std::is_same_v<T, bool>) {
+      return x ? "true" : "false";
     } else {
       return "";
     }
diff --git a/flang/include/flang/Semantics/type.h b/flang/include/flang/Semantics/type.h
index e2131e7e160cb607426013028d8ab1d78fc845b4..3522191502059586893ca4496d468edd517c6a5e 100644
--- a/flang/include/flang/Semantics/type.h
+++ b/flang/include/flang/Semantics/type.h
@@ -29,6 +29,13 @@ namespace Fortran::parser {
 struct Keyword;
 }
 
+namespace Fortran::evaluate { // avoid including all of Evaluate/tools.h
+template <typename T>
+std::optional<bool> AreEquivalentInInterface(const Expr<T> &, const Expr<T> &);
+extern template std::optional<bool> AreEquivalentInInterface(
+    const Expr<SomeInteger> &, const Expr<SomeInteger> &);
+} // namespace Fortran::evaluate
+
 namespace Fortran::semantics {
 
 class Scope;
@@ -110,6 +117,11 @@ public:
     return category_ == that.category_ && expr_ == that.expr_;
   }
   bool operator!=(const ParamValue &that) const { return !(*this == that); }
+  bool IsEquivalentInInterface(const ParamValue &that) const {
+    return (category_ == that.category_ &&
+        expr_.has_value() == that.expr_.has_value() &&
+        (!expr_ || evaluate::AreEquivalentInInterface(*expr_, *that.expr_)));
+  }
   std::string AsFortran() const;
 
 private:
diff --git a/flang/lib/Common/Fortran-features.cpp b/flang/lib/Common/Fortran-features.cpp
index 59f570e6ab6e938f89f6dbef2b537a563be99acc..a53f32d74dc37debd94bc3ac25daf8a8f0944d4b 100644
--- a/flang/lib/Common/Fortran-features.cpp
+++ b/flang/lib/Common/Fortran-features.cpp
@@ -48,6 +48,7 @@ LanguageFeatureControl::LanguageFeatureControl() {
   warnUsage_.set(UsageWarning::FoldingFailure);
   warnUsage_.set(UsageWarning::FoldingLimit);
   warnUsage_.set(UsageWarning::Interoperability);
+  // CharacterInteroperability warnings about length are off by default
   warnUsage_.set(UsageWarning::Bounds);
   warnUsage_.set(UsageWarning::Preprocessing);
   warnUsage_.set(UsageWarning::Scanning);
diff --git a/flang/lib/Evaluate/intrinsics-library.cpp b/flang/lib/Evaluate/intrinsics-library.cpp
index ce9dd6b7b3df864c52c1f4df10f90e4faaee43ad..bb439a6bb3a746b3918555520765535315ccd265 100644
--- a/flang/lib/Evaluate/intrinsics-library.cpp
+++ b/flang/lib/Evaluate/intrinsics-library.cpp
@@ -14,6 +14,7 @@
 #include "flang/Evaluate/intrinsics-library.h"
 #include "fold-implementation.h"
 #include "host.h"
+#include "flang/Common/erfc-scaled.h"
 #include "flang/Common/static-multimap-view.h"
 #include "flang/Evaluate/expression.h"
 #include <cmath>
@@ -231,6 +232,7 @@ struct HostRuntimeLibrary<HostT, LibraryVersion::Libm> {
       FolderFactory<F, F{std::cosh}>::Create("cosh"),
       FolderFactory<F, F{std::erf}>::Create("erf"),
       FolderFactory<F, F{std::erfc}>::Create("erfc"),
+      FolderFactory<F, F{common::ErfcScaled<HostT>}>::Create("erfc_scaled"),
       FolderFactory<F, F{std::exp}>::Create("exp"),
       FolderFactory<F, F{std::tgamma}>::Create("gamma"),
       FolderFactory<F, F{std::log}>::Create("log"),
@@ -415,7 +417,7 @@ template <> struct HostRuntimeLibrary<double, LibraryVersion::LibmExtensions> {
   using F = FuncPointer<long double, long double>;
   static_assert(map.Verify(), "map must be sorted");
 };
-#if HAS_FLOAT80 || HAS_LDBL128
+#if defined(__GLIBC__) && (HAS_FLOAT80 || HAS_LDBL128)
 template <> struct HostRuntimeLibrary<long double, LibraryVersion::LibmExtensions> {
   using F = FuncPointer<long double, long double>;
diff --git a/flang/lib/Evaluate/intrinsics.cpp b/flang/lib/Evaluate/intrinsics.cpp
index 1f48fc21662ebbfcc316e64a633b1f3607a676e2..4271faa0db12bfbef86f654a7772e5639ec3986c 100644
--- a/flang/lib/Evaluate/intrinsics.cpp
+++ b/flang/lib/Evaluate/intrinsics.cpp
@@ -2861,12 +2861,22 @@ IntrinsicProcTable::Implementation::HandleC_F_Pointer(
       }
     } else if (!IsInteroperableIntrinsicType(
                     *type, &context.languageFeatures())
-                    .value_or(true) &&
-        context.languageFeatures().ShouldWarn(
-            common::UsageWarning::Interoperability)) {
-      context.messages().Say(common::UsageWarning::Interoperability, at,
-          "FPTR= argument to C_F_POINTER() should not have the non-interoperable intrinsic type %s"_warn_en_US,
-          type->AsFortran());
+                    .value_or(true)) {
+      if (type->category() == TypeCategory::Character &&
+          type->kind() == 1) {
+        if (context.languageFeatures().ShouldWarn(
+                common::UsageWarning::CharacterInteroperability)) {
+          context.messages().Say(
+              common::UsageWarning::CharacterInteroperability, at,
+              "FPTR= argument to C_F_POINTER() should not have the non-interoperable character length %s"_warn_en_US,
+              type->AsFortran());
+        }
+      } else if (context.languageFeatures().ShouldWarn(
+                     common::UsageWarning::Interoperability)) {
+        context.messages().Say(common::UsageWarning::Interoperability, at,
+            "FPTR= argument to C_F_POINTER() should not have the non-interoperable intrinsic type or kind %s"_warn_en_US,
+            type->AsFortran());
+      }
     }
     if (ExtractCoarrayRef(*expr)) {
       context.messages().Say(at,
@@ -2963,12 +2973,23 @@ std::optional<SpecificCall> IntrinsicProcTable::Implementation::HandleC_Loc(
     context.messages().Say(arguments[0]->sourceLocation(),
         "C_LOC() argument may not be zero-length character"_err_en_US);
   } else if (typeAndShape->type().category() != TypeCategory::Derived &&
-      !IsInteroperableIntrinsicType(typeAndShape->type()).value_or(true) &&
-      context.languageFeatures().ShouldWarn(
-          common::UsageWarning::Interoperability)) {
-    context.messages().Say(common::UsageWarning::Interoperability,
-        arguments[0]->sourceLocation(),
-        "C_LOC() argument has non-interoperable intrinsic type, kind, or length"_warn_en_US);
+      !IsInteroperableIntrinsicType(typeAndShape->type()).value_or(true)) {
+    if (typeAndShape->type().category() == TypeCategory::Character &&
+        typeAndShape->type().kind() == 1) {
+      // Default character kind, but length is not known to be 1
+      if (context.languageFeatures().ShouldWarn(
+              common::UsageWarning::CharacterInteroperability)) {
+        context.messages().Say(
+            common::UsageWarning::CharacterInteroperability,
+            arguments[0]->sourceLocation(),
+            "C_LOC() argument has non-interoperable character length"_warn_en_US);
+      }
+    } else if (context.languageFeatures().ShouldWarn(
+                   common::UsageWarning::Interoperability)) {
+      context.messages().Say(common::UsageWarning::Interoperability,
+          arguments[0]->sourceLocation(),
+          "C_LOC() argument has non-interoperable intrinsic type or kind"_warn_en_US);
+    }
   }
 
   characteristics::DummyDataObject ddo{std::move(*typeAndShape)};
diff --git a/flang/lib/Evaluate/tools.cpp b/flang/lib/Evaluate/tools.cpp
index c2545a87099426e00bfd5444117a3aab0144bfe3..4d98220a7065ca183cc415b7275af386cdab973f 100644
--- a/flang/lib/Evaluate/tools.cpp
+++ b/flang/lib/Evaluate/tools.cpp
@@ -1320,8 +1320,10 @@ std::optional<Expr<SomeType>> HollerithToBOZ(FoldingContext &context,
 
 // Extracts a whole symbol being used as a bound of a dummy argument,
 // possibly wrapped with parentheses or MAX(0, ...).
+// Works with any integer expression.
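+// Illustrative aside (editorial note, not part of the original patch):
+// resolving a bound to its symbol is what lets two interfaces such as
+//   subroutine s(a, b)
+//     use m1          ! where m1 defines integer :: n
+//     character(n) :: a, b
+//   end
+// compare as equivalent when both length bounds name the same module
+// variable n; see flang/test/Semantics/smp-def01.f90 later in this patch.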
+template <typename T> const Symbol *GetBoundSymbol(const Expr<T> &);
 template <int KIND>
-static const Symbol *GetBoundSymbol(
+const Symbol *GetBoundSymbol(
     const Expr<Type<TypeCategory::Integer, KIND>> &expr) {
   using T = Type<TypeCategory::Integer, KIND>;
   return common::visit(
@@ -1358,9 +1360,15 @@ static const Symbol *GetBoundSymbol(
       },
       expr.u);
 }
+template <>
+const Symbol *GetBoundSymbol(const Expr<SomeInteger> &expr) {
+  return common::visit(
+      [](const auto &kindExpr) { return GetBoundSymbol(kindExpr); }, expr.u);
+}
+template <typename T>
 std::optional<bool> AreEquivalentInInterface(
-    const Expr<SomeInteger> &x, const Expr<SomeInteger> &y) {
+    const Expr<T> &x, const Expr<T> &y) {
   auto xVal{ToInt64(x)};
   auto yVal{ToInt64(y)};
   if (xVal && yVal) {
@@ -1394,6 +1402,10 @@ std::optional<bool> AreEquivalentInInterface(
     return std::nullopt; // not sure
   }
 }
+template std::optional<bool> AreEquivalentInInterface(
+    const Expr<SomeInteger> &, const Expr<SomeInteger> &);
+template std::optional<bool> AreEquivalentInInterface(
+    const Expr<SomeType> &, const Expr<SomeType> &);
 
 bool CheckForCoindexedObject(parser::ContextualMessages &messages,
     const std::optional<ActualArgument> &arg, const std::string &procName,
diff --git a/flang/lib/Evaluate/type.cpp b/flang/lib/Evaluate/type.cpp
index a1df40667471ad6f140cef289df9854166e330da..c00688853cd0069008fb24b93488706f2f6c0fbf 100644
--- a/flang/lib/Evaluate/type.cpp
+++ b/flang/lib/Evaluate/type.cpp
@@ -518,7 +518,10 @@ static bool AreSameDerivedType(
 
 bool DynamicType::IsEquivalentTo(const DynamicType &that) const {
   return category_ == that.category_ && kind_ == that.kind_ &&
-      PointeeComparison(charLengthParamValue_, that.charLengthParamValue_) &&
+      (charLengthParamValue_ == that.charLengthParamValue_ ||
+          (charLengthParamValue_ && that.charLengthParamValue_ &&
+              charLengthParamValue_->IsEquivalentInInterface(
+                  *that.charLengthParamValue_))) &&
       knownLength().has_value() == that.knownLength().has_value() &&
       (!knownLength() || *knownLength() == *that.knownLength()) &&
       AreSameDerivedType(derived_, that.derived_);
diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp
index 70d89f5e521a59933650cbc3c21b20b26db49585..cf469003b7298d8a0e5a6a769d5975df0e721cf4 100644
--- a/flang/lib/Lower/OpenMP/OpenMP.cpp
+++ b/flang/lib/Lower/OpenMP/OpenMP.cpp
@@ -2070,7 +2070,9 @@ static void genStandaloneSimd(lower::AbstractConverter &converter,
                        loopNestClauseOps, iv);
 
   EntryBlockArgs simdArgs;
-  // TODO: Add private, reduction syms and vars.
+  // TODO: Add private syms and vars.
+  simdArgs.reduction.syms = simdReductionSyms;
+  simdArgs.reduction.vars = simdClauseOps.reductionVars;
   auto simdOp =
       genWrapperOp<mlir::omp::SimdOp>(converter, loc, simdClauseOps, simdArgs);
 
@@ -2228,7 +2230,9 @@ static void genCompositeDistributeParallelDoSimd(
   wsloopOp.setComposite(/*val=*/true);
 
   EntryBlockArgs simdArgs;
-  // TODO: Add private, reduction syms and vars.
+  // TODO: Add private syms and vars.
+  simdArgs.reduction.syms = simdReductionSyms;
+  simdArgs.reduction.vars = simdClauseOps.reductionVars;
   auto simdOp =
       genWrapperOp<mlir::omp::SimdOp>(converter, loc, simdClauseOps, simdArgs);
   simdOp.setComposite(/*val=*/true);
@@ -2285,7 +2289,9 @@ static void genCompositeDistributeSimd(lower::AbstractConverter &converter,
   distributeOp.setComposite(/*val=*/true);
 
   EntryBlockArgs simdArgs;
-  // TODO: Add private, reduction syms and vars.
+  // TODO: Add private syms and vars.
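+  // Editorial note, not in the original patch: wiring the reduction syms and
+  // vars into the wrapper's entry-block arguments is what binds, e.g.,
+  // "!$omp simd reduction(+:x)" to the omp.simd region argument for x (see
+  // flang/test/Lower/OpenMP/simd.f90 later in this patch); the privatization
+  // half remains the TODO above.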
+  simdArgs.reduction.syms = simdReductionSyms;
+  simdArgs.reduction.vars = simdClauseOps.reductionVars;
   auto simdOp =
       genWrapperOp<mlir::omp::SimdOp>(converter, loc, simdClauseOps, simdArgs);
   simdOp.setComposite(/*val=*/true);
 
@@ -2342,7 +2348,9 @@ static void genCompositeDoSimd(lower::AbstractConverter &converter,
   wsloopOp.setComposite(/*val=*/true);
 
   EntryBlockArgs simdArgs;
-  // TODO: Add private, reduction syms and vars.
+  // TODO: Add private syms and vars.
+  simdArgs.reduction.syms = simdReductionSyms;
+  simdArgs.reduction.vars = simdClauseOps.reductionVars;
   auto simdOp =
       genWrapperOp<mlir::omp::SimdOp>(converter, loc, simdClauseOps, simdArgs);
   simdOp.setComposite(/*val=*/true);
 
diff --git a/flang/lib/Optimizer/CodeGen/CodeGen.cpp b/flang/lib/Optimizer/CodeGen/CodeGen.cpp
index 9b624efa05381372d2095f83dbcb94eb8afb318b..68b8c6613585e6e6e91a74833205911615de7064 100644
--- a/flang/lib/Optimizer/CodeGen/CodeGen.cpp
+++ b/flang/lib/Optimizer/CodeGen/CodeGen.cpp
@@ -3373,19 +3373,7 @@ struct AbsentOpConversion : public fir::FIROpConversion<fir::AbsentOp> {
   matchAndRewrite(fir::AbsentOp absent, OpAdaptor,
                   mlir::ConversionPatternRewriter &rewriter) const override {
     mlir::Type ty = convertType(absent.getType());
-    mlir::Location loc = absent.getLoc();
-
-    if (mlir::isa<fir::BoxCharType>(absent.getType())) {
-      auto structTy = mlir::cast<mlir::LLVM::LLVMStructType>(ty);
-      assert(!structTy.isOpaque() && !structTy.getBody().empty());
-      auto undefStruct = rewriter.create<mlir::LLVM::UndefOp>(loc, ty);
-      auto nullField =
-          rewriter.create<mlir::LLVM::ZeroOp>(loc, structTy.getBody()[0]);
-      rewriter.replaceOpWithNewOp<mlir::LLVM::InsertValueOp>(
-          absent, undefStruct, nullField, 0);
-    } else {
-      rewriter.replaceOpWithNewOp<mlir::LLVM::UndefOp>(absent, ty);
-    }
+    rewriter.replaceOpWithNewOp<mlir::LLVM::ZeroOp>(absent, ty);
     return mlir::success();
   }
 };
diff --git a/flang/lib/Optimizer/Dialect/CUF/CMakeLists.txt b/flang/lib/Optimizer/Dialect/CUF/CMakeLists.txt
index 83d468bafdfeb6ddbe88fcc2aca22d19de4da954..b2221199995d58c51209b71a13666d20c131b60d 100644
--- a/flang/lib/Optimizer/Dialect/CUF/CMakeLists.txt
+++ b/flang/lib/Optimizer/Dialect/CUF/CMakeLists.txt
@@ -14,6 +14,7 @@ add_flang_library(CUFDialect
   FIRDialect
   FIRDialectSupport
   MLIRIR
+  MLIRGPUDialect
   MLIRTargetLLVMIRExport
 
   LINK_COMPONENTS
diff --git a/flang/lib/Optimizer/Dialect/CUF/CUFOps.cpp b/flang/lib/Optimizer/Dialect/CUF/CUFOps.cpp
index 7fb2dcf4af115c002a99471dadfa0bcd5daef2f7..0b03e070a0076e8c5d6ef10618bf2a9b693f492e 100644
--- a/flang/lib/Optimizer/Dialect/CUF/CUFOps.cpp
+++ b/flang/lib/Optimizer/Dialect/CUF/CUFOps.cpp
@@ -15,6 +15,8 @@
 #include "flang/Optimizer/Dialect/CUF/CUFDialect.h"
 #include "flang/Optimizer/Dialect/FIRAttr.h"
 #include "flang/Optimizer/Dialect/FIRType.h"
+#include "mlir/Dialect/GPU/IR/GPUDialect.h"
+#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
 #include "mlir/IR/Attributes.h"
 #include "mlir/IR/BuiltinAttributes.h"
 #include "mlir/IR/BuiltinOps.h"
@@ -253,6 +255,50 @@ llvm::LogicalResult cuf::KernelOp::verify() {
   return mlir::success();
 }
 
+//===----------------------------------------------------------------------===//
+// RegisterKernelOp
+//===----------------------------------------------------------------------===//
+
+mlir::StringAttr cuf::RegisterKernelOp::getKernelModuleName() {
+  return getName().getRootReference();
+}
+
+mlir::StringAttr cuf::RegisterKernelOp::getKernelName() {
+  return getName().getLeafReference();
+}
+
+mlir::LogicalResult cuf::RegisterKernelOp::verify() {
+  if (getKernelName() == getKernelModuleName())
+    return emitOpError("expect a module and a kernel name");
+
+  auto mod = getOperation()->getParentOfType<mlir::ModuleOp>();
+  if (!mod)
+    return emitOpError("expect to be in a module");
+
+  mlir::SymbolTable symTab(mod);
+  auto gpuMod = symTab.lookup<mlir::gpu::GPUModuleOp>(getKernelModuleName());
+  if (!gpuMod) {
+    // If already a gpu.binary then stop the check here.
+    if (symTab.lookup<mlir::gpu::BinaryOp>(getKernelModuleName()))
+      return mlir::success();
+    return emitOpError("gpu module not found");
+  }
+
+  mlir::SymbolTable gpuSymTab(gpuMod);
+  if (auto func = gpuSymTab.lookup<mlir::gpu::GPUFuncOp>(getKernelName())) {
+    if (!func.isKernel())
+      return emitOpError("only kernel gpu.func can be registered");
+    return mlir::success();
+  } else if (auto func =
+                 gpuSymTab.lookup<mlir::LLVM::LLVMFuncOp>(getKernelName())) {
+    if (!func->getAttrOfType<mlir::UnitAttr>(
+            mlir::gpu::GPUDialect::getKernelFuncAttrName()))
+      return emitOpError("only gpu.kernel llvm.func can be registered");
+    return mlir::success();
+  }
+  return emitOpError("device function not found");
+}
+
 // Tablegen operators
 
 #define GET_OP_CLASSES
diff --git a/flang/lib/Optimizer/Transforms/CMakeLists.txt b/flang/lib/Optimizer/Transforms/CMakeLists.txt
index 5e1a0293e63c978a8c709c8a6de2d905808e157f..352fe4cbe09e99c8a9d0b44785107e0241c19b45 100644
--- a/flang/lib/Optimizer/Transforms/CMakeLists.txt
+++ b/flang/lib/Optimizer/Transforms/CMakeLists.txt
@@ -49,6 +49,7 @@ add_flang_library(FIRTransforms
   HLFIRDialect
   MLIRAffineUtils
   MLIRFuncDialect
+  MLIRGPUDialect
   MLIRLLVMDialect
   MLIRLLVMCommonConversion
   MLIRMathTransforms
diff --git a/flang/lib/Optimizer/Transforms/CUFAddConstructor.cpp b/flang/lib/Optimizer/Transforms/CUFAddConstructor.cpp
index 48620fbc585861e9612930375b92982959f37b64..3db24226e750423b6535034c34893b52e34b2306 100644
--- a/flang/lib/Optimizer/Transforms/CUFAddConstructor.cpp
+++ b/flang/lib/Optimizer/Transforms/CUFAddConstructor.cpp
@@ -12,6 +12,7 @@
 #include "flang/Optimizer/Dialect/FIRDialect.h"
 #include "flang/Optimizer/Dialect/FIROpsSupport.h"
 #include "flang/Runtime/entry-names.h"
+#include "mlir/Dialect/GPU/IR/GPUDialect.h"
 #include "mlir/Dialect/LLVMIR/LLVMDialect.h"
 #include "mlir/Pass/Pass.h"
 #include "llvm/ADT/SmallVector.h"
@@ -23,6 +24,8 @@ namespace fir {
 
 namespace {
 
+static constexpr llvm::StringRef cudaModName{"cuda_device_mod"};
+
 static constexpr llvm::StringRef cudaFortranCtorName{
     "__cudaFortranConstructor"};
 
@@ -31,6 +34,7 @@ struct CUFAddConstructor
 
   void runOnOperation() override {
     mlir::ModuleOp mod = getOperation();
+    mlir::SymbolTable symTab(mod);
     mlir::OpBuilder builder{mod.getBodyRegion()};
     builder.setInsertionPointToEnd(mod.getBody());
     mlir::Location loc = mod.getLoc();
@@ -48,13 +52,25 @@ struct CUFAddConstructor
         mod.getContext(), RTNAME_STRING(CUFRegisterAllocator));
     builder.setInsertionPointToEnd(mod.getBody());
 
-    // Create the constructor function that cal CUFRegisterAllocator.
-    builder.setInsertionPointToEnd(mod.getBody());
+    // Create the constructor function that calls CUFRegisterAllocator.
     auto func = builder.create<mlir::LLVM::LLVMFuncOp>(loc, cudaFortranCtorName,
                                                        funcTy);
     func.setLinkage(mlir::LLVM::Linkage::Internal);
     builder.setInsertionPointToStart(func.addEntryBlock(builder));
     builder.create<mlir::LLVM::CallOp>(loc, funcTy, cufRegisterAllocatorRef);
+
+    // Register kernels
+    auto gpuMod = symTab.lookup<mlir::gpu::GPUModuleOp>(cudaModName);
+    if (gpuMod) {
+      for (auto func : gpuMod.getOps<mlir::gpu::GPUFuncOp>()) {
+        if (func.isKernel()) {
+          auto kernelName = mlir::SymbolRefAttr::get(
+              builder.getStringAttr(cudaModName),
+              {mlir::SymbolRefAttr::get(builder.getContext(), func.getName())});
+          builder.create<cuf::RegisterKernelOp>(loc, kernelName);
+        }
+      }
+    }
     builder.create<mlir::LLVM::ReturnOp>(loc, mlir::ValueRange{});
 
     // Create the llvm.global_ctor with the function.
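Editorial aside, not part of the patch: the Fortran-level trigger for the two CUF changes above is an attributes(global) kernel, which flang lowers to a gpu.func inside the cuda_device_mod GPU module; the rewritten constructor then walks that module and emits one cuf.register_kernel per kernel. A minimal, hypothetical CUDA Fortran sketch (all names illustrative, nothing here is taken from the patch):

! Hypothetical CUDA Fortran input; its kernel would appear as a kernel
! gpu.func in cuda_device_mod and thus be registered by the constructor.
module kernels
contains
  attributes(global) subroutine saxpy(a, x, y, n)
    real, value :: a
    integer, value :: n
    real, device :: x(n), y(n)
    integer :: i
    i = threadIdx%x + (blockIdx%x - 1) * blockDim%x
    if (i <= n) y(i) = a * x(i) + y(i)
  end subroutine saxpy
end module kernels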
diff --git a/flang/lib/Parser/parsing.cpp b/flang/lib/Parser/parsing.cpp index d8448e4c527ac71fcc9bd37320236d967b61a1df..e2381a6b8ffa3e167192bad2d0fddf3117df70ce 100644 --- a/flang/lib/Parser/parsing.cpp +++ b/flang/lib/Parser/parsing.cpp @@ -75,6 +75,7 @@ const SourceFile *Parsing::Prescan(const std::string &path, Options options) { messages_, *currentCooked_, preprocessor_, options.features}; prescanner.set_fixedForm(options.isFixedForm) .set_fixedFormColumnLimit(options.fixedFormColumns) + .set_preprocessingOnly(options.prescanAndReformat) .set_expandIncludeLines(!options.prescanAndReformat || options.expandIncludeLinesInPreprocessedOutput) .AddCompilerDirectiveSentinel("dir$"); diff --git a/flang/lib/Parser/prescan.cpp b/flang/lib/Parser/prescan.cpp index 47260c0680463d8418836b3e52a6ceea61be2538..1d2f1e9766879232dd118e3cf9cdf6d0bb4716f7 100644 --- a/flang/lib/Parser/prescan.cpp +++ b/flang/lib/Parser/prescan.cpp @@ -36,6 +36,8 @@ Prescanner::Prescanner(const Prescanner &that, Preprocessor &prepro, bool isNestedInIncludeDirective) : messages_{that.messages_}, cooked_{that.cooked_}, preprocessor_{prepro}, allSources_{that.allSources_}, features_{that.features_}, + preprocessingOnly_{that.preprocessingOnly_}, + expandIncludeLines_{that.expandIncludeLines_}, isNestedInIncludeDirective_{isNestedInIncludeDirective}, backslashFreeFormContinuation_{that.backslashFreeFormContinuation_}, inFixedForm_{that.inFixedForm_}, @@ -288,8 +290,8 @@ void Prescanner::Statement() { break; case LineClassification::Kind::Source: if (inFixedForm_) { - if (preprocessed->HasBlanks(/*after column*/ 6)) { - preprocessed->RemoveBlanks(/*after column*/ 6); + if (!preprocessingOnly_ && preprocessed->HasBlanks()) { + preprocessed->RemoveBlanks(); } } else { while (SourceLineContinuation(*preprocessed)) { @@ -622,7 +624,7 @@ const char *Prescanner::SkipCComment(const char *p) const { bool Prescanner::NextToken(TokenSequence &tokens) { CHECK(at_ >= start_ && at_ < limit_); - if (InFixedFormSource()) { + if (InFixedFormSource() && !preprocessingOnly_) { SkipSpaces(); } else { if (*at_ == '/' && IsCComment(at_)) { diff --git a/flang/lib/Parser/prescan.h b/flang/lib/Parser/prescan.h index c50bf231e3c70563aceefd69c6555b00fbaa3c53..08041f93b14b6c771a83982f954e86bfa4dd19f4 100644 --- a/flang/lib/Parser/prescan.h +++ b/flang/lib/Parser/prescan.h @@ -48,6 +48,10 @@ public: Preprocessor &preprocessor() { return preprocessor_; } common::LanguageFeatureControl &features() { return features_; } + Prescanner &set_preprocessingOnly(bool yes) { + preprocessingOnly_ = yes; + return *this; + } Prescanner &set_expandIncludeLines(bool yes) { expandIncludeLines_ = yes; return *this; @@ -213,6 +217,7 @@ private: Preprocessor &preprocessor_; AllSources &allSources_; common::LanguageFeatureControl features_; + bool preprocessingOnly_{false}; bool expandIncludeLines_{true}; bool isNestedInIncludeDirective_{false}; bool backslashFreeFormContinuation_{false}; diff --git a/flang/lib/Semantics/check-omp-structure.cpp b/flang/lib/Semantics/check-omp-structure.cpp index 5cdc8602cf4c082aba8c648968019ec4d6234dfe..85a4ed7821ec197f207cfeb6a4c5854ab89799f4 100644 --- a/flang/lib/Semantics/check-omp-structure.cpp +++ b/flang/lib/Semantics/check-omp-structure.cpp @@ -68,11 +68,23 @@ public: if (const auto *e{GetExpr(context_, expr)}) { for (const Symbol &symbol : evaluate::CollectSymbols(*e)) { const Symbol &root{GetAssociationRoot(symbol)}; - if (IsFunction(root) && !IsElementalProcedure(root)) { - context_.Say(expr.source, - "User defined non-ELEMENTAL 
function " - "'%s' is not allowed in a WORKSHARE construct"_err_en_US, - root.name()); + if (IsFunction(root)) { + std::string attrs{""}; + if (!IsElementalProcedure(root)) { + attrs = " non-ELEMENTAL"; + } + if (root.attrs().test(Attr::IMPURE)) { + if (attrs != "") { + attrs = "," + attrs; + } + attrs = " IMPURE" + attrs; + } + if (attrs != "") { + context_.Say(expr.source, + "User defined%s function '%s' is not allowed in a " + "WORKSHARE construct"_err_en_US, + attrs, root.name()); + } } } } @@ -2273,6 +2285,21 @@ void OmpStructureChecker::Leave(const parser::OmpClauseList &) { } } } + + // 2.11.5 Simd construct restriction (OpenMP 5.1) + if (auto *sl_clause{FindClause(llvm::omp::Clause::OMPC_safelen)}) { + if (auto *o_clause{FindClause(llvm::omp::Clause::OMPC_order)}) { + const auto &orderClause{ + std::get(o_clause->u)}; + if (std::get(orderClause.v.t) == + parser::OmpOrderClause::Type::Concurrent) { + context_.Say(sl_clause->source, + "The `SAFELEN` clause cannot appear in the `SIMD` directive " + "with `ORDER(CONCURRENT)` clause"_err_en_US); + } + } + } + // Sema checks related to presence of multiple list items within the same // clause CheckMultListItems(); diff --git a/flang/lib/Semantics/resolve-names.cpp b/flang/lib/Semantics/resolve-names.cpp index f1ce0b415ebe9c4c6ee9239de4d97f227d810e17..2fa5b75e073b6313786341e648b3d5a425b8c332 100644 --- a/flang/lib/Semantics/resolve-names.cpp +++ b/flang/lib/Semantics/resolve-names.cpp @@ -720,6 +720,7 @@ protected: bool inSpecificationPart_{false}; bool deferImplicitTyping_{false}; + bool skipImplicitTyping_{false}; bool inEquivalenceStmt_{false}; // Some information is collected from a specification part for deferred @@ -758,6 +759,10 @@ protected: } } + void SkipImplicitTyping(bool skip) { + deferImplicitTyping_ = skipImplicitTyping_ = skip; + } + private: Scope *currScope_{nullptr}; FuncResultStack funcResultStack_{*this}; @@ -1506,6 +1511,25 @@ public: void Post(const parser::OmpEndCriticalDirective &) { messageHandler().set_currStmtSource(std::nullopt); } + bool Pre(const parser::OpenMPThreadprivate &) { + SkipImplicitTyping(true); + return true; + } + void Post(const parser::OpenMPThreadprivate &) { SkipImplicitTyping(false); } + bool Pre(const parser::OpenMPDeclareTargetConstruct &) { + SkipImplicitTyping(true); + return true; + } + void Post(const parser::OpenMPDeclareTargetConstruct &) { + SkipImplicitTyping(false); + } + bool Pre(const parser::OpenMPDeclarativeAllocate &) { + SkipImplicitTyping(true); + return true; + } + void Post(const parser::OpenMPDeclarativeAllocate &) { + SkipImplicitTyping(false); + } }; bool OmpVisitor::NeedsScope(const parser::OpenMPBlockConstruct &x) { @@ -2557,8 +2581,10 @@ void ScopeHandler::ApplyImplicitRules( return; } if (const DeclTypeSpec * type{GetImplicitType(symbol)}) { - symbol.set(Symbol::Flag::Implicit); - symbol.SetType(*type); + if (!skipImplicitTyping_) { + symbol.set(Symbol::Flag::Implicit); + symbol.SetType(*type); + } return; } if (symbol.has() && !symbol.attrs().test(Attr::EXTERNAL)) { diff --git a/flang/lib/Semantics/rewrite-parse-tree.cpp b/flang/lib/Semantics/rewrite-parse-tree.cpp index b4fb72ce213017c32f96475bd97a66e3be418c2c..c90ae66342840ed059213c198b0942b8e16fab08 100644 --- a/flang/lib/Semantics/rewrite-parse-tree.cpp +++ b/flang/lib/Semantics/rewrite-parse-tree.cpp @@ -32,7 +32,7 @@ using namespace parser::literals; class RewriteMutator { public: RewriteMutator(SemanticsContext &context) - : errorOnUnresolvedName_{!context.AnyFatalError()}, + : context_{context}, 
errorOnUnresolvedName_{!context.AnyFatalError()}, messages_{context.messages()} {} // Default action for a parse tree node is to visit children. @@ -42,6 +42,7 @@ public: void Post(parser::Name &); void Post(parser::SpecificationPart &); bool Pre(parser::ExecutionPart &); + bool Pre(parser::ActionStmt &); void Post(parser::ReadStmt &); void Post(parser::WriteStmt &); @@ -66,6 +67,7 @@ public: private: using stmtFuncType = parser::Statement>; + SemanticsContext &context_; bool errorOnUnresolvedName_{true}; parser::Messages &messages_; std::list stmtFuncsToConvert_; @@ -130,6 +132,29 @@ bool RewriteMutator::Pre(parser::ExecutionPart &x) { return true; } +// Rewrite PRINT NML -> WRITE(*,NML=NML) +bool RewriteMutator::Pre(parser::ActionStmt &x) { + if (auto *print{std::get_if>(&x.u)}; + print && + std::get>(print->value().t).empty()) { + auto &format{std::get(print->value().t)}; + if (std::holds_alternative(format.u)) { + if (auto *name{parser::Unwrap(format)}; name && + name->symbol && name->symbol->GetUltimate().has() && + context_.IsEnabled(common::LanguageFeature::PrintNamelist)) { + context_.Warn(common::LanguageFeature::PrintNamelist, name->source, + "nonstandard: namelist in PRINT statement"_port_en_US); + std::list controls; + controls.emplace_back(std::move(*name)); + x.u = common::Indirection::Make( + parser::IoUnit{parser::Star{}}, std::optional{}, + std::move(controls), std::list{}); + } + } + } + return true; +} + // When a namelist group name appears (without NML=) in a READ or WRITE // statement in such a way that it can be misparsed as a format expression, // rewrite the I/O statement's parse tree node as if the namelist group diff --git a/flang/runtime/Float128Math/math-entries.h b/flang/runtime/Float128Math/math-entries.h index 90a983b787f537297023f91ca2aac3c0fe4143b7..4600c726d72825520972851c8a3451b895f15565 100644 --- a/flang/runtime/Float128Math/math-entries.h +++ b/flang/runtime/Float128Math/math-entries.h @@ -187,9 +187,6 @@ DEFINE_SIMPLE_ALIAS(Hypot, std::hypot) DEFINE_SIMPLE_ALIAS(Ilogb, std::ilogb) DEFINE_SIMPLE_ALIAS(Isinf, std::isinf) DEFINE_SIMPLE_ALIAS(Isnan, std::isnan) -DEFINE_SIMPLE_ALIAS(J0, j0l) -DEFINE_SIMPLE_ALIAS(J1, j1l) -DEFINE_SIMPLE_ALIAS(Jn, jnl) DEFINE_SIMPLE_ALIAS(Ldexp, std::ldexp) DEFINE_SIMPLE_ALIAS(Lgamma, std::lgamma) DEFINE_SIMPLE_ALIAS(Llround, std::llround) @@ -207,9 +204,15 @@ DEFINE_SIMPLE_ALIAS(Tan, std::tan) DEFINE_SIMPLE_ALIAS(Tanh, std::tanh) DEFINE_SIMPLE_ALIAS(Tgamma, std::tgamma) DEFINE_SIMPLE_ALIAS(Trunc, std::trunc) + +#if defined(__GLIBC__) && defined(_GNU_SOURCE) +DEFINE_SIMPLE_ALIAS(J0, j0l) +DEFINE_SIMPLE_ALIAS(J1, j1l) +DEFINE_SIMPLE_ALIAS(Jn, jnl) DEFINE_SIMPLE_ALIAS(Y0, y0l) DEFINE_SIMPLE_ALIAS(Y1, y1l) DEFINE_SIMPLE_ALIAS(Yn, ynl) +#endif // Use numeric_limits to produce infinity of the right type. #define F128_RT_INFINITY \ diff --git a/flang/runtime/numeric-templates.h b/flang/runtime/numeric-templates.h index 0b00bbb94ddd2172baccb9cb7f92c631af0cde58..fbb371bffc27a4600fea333d7f278f487664d47c 100644 --- a/flang/runtime/numeric-templates.h +++ b/flang/runtime/numeric-templates.h @@ -21,6 +21,7 @@ #include "terminator.h" #include "tools.h" #include "flang/Common/api-attrs.h" +#include "flang/Common/erfc-scaled.h" #include "flang/Common/float128.h" #include #include @@ -362,106 +363,7 @@ template inline RT_API_ATTRS T Spacing(T x) { // ERFC_SCALED (16.9.71) template inline RT_API_ATTRS T ErfcScaled(T arg) { - // Coefficients for approximation to erfc in the first interval. 
- static const T a[5] = {3.16112374387056560e00, 1.13864154151050156e02, - 3.77485237685302021e02, 3.20937758913846947e03, 1.85777706184603153e-1}; - static const T b[4] = {2.36012909523441209e01, 2.44024637934444173e02, - 1.28261652607737228e03, 2.84423683343917062e03}; - - // Coefficients for approximation to erfc in the second interval. - static const T c[9] = {5.64188496988670089e-1, 8.88314979438837594e00, - 6.61191906371416295e01, 2.98635138197400131e02, 8.81952221241769090e02, - 1.71204761263407058e03, 2.05107837782607147e03, 1.23033935479799725e03, - 2.15311535474403846e-8}; - static const T d[8] = {1.57449261107098347e01, 1.17693950891312499e02, - 5.37181101862009858e02, 1.62138957456669019e03, 3.29079923573345963e03, - 4.36261909014324716e03, 3.43936767414372164e03, 1.23033935480374942e03}; - - // Coefficients for approximation to erfc in the third interval. - static const T p[6] = {3.05326634961232344e-1, 3.60344899949804439e-1, - 1.25781726111229246e-1, 1.60837851487422766e-2, 6.58749161529837803e-4, - 1.63153871373020978e-2}; - static const T q[5] = {2.56852019228982242e00, 1.87295284992346047e00, - 5.27905102951428412e-1, 6.05183413124413191e-2, 2.33520497626869185e-3}; - - constexpr T sqrtpi{1.7724538509078120380404576221783883301349L}; - constexpr T rsqrtpi{0.5641895835477562869480794515607725858440L}; - constexpr T epsilonby2{std::numeric_limits::epsilon() * 0.5}; - constexpr T xneg{-26.628e0}; - constexpr T xhuge{6.71e7}; - constexpr T thresh{0.46875e0}; - constexpr T zero{0.0}; - constexpr T one{1.0}; - constexpr T four{4.0}; - constexpr T sixteen{16.0}; - constexpr T xmax{1.0 / (sqrtpi * std::numeric_limits::min())}; - static_assert(xmax > xhuge, "xmax must be greater than xhuge"); - - T ysq; - T xnum; - T xden; - T del; - T result; - - auto x{arg}; - auto y{std::fabs(x)}; - - if (y <= thresh) { - // evaluate erf for |x| <= 0.46875 - ysq = zero; - if (y > epsilonby2) { - ysq = y * y; - } - xnum = a[4] * ysq; - xden = ysq; - for (int i{0}; i < 3; i++) { - xnum = (xnum + a[i]) * ysq; - xden = (xden + b[i]) * ysq; - } - result = x * (xnum + a[3]) / (xden + b[3]); - result = one - result; - result = std::exp(ysq) * result; - return result; - } else if (y <= four) { - // evaluate erfc for 0.46875 < |x| <= 4.0 - xnum = c[8] * y; - xden = y; - for (int i{0}; i < 7; ++i) { - xnum = (xnum + c[i]) * y; - xden = (xden + d[i]) * y; - } - result = (xnum + c[7]) / (xden + d[7]); - } else { - // evaluate erfc for |x| > 4.0 - result = zero; - if (y >= xhuge) { - if (y < xmax) { - result = rsqrtpi / y; - } - } else { - ysq = one / (y * y); - xnum = p[5] * ysq; - xden = ysq; - for (int i{0}; i < 4; ++i) { - xnum = (xnum + p[i]) * ysq; - xden = (xden + q[i]) * ysq; - } - result = ysq * (xnum + p[4]) / (xden + q[4]); - result = (rsqrtpi - result) / y; - } - } - // fix up for negative argument, erf, etc. 
- if (x < zero) { - if (x < xneg) { - result = std::numeric_limits::max(); - } else { - ysq = trunc(x * sixteen) / sixteen; - del = (x - ysq) * (x + ysq); - y = std::exp((ysq * ysq)) * std::exp((del)); - result = (y + y) - result; - } - } - return result; + return common::ErfcScaled(arg); } } // namespace Fortran::runtime diff --git a/flang/test/Driver/atomic.f90 b/flang/test/Driver/atomic.f90 new file mode 100644 index 0000000000000000000000000000000000000000..0fb3b428f694c1fcb6af2045ff5377cfca8cd0ff --- /dev/null +++ b/flang/test/Driver/atomic.f90 @@ -0,0 +1,5 @@ +!RUN: %flang --target=aarch64-unknown-linux-gnu -fuse-ld=ld -fopenmp -rtlib=libgcc -### %s 2>&1 | FileCheck --check-prefixes=GCC %s +!RUN: %flang --target=aarch64-unknown-linux-gnu -fuse-ld=ld -fopenmp -rtlib=compiler-rt -### %s 2>&1 | FileCheck --check-prefixes=CRT %s + +!GCC: -latomic +!CRT-NOT: -latomic diff --git a/flang/test/Driver/msvc-dependent-lib-flags.f90 b/flang/test/Driver/msvc-dependent-lib-flags.f90 index 765917f07d8e72355166073dd19c2d1d6aed6200..1b7ecb604ad67d368dbd88da6ac00b628b3fd8ab 100644 --- a/flang/test/Driver/msvc-dependent-lib-flags.f90 +++ b/flang/test/Driver/msvc-dependent-lib-flags.f90 @@ -1,36 +1,36 @@ -! RUN: %flang -### --target=aarch64-windows-msvc -resource-dir=%S/Inputs/resource_dir %S/Inputs/hello.f90 -v 2>&1 | FileCheck %s --check-prefixes=MSVC -! RUN: %flang -### --target=aarch64-windows-msvc -resource-dir=%S/Inputs/resource_dir -fms-runtime-lib=static_dbg %S/Inputs/hello.f90 -v 2>&1 | FileCheck %s --check-prefixes=MSVC-DEBUG -! RUN: %flang -### --target=aarch64-windows-msvc -resource-dir=%S/Inputs/resource_dir -fms-runtime-lib=dll %S/Inputs/hello.f90 -v 2>&1 | FileCheck %s --check-prefixes=MSVC-DLL -! RUN: %flang -### --target=aarch64-windows-msvc -resource-dir=%S/Inputs/resource_dir -fms-runtime-lib=dll_dbg %S/Inputs/hello.f90 -v 2>&1 | FileCheck %s --check-prefixes=MSVC-DLL-DEBUG - -! MSVC: -fc1 -! MSVC-SAME: --dependent-lib=clang_rt.builtins.lib -! MSVC-SAME: -D_MT -! MSVC-SAME: --dependent-lib=libcmt -! MSVC-SAME: --dependent-lib=FortranRuntime.static.lib -! MSVC-SAME: --dependent-lib=FortranDecimal.static.lib - -! MSVC-DEBUG: -fc1 -! MSVC-DEBUG-SAME: --dependent-lib=clang_rt.builtins.lib -! MSVC-DEBUG-SAME: -D_MT -! MSVC-DEBUG-SAME: -D_DEBUG -! MSVC-DEBUG-SAME: --dependent-lib=libcmtd -! MSVC-DEBUG-SAME: --dependent-lib=FortranRuntime.static_dbg.lib -! MSVC-DEBUG-SAME: --dependent-lib=FortranDecimal.static_dbg.lib - -! MSVC-DLL: -fc1 -! MSVC-DLL-SAME: --dependent-lib=clang_rt.builtins.lib -! MSVC-DLL-SAME: -D_MT -! MSVC-DLL-SAME: -D_DLL -! MSVC-DLL-SAME: --dependent-lib=msvcrt -! MSVC-DLL-SAME: --dependent-lib=FortranRuntime.dynamic.lib -! MSVC-DLL-SAME: --dependent-lib=FortranDecimal.dynamic.lib - -! MSVC-DLL-DEBUG: -fc1 -! MSVC-DLL-DEBUG-SAME: --dependent-lib=clang_rt.builtins.lib -! MSVC-DLL-DEBUG-SAME: -D_MT -! MSVC-DLL-DEBUG-SAME: -D_DEBUG -! MSVC-DLL-DEBUG-SAME: -D_DLL -! MSVC-DLL-DEBUG-SAME: --dependent-lib=msvcrtd -! MSVC-DLL-DEBUG-SAME: --dependent-lib=FortranRuntime.dynamic_dbg.lib -! MSVC-DLL-DEBUG-SAME: --dependent-lib=FortranDecimal.dynamic_dbg.lib +! RUN: %flang -### --target=aarch64-windows-msvc -resource-dir=%S/Inputs/resource_dir %S/Inputs/hello.f90 -v 2>&1 | FileCheck %s --check-prefixes=MSVC +! RUN: %flang -### --target=aarch64-windows-msvc -resource-dir=%S/Inputs/resource_dir -fms-runtime-lib=static_dbg %S/Inputs/hello.f90 -v 2>&1 | FileCheck %s --check-prefixes=MSVC-DEBUG +! 
RUN: %flang -### --target=aarch64-windows-msvc -resource-dir=%S/Inputs/resource_dir -fms-runtime-lib=dll %S/Inputs/hello.f90 -v 2>&1 | FileCheck %s --check-prefixes=MSVC-DLL +! RUN: %flang -### --target=aarch64-windows-msvc -resource-dir=%S/Inputs/resource_dir -fms-runtime-lib=dll_dbg %S/Inputs/hello.f90 -v 2>&1 | FileCheck %s --check-prefixes=MSVC-DLL-DEBUG + +! MSVC: -fc1 +! MSVC-SAME: --dependent-lib=clang_rt.builtins.lib +! MSVC-SAME: -D_MT +! MSVC-SAME: --dependent-lib=libcmt +! MSVC-SAME: --dependent-lib=FortranRuntime.static.lib +! MSVC-SAME: --dependent-lib=FortranDecimal.static.lib + +! MSVC-DEBUG: -fc1 +! MSVC-DEBUG-SAME: --dependent-lib=clang_rt.builtins.lib +! MSVC-DEBUG-SAME: -D_MT +! MSVC-DEBUG-SAME: -D_DEBUG +! MSVC-DEBUG-SAME: --dependent-lib=libcmtd +! MSVC-DEBUG-SAME: --dependent-lib=FortranRuntime.static_dbg.lib +! MSVC-DEBUG-SAME: --dependent-lib=FortranDecimal.static_dbg.lib + +! MSVC-DLL: -fc1 +! MSVC-DLL-SAME: --dependent-lib=clang_rt.builtins.lib +! MSVC-DLL-SAME: -D_MT +! MSVC-DLL-SAME: -D_DLL +! MSVC-DLL-SAME: --dependent-lib=msvcrt +! MSVC-DLL-SAME: --dependent-lib=FortranRuntime.dynamic.lib +! MSVC-DLL-SAME: --dependent-lib=FortranDecimal.dynamic.lib + +! MSVC-DLL-DEBUG: -fc1 +! MSVC-DLL-DEBUG-SAME: --dependent-lib=clang_rt.builtins.lib +! MSVC-DLL-DEBUG-SAME: -D_MT +! MSVC-DLL-DEBUG-SAME: -D_DEBUG +! MSVC-DLL-DEBUG-SAME: -D_DLL +! MSVC-DLL-DEBUG-SAME: --dependent-lib=msvcrtd +! MSVC-DLL-DEBUG-SAME: --dependent-lib=FortranRuntime.dynamic_dbg.lib +! MSVC-DLL-DEBUG-SAME: --dependent-lib=FortranDecimal.dynamic_dbg.lib diff --git a/flang/test/Evaluate/fold-erfc-scaled.f90 b/flang/test/Evaluate/fold-erfc-scaled.f90 new file mode 100644 index 0000000000000000000000000000000000000000..b38cd0157d0bad5b7a134f858cb08c1cb1eb1147 --- /dev/null +++ b/flang/test/Evaluate/fold-erfc-scaled.f90 @@ -0,0 +1,7 @@ +! 
RUN: %python %S/test_folding.py %s %flang_fc1 +module m + real(4), parameter :: x20_4 = erfc_scaled(20._4) + logical, parameter :: t20_4 = x20_4 == 0.02817435003817081451416015625_4 + real(8), parameter :: x20_8 = erfc_scaled(20._8) + logical, parameter :: t20_8 = x20_8 == 0.0281743487410513193669459042212110944092273712158203125_8 +end diff --git a/flang/test/Fir/CUDA/cuda-register-func.fir b/flang/test/Fir/CUDA/cuda-register-func.fir new file mode 100644 index 0000000000000000000000000000000000000000..277475f0883dcc68ab68c85eea6cf82dac38f593 --- /dev/null +++ b/flang/test/Fir/CUDA/cuda-register-func.fir @@ -0,0 +1,16 @@ +// RUN: fir-opt --cuf-add-constructor %s | FileCheck %s + +module attributes {gpu.container_module} { + gpu.module @cuda_device_mod { + gpu.func @_QPsub_device1() kernel { + gpu.return + } + gpu.func @_QPsub_device2(%arg0: !fir.ref) kernel { + gpu.return + } + } +} + +// CHECK-LABEL: llvm.func internal @__cudaFortranConstructor() +// CHECK: cuf.register_kernel @cuda_device_mod::@_QPsub_device1 +// CHECK: cuf.register_kernel @cuda_device_mod::@_QPsub_device2 diff --git a/flang/test/Fir/cuf-invalid.fir b/flang/test/Fir/cuf-invalid.fir index e9aeaa281e2a85b9b686b50e0281be2b7f74d9fe..8a1eb48576832c70a20fddcdacb45b3cbca731b9 100644 --- a/flang/test/Fir/cuf-invalid.fir +++ b/flang/test/Fir/cuf-invalid.fir @@ -125,3 +125,68 @@ func.func @_QPsub1(%arg0: !fir.ref> {cuf.data_attr = #cuf.cuda cuf.data_transfer %20#0 to %11#0, %19 : !fir.shape<1> {transfer_kind = #cuf.cuda_transfer} : !fir.box>, !fir.box> return } + +// ----- + +module attributes {gpu.container_module} { + gpu.module @cuda_device_mod { + gpu.func @_QPsub_device1() { + gpu.return + } + } + llvm.func internal @__cudaFortranConstructor() { + // expected-error@+1{{'cuf.register_kernel' op only kernel gpu.func can be registered}} + cuf.register_kernel @cuda_device_mod::@_QPsub_device1 + llvm.return + } +} + +// ----- + +module attributes {gpu.container_module} { + gpu.module @cuda_device_mod { + gpu.func @_QPsub_device1() { + gpu.return + } + } + llvm.func internal @__cudaFortranConstructor() { + // expected-error@+1{{'cuf.register_kernel' op device function not found}} + cuf.register_kernel @cuda_device_mod::@_QPsub_device2 + llvm.return + } +} + +// ----- + +module attributes {gpu.container_module} { + llvm.func internal @__cudaFortranConstructor() { + // expected-error@+1{{'cuf.register_kernel' op gpu module not found}} + cuf.register_kernel @cuda_device_mod::@_QPsub_device1 + llvm.return + } +} + +// ----- + +module attributes {gpu.container_module} { + llvm.func internal @__cudaFortranConstructor() { + // expected-error@+1{{'cuf.register_kernel' op expect a module and a kernel name}} + cuf.register_kernel @_QPsub_device1 + llvm.return + } +} + +// ----- + +module attributes {gpu.container_module} { + gpu.module @cuda_device_mod { + llvm.func @_QPsub_device1() { + llvm.return + } + } + llvm.func internal @__cudaFortranConstructor() { + // expected-error@+1{{'cuf.register_kernel' op only gpu.kernel llvm.func can be registered}} + cuf.register_kernel @cuda_device_mod::@_QPsub_device1 + llvm.return + } +} diff --git a/flang/test/Fir/optional.fir b/flang/test/Fir/optional.fir index 3b350d6fa941957c3a122367c0095d8c502796f5..bded8b5332a3002b501ee8758a5aecb201d19e63 100644 --- a/flang/test/Fir/optional.fir +++ b/flang/test/Fir/optional.fir @@ -47,7 +47,7 @@ func.func @foo3(%arg0: !fir.boxchar<1>) -> i1 { // CHECK-LABEL: @bar3 func.func @bar3() -> i1 { %0 = fir.absent !fir.boxchar<1> - // CHECK: call i1 @foo3(ptr null, i64 
undef)
+  // CHECK: call i1 @foo3(ptr null, i64 0)
   %1 = fir.call @foo3(%0) : (!fir.boxchar<1>) -> i1
   return %1 : i1
 }
diff --git a/flang/test/Integration/OpenMP/parallel-private-reduction-worstcase.f90 b/flang/test/Integration/OpenMP/parallel-private-reduction-worstcase.f90
new file mode 100644
index 0000000000000000000000000000000000000000..3aa5d0424639731bc9b7f7f888356d6c159a3e37
--- /dev/null
+++ b/flang/test/Integration/OpenMP/parallel-private-reduction-worstcase.f90
@@ -0,0 +1,262 @@
+! RUN: %flang_fc1 -fopenmp -emit-llvm %s -o - | FileCheck %s

+! Combinational testing of control flow graph and builder insertion points
+! in mlir-to-llvm conversion:
+! - mixing multiple delayed privatizations and multiple reductions
+! - multiple blocks in the private alloc region
+! - private alloc region has to read from the mold variable
+! - firstprivate
+! - multiple blocks in the private copy region
+! - multiple blocks in the reduction init region
+! - reduction init region has to read from the mold variable
+! - re-used omp.private ops
+! - re-used omp.reduction.declare ops
+! - unstructured code inside of the parallel region
+! - needs private dealloc region, and this has multiple blocks
+! - needs reduction cleanup region, and this has multiple blocks

+! This may belong in the MLIR tests, but what we are doing here is complex
+! enough that I find the kind of minimised MLIR code preferred by MLIR
+! reviewers hard to read without some Fortran here for reference. Nothing
+! like this would be generated by other upstream users of the MLIR OpenMP
+! dialect.

+subroutine worst_case(a, b, c, d)
+  real, allocatable :: a(:), b(:), c(:), d(:)
+  integer i

+  !$omp parallel firstprivate(a,b) reduction(+:c,d)
+  if (sum(a) == 1) stop 1
+  !$omp end parallel
+end subroutine

+! CHECK-LABEL: define internal void @worst_case_..omp_par
+! CHECK-NEXT:  omp.par.entry:
+! [reduction alloc regions inlined here]
+! CHECK:         br label %omp.private.latealloc

+! CHECK:       omp.private.latealloc:    ; preds = %omp.par.entry
+! CHECK-NEXT:    br label %omp.private.alloc5

+! CHECK:       omp.private.alloc5:       ; preds = %omp.private.latealloc
+! [begin private alloc for first var]
+! [read the length from the mold argument]
+! [if it is non-zero...]
+! CHECK:         br i1 {{.*}}, label %omp.private.alloc6, label %omp.private.alloc7

+! CHECK:       omp.private.alloc7:       ; preds = %omp.private.alloc5
+! [finish private alloc for first var with zero extent]
+! CHECK:         br label %omp.private.alloc8

+! CHECK:       omp.private.alloc8:       ; preds = %omp.private.alloc6, %omp.private.alloc7
+! CHECK-NEXT:    br label %omp.region.cont4

+! CHECK:       omp.region.cont4:         ; preds = %omp.private.alloc8
+! CHECK-NEXT:    %{{.*}} = phi ptr
+! CHECK-NEXT:    br label %omp.private.alloc

+! CHECK:       omp.private.alloc:        ; preds = %omp.region.cont4
+! [begin private alloc for first var]
+! [read the length from the mold argument]
+! [if it is non-zero...]
+! CHECK:         br i1 %{{.*}}, label %omp.private.alloc1, label %omp.private.alloc2

+! CHECK:       omp.private.alloc2:       ; preds = %omp.private.alloc
+! [finish private alloc for second var with zero extent]
+! CHECK:         br label %omp.private.alloc3

+! CHECK:       omp.private.alloc3:       ; preds = %omp.private.alloc1, %omp.private.alloc2
+! CHECK-NEXT:    br label %omp.region.cont

+! CHECK:       omp.region.cont:          ; preds = %omp.private.alloc3
+! CHECK-NEXT:    %{{.*}} = phi ptr
+! CHECK-NEXT:    br label %omp.private.copy

+! CHECK:       omp.private.copy:         ; preds = %omp.region.cont
+! CHECK-NEXT:    br label %omp.private.copy10

+! CHECK:       omp.private.copy10:       ; preds = %omp.private.copy
+! [begin firstprivate copy for first var]
+! [read the length, is it non-zero?]
+! CHECK:         br i1 %{{.*}}, label %omp.private.copy11, label %omp.private.copy12

+! CHECK:       omp.private.copy12:       ; preds = %omp.private.copy11, %omp.private.copy10
+! CHECK-NEXT:    br label %omp.region.cont9

+! CHECK:       omp.region.cont9:         ; preds = %omp.private.copy12
+! CHECK-NEXT:    %{{.*}} = phi ptr
+! CHECK-NEXT:    br label %omp.private.copy14

+! CHECK:       omp.private.copy14:       ; preds = %omp.region.cont9
+! [begin firstprivate copy for second var]
+! [read the length, is it non-zero?]
+! CHECK:         br i1 %{{.*}}, label %omp.private.copy15, label %omp.private.copy16

+! CHECK:       omp.private.copy16:       ; preds = %omp.private.copy15, %omp.private.copy14
+! CHECK-NEXT:    br label %omp.region.cont13

+! CHECK:       omp.region.cont13:        ; preds = %omp.private.copy16
+! CHECK-NEXT:    %{{.*}} = phi ptr
+! CHECK-NEXT:    br label %omp.reduction.init

+! CHECK:       omp.reduction.init:       ; preds = %omp.region.cont13
+! [deferred stores for results of reduction alloc regions]
+! CHECK:         br label %[[VAL_96:.*]]

+! CHECK:       omp.reduction.neutral:    ; preds = %omp.reduction.init
+! [start of reduction initialization region]
+! [null check:]
+! CHECK:         br i1 %{{.*}}, label %omp.reduction.neutral18, label %omp.reduction.neutral19

+! CHECK:       omp.reduction.neutral19:  ; preds = %omp.reduction.neutral
+! [malloc and assign the default value to the reduction variable]
+! CHECK:         br label %omp.reduction.neutral20

+! CHECK:       omp.reduction.neutral20:  ; preds = %omp.reduction.neutral18, %omp.reduction.neutral19
+! CHECK-NEXT:    br label %omp.region.cont17

+! CHECK:       omp.region.cont17:        ; preds = %omp.reduction.neutral20
+! CHECK-NEXT:    %{{.*}} = phi ptr
+! CHECK-NEXT:    br label %omp.reduction.neutral22

+! CHECK:       omp.reduction.neutral22:  ; preds = %omp.region.cont17
+! [start of reduction initialization region]
+! [null check:]
+! CHECK:         br i1 %{{.*}}, label %omp.reduction.neutral23, label %omp.reduction.neutral24

+! CHECK:       omp.reduction.neutral24:  ; preds = %omp.reduction.neutral22
+! [malloc and assign the default value to the reduction variable]
+! CHECK:         br label %omp.reduction.neutral25

+! CHECK:       omp.reduction.neutral25:  ; preds = %omp.reduction.neutral23, %omp.reduction.neutral24
+! CHECK-NEXT:    br label %omp.region.cont21

+! CHECK:       omp.region.cont21:        ; preds = %omp.reduction.neutral25
+! CHECK-NEXT:    %{{.*}} = phi ptr
+! CHECK-NEXT:    br label %omp.par.region

+! CHECK:       omp.par.region:           ; preds = %omp.region.cont21
+! CHECK-NEXT:    br label %omp.par.region27

+! CHECK:       omp.par.region27:         ; preds = %omp.par.region
+! [call SUM runtime function]
+! [if (sum(a) == 1)]
+! CHECK:         br i1 %{{.*}}, label %omp.par.region28, label %omp.par.region29

+! CHECK:       omp.par.region29:         ; preds = %omp.par.region27
+! CHECK-NEXT:    br label %omp.region.cont26

+! CHECK:       omp.region.cont26:        ; preds = %omp.par.region28, %omp.par.region29
+! [omp parallel region done, call into the runtime to complete reduction]
+! CHECK:         %[[VAL_233:.*]] = call i32 @__kmpc_reduce(
+! CHECK:         switch i32 %[[VAL_233]], label %reduce.finalize [
+! CHECK-NEXT:      i32 1, label %reduce.switch.nonatomic
+! CHECK-NEXT:      i32 2, label %reduce.switch.atomic
+! CHECK-NEXT:    ]

+! CHECK:       reduce.switch.atomic:     ; preds = %omp.region.cont26
+! CHECK-NEXT:    unreachable

+! CHECK:       reduce.switch.nonatomic:  ; preds = %omp.region.cont26
+! CHECK-NEXT:    %[[red_private_value_0:.*]] = load ptr, ptr %{{.*}}, align 8
+! CHECK-NEXT:    br label %omp.reduction.nonatomic.body

+!
[various blocks implementing the reduction] + +! CHECK: omp.region.cont35: ; preds = +! CHECK-NEXT: %{{.*}} = phi ptr +! CHECK-NEXT: call void @__kmpc_end_reduce( +! CHECK-NEXT: br label %reduce.finalize + +! CHECK: reduce.finalize: ; preds = +! CHECK-NEXT: br label %omp.par.pre_finalize + +! CHECK: omp.par.pre_finalize: ; preds = %reduce.finalize +! CHECK-NEXT: %{{.*}} = load ptr, ptr +! CHECK-NEXT: br label %omp.reduction.cleanup + +! CHECK: omp.reduction.cleanup: ; preds = %omp.par.pre_finalize +! [null check] +! CHECK: br i1 %{{.*}}, label %omp.reduction.cleanup41, label %omp.reduction.cleanup42 + +! CHECK: omp.reduction.cleanup42: ; preds = %omp.reduction.cleanup41, %omp.reduction.cleanup +! CHECK-NEXT: br label %omp.region.cont40 + +! CHECK: omp.region.cont40: ; preds = %omp.reduction.cleanup42 +! CHECK-NEXT: %{{.*}} = load ptr, ptr +! CHECK-NEXT: br label %omp.reduction.cleanup44 + +! CHECK: omp.reduction.cleanup44: ; preds = %omp.region.cont40 +! [null check] +! CHECK: br i1 %{{.*}}, label %omp.reduction.cleanup45, label %omp.reduction.cleanup46 + +! CHECK: omp.reduction.cleanup46: ; preds = %omp.reduction.cleanup45, %omp.reduction.cleanup44 +! CHECK-NEXT: br label %omp.region.cont43 + +! CHECK: omp.region.cont43: ; preds = %omp.reduction.cleanup46 +! CHECK-NEXT: br label %omp.private.dealloc + +! CHECK: omp.private.dealloc: ; preds = %omp.region.cont43 +! [null check] +! CHECK: br i1 %{{.*}}, label %omp.private.dealloc48, label %omp.private.dealloc49 + +! CHECK: omp.private.dealloc49: ; preds = %omp.private.dealloc48, %omp.private.dealloc +! CHECK-NEXT: br label %omp.region.cont47 + +! CHECK: omp.region.cont47: ; preds = %omp.private.dealloc49 +! CHECK-NEXT: br label %omp.private.dealloc51 + +! CHECK: omp.private.dealloc51: ; preds = %omp.region.cont47 +! [null check] +! CHECK: br i1 %{{.*}}, label %omp.private.dealloc52, label %omp.private.dealloc53 + +! CHECK: omp.private.dealloc53: ; preds = %omp.private.dealloc52, %omp.private.dealloc51 +! CHECK-NEXT: br label %omp.region.cont50 + +! CHECK: omp.region.cont50: ; preds = %omp.private.dealloc53 +! CHECK-NEXT: br label %omp.par.outlined.exit.exitStub + +! CHECK: omp.private.dealloc52: ; preds = %omp.private.dealloc51 +! [dealloc memory] +! CHECK: br label %omp.private.dealloc53 + +! CHECK: omp.private.dealloc48: ; preds = %omp.private.dealloc +! [dealloc memory] +! CHECK: br label %omp.private.dealloc49 + +! CHECK: omp.reduction.cleanup45: ; preds = %omp.reduction.cleanup44 +! CHECK-NEXT: call void @free( +! CHECK-NEXT: br label %omp.reduction.cleanup46 + +! CHECK: omp.reduction.cleanup41: ; preds = %omp.reduction.cleanup +! CHECK-NEXT: call void @free( +! CHECK-NEXT: br label %omp.reduction.cleanup42 + +! CHECK: omp.par.region28: ; preds = %omp.par.region27 +! CHECK-NEXT: call {} @_FortranAStopStatement + +! CHECK: omp.reduction.neutral23: ; preds = %omp.reduction.neutral22 +! [source length was zero: finish initializing array] +! CHECK: br label %omp.reduction.neutral25 + +! CHECK: omp.reduction.neutral18: ; preds = %omp.reduction.neutral +! [source length was zero: finish initializing array] +! CHECK: br label %omp.reduction.neutral20 + +! CHECK: omp.private.copy15: ; preds = %omp.private.copy14 +! [source length was non-zero: call assign runtime] +! CHECK: br label %omp.private.copy16 + +! CHECK: omp.private.copy11: ; preds = %omp.private.copy10 +! [source length was non-zero: call assign runtime] +! CHECK: br label %omp.private.copy12 + +! CHECK: omp.private.alloc1: ; preds = %omp.private.alloc +! 
[var extent was non-zero: malloc a private array] +! CHECK: br label %omp.private.alloc3 + +! CHECK: omp.private.alloc6: ; preds = %omp.private.alloc5 +! [var extent was non-zero: malloc a private array] +! CHECK: br label %omp.private.alloc8 + +! CHECK: omp.par.outlined.exit.exitStub: ; preds = %omp.region.cont50 +! CHECK-NEXT: ret void diff --git a/flang/test/Integration/OpenMP/private-global.f90 b/flang/test/Integration/OpenMP/private-global.f90 new file mode 100644 index 0000000000000000000000000000000000000000..62d0a3faf0c59384d483a9ec5dbb732208aec436 --- /dev/null +++ b/flang/test/Integration/OpenMP/private-global.f90 @@ -0,0 +1,46 @@ +!RUN: %flang_fc1 -emit-llvm -fopenmp %s -o - | FileCheck %s + +! Regression test for https://github.com/llvm/llvm-project/issues/106297 + +program bug + implicit none + integer :: table(10) + !$OMP PARALLEL PRIVATE(table) + table = 50 + if (any(table/=50)) then + stop 'fail 3' + end if + !$OMP END PARALLEL + print *,'ok' +End Program + + +! CHECK-LABEL: define internal void {{.*}}..omp_par( +! CHECK: omp.par.entry: +! CHECK: %[[VAL_9:.*]] = alloca i32, align 4 +! CHECK: %[[VAL_10:.*]] = load i32, ptr %[[VAL_11:.*]], align 4 +! CHECK: store i32 %[[VAL_10]], ptr %[[VAL_9]], align 4 +! CHECK: %[[VAL_12:.*]] = load i32, ptr %[[VAL_9]], align 4 +! CHECK: %[[PRIV_TABLE:.*]] = alloca [10 x i32], i64 1, align 4 +! ... +! check that we use the private copy of table for the assignment +! CHECK: omp.par.region1: +! CHECK: %[[ELEMENTAL_TMP:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, align 8 +! CHECK: %[[TABLE_BOX_ADDR:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, align 8 +! CHECK: %[[BOXED_FIFTY:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8 }, align 8 +! CHECK: %[[TABLE_BOX_ADDR2:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, i64 1, align 8 +! CHECK: %[[TABLE_BOX_VAL:.*]] = insertvalue { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] } { ptr undef, i64 ptrtoint (ptr getelementptr (i32, ptr null, i32 1) to i64), i32 20240719, i8 1, i8 9, i8 0, i8 0, [1 x [3 x i64]] {{\[\[}}3 x i64] [i64 1, i64 10, i64 ptrtoint (ptr getelementptr (i32, ptr null, i32 1) to i64)]] }, ptr %[[PRIV_TABLE]], 0 +! CHECK: store { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] } %[[TABLE_BOX_VAL]], ptr %[[TABLE_BOX_ADDR]], align 8 +! CHECK: %[[TABLE_BOX_VAL2:.*]] = load { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, ptr %[[TABLE_BOX_ADDR]], align 8 +! CHECK: store { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] } %[[TABLE_BOX_VAL2]], ptr %[[TABLE_BOX_ADDR2]], align 8 +! CHECK: %[[VAL_26:.*]] = call {} @_FortranAAssign(ptr %[[TABLE_BOX_ADDR2]], ptr %[[BOXED_FIFTY]], ptr @{{.*}}, i32 9) +! ... +! check that we use the private copy of table for table/=50 +! CHECK: omp.par.region3: +! CHECK: %[[VAL_44:.*]] = sub nsw i64 %{{.*}}, 1 +! CHECK: %[[VAL_45:.*]] = mul nsw i64 %[[VAL_44]], 1 +! CHECK: %[[VAL_46:.*]] = mul nsw i64 %[[VAL_45]], 1 +! CHECK: %[[VAL_47:.*]] = add nsw i64 %[[VAL_46]], 0 +! CHECK: %[[VAL_48:.*]] = getelementptr i32, ptr %[[PRIV_TABLE]], i64 %[[VAL_47]] +! CHECK: %[[VAL_49:.*]] = load i32, ptr %[[VAL_48]], align 4 +! CHECK: %[[VAL_50:.*]] = icmp ne i32 %[[VAL_49]], 50 diff --git a/flang/test/Lower/OpenMP/simd.f90 b/flang/test/Lower/OpenMP/simd.f90 index f574a1265e06c4a847999ca1f69125fcd6edd46d..d92f06cebfdbe5c4f6f496aca1ff35ca90bfcb6f 100644 --- a/flang/test/Lower/OpenMP/simd.f90 +++ b/flang/test/Lower/OpenMP/simd.f90 @@ -4,6 +4,8 @@ ! 
RUN: %flang_fc1 -flang-experimental-hlfir -emit-hlfir -fopenmp -fopenmp-version=50 %s -o - | FileCheck %s ! RUN: bbc -hlfir -emit-hlfir -fopenmp -fopenmp-version=50 %s -o - | FileCheck %s +!CHECK: omp.declare_reduction @[[REDUCER:.*]] : i32 + !CHECK-LABEL: func @_QPsimd() subroutine simd integer :: i @@ -273,3 +275,25 @@ subroutine lastprivate_with_simd sum = i + 1 end do end subroutine + +!CHECK-LABEL: func @_QPsimd_with_reduction_clause() +subroutine simd_with_reduction_clause + integer :: i, x + x = 0 + ! CHECK: %[[LB:.*]] = arith.constant 1 : i32 + ! CHECK-NEXT: %[[UB:.*]] = arith.constant 9 : i32 + ! CHECK-NEXT: %[[STEP:.*]] = arith.constant 1 : i32 + ! CHECK-NEXT: omp.simd reduction(@[[REDUCER]] %[[X:.*]]#0 -> %[[X_RED:.*]] : !fir.ref) { + ! CHECK-NEXT: omp.loop_nest (%[[I:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) { + !$omp simd reduction(+:x) + do i=1, 9 + ! CHECK: %[[X_DECL:.*]]:2 = hlfir.declare %[[X_RED]] {uniq_name = "_QFsimd_with_reduction_clauseEx"} : (!fir.ref) -> (!fir.ref, !fir.ref) + ! CHECK: fir.store %[[I]] to %[[LOCAL:.*]]#1 : !fir.ref + ! CHECK: %[[X_LD:.*]] = fir.load %[[X_DECL]]#0 : !fir.ref + ! CHECK: %[[I_LD:.*]] = fir.load %[[LOCAL]]#0 : !fir.ref + ! CHECK: %[[SUM:.*]] = arith.addi %[[X_LD]], %[[I_LD]] : i32 + ! CHECK: hlfir.assign %[[SUM]] to %[[X_DECL]]#0 : i32, !fir.ref + x = x+i + end do + !$OMP end simd +end subroutine diff --git a/flang/test/Parser/OpenMP/allocate-tree.f90 b/flang/test/Parser/OpenMP/allocate-tree.f90 index 9de257b00dc32fa81d698c3e85e4a239fb728fba..bf413d591baf23cbd3a91ebcb977f8582a86e776 100644 --- a/flang/test/Parser/OpenMP/allocate-tree.f90 +++ b/flang/test/Parser/OpenMP/allocate-tree.f90 @@ -18,6 +18,19 @@ program allocate_tree allocate(w, xarray(4), zarray(t, z)) end program allocate_tree +!CHECK: | | DeclarationConstruct -> SpecificationConstruct -> TypeDeclarationStmt +!CHECK-NEXT: | | | DeclarationTypeSpec -> IntrinsicTypeSpec -> IntegerTypeSpec -> +!CHECK-NEXT: | | | AttrSpec -> Allocatable +!CHECK-NEXT: | | | EntityDecl +!CHECK-NEXT: | | | | Name = 'w' +!CHECK-NEXT: | | | EntityDecl +!CHECK-NEXT: | | | | Name = 'xarray' +!CHECK-NEXT: | | | | ArraySpec -> DeferredShapeSpecList -> int = '1' +!CHECK-NEXT: | | | EntityDecl +!CHECK-NEXT: | | | | Name = 'zarray' +!CHECK-NEXT: | | | | ArraySpec -> DeferredShapeSpecList -> int = '2' + + !CHECK: | | ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPExecutableAllocate !CHECK-NEXT: | | | Verbatim !CHECK-NEXT: | | | OmpClauseList -> diff --git a/flang/test/Parser/continuation-in-conditional-compilation.f b/flang/test/Parser/continuation-in-conditional-compilation.f index 35eecbc0f16ea4515d279f2e10d66fa9ee0cf3f8..987112301e335c1311e2e4e0dbbd4da6e8ce9652 100644 --- a/flang/test/Parser/continuation-in-conditional-compilation.f +++ b/flang/test/Parser/continuation-in-conditional-compilation.f @@ -1,6 +1,6 @@ ! RUN: %flang_fc1 -fopenmp -fopenacc -E %s 2>&1 | FileCheck %s program main -! CHECK: k01=1+1 +! CHECK: k01=1+ 1 k01=1+ !$ & 1 diff --git a/flang/test/Preprocessing/pp029.F b/flang/test/Preprocessing/pp029.F index 4ca87dd20f157a595edea38cd3c2b4072c653259..1f8533ab08cd6c95fbb3efd1cad7d2c3616d6609 100644 --- a/flang/test/Preprocessing/pp029.F +++ b/flang/test/Preprocessing/pp029.F @@ -1,5 +1,5 @@ ! RUN: %flang -E %s 2>&1 | FileCheck %s -! CHECK: if (777 .eq. 777) then +! CHECK: if (77 7.eq. 
777) then * \ newline allowed in #define integer, parameter :: KWM = 666 #define KWM 77\ diff --git a/flang/test/Preprocessing/pp031.F b/flang/test/Preprocessing/pp031.F index 4813c40208a9fed20e45d7031fd59cf988a5dd77..3ad0bde9e50c090f10d68ab5857bfdeccdf3b591 100644 --- a/flang/test/Preprocessing/pp031.F +++ b/flang/test/Preprocessing/pp031.F @@ -1,6 +1,6 @@ ! RUN: %flang -E %s 2>&1 | FileCheck %s -! CHECK: if (777//Ccomment.eq.777)then -! CHECK: print *, 'pp031.F no: ', 777//Ccomment +! CHECK: if (777 // C comment.eq. 777) then +! CHECK: print *, 'pp031.F no: ', 777 // C comment * // C++ comment NOT erased from #define integer, parameter :: KWM = 666 #define KWM 777 // C comment diff --git a/flang/test/Preprocessing/pp041.F b/flang/test/Preprocessing/pp041.F index 3f1f3c6a2aeba210e48027c9475221693072270d..cee3c5d3e490b55efd7ed21be1f8bf64fc8ad866 100644 --- a/flang/test/Preprocessing/pp041.F +++ b/flang/test/Preprocessing/pp041.F @@ -1,5 +1,5 @@ ! RUN: %flang -E %s 2>&1 | FileCheck %s -! CHECK: j = 666WMj=j+1WM211 +! CHECK: j = 666WMj= j+ 1WM211 * use KWM expansion as continuation indicators #define KWM 0 #define KWM2 1 diff --git a/flang/test/Preprocessing/renaming.F b/flang/test/Preprocessing/renaming.F index 1bef18116901e78b0aab420e0536e98c243da043..c39ab6fb029a05f8cb588ac02153be39bafb8f7f 100644 --- a/flang/test/Preprocessing/renaming.F +++ b/flang/test/Preprocessing/renaming.F @@ -1,5 +1,5 @@ ! RUN: %flang -E %s | FileCheck %s -! CHECK: ((1)*10000+(11)*100) +! CHECK: ((1) * 10000 + (11) * 100) ! Ensure that a keyword-like macro can be used to rename a ! function-like macro. #define TO_VERSION2(MAJOR, MINOR) ((MAJOR) * 10000 + (MINOR) * 100) diff --git a/flang/test/Semantics/OpenMP/clause-validity01.f90 b/flang/test/Semantics/OpenMP/clause-validity01.f90 index 24540492e732714bd74894b6bc502827858eef5a..1a7a57b124e9bda42a58c8e7f37a6f111f2e3b38 100644 --- a/flang/test/Semantics/OpenMP/clause-validity01.f90 +++ b/flang/test/Semantics/OpenMP/clause-validity01.f90 @@ -390,6 +390,12 @@ use omp_lib enddo !$omp end parallel + !ERROR: The `SAFELEN` clause cannot appear in the `SIMD` directive with `ORDER(CONCURRENT)` clause + !$omp simd order(concurrent) safelen(1+2) + do i = 1, N + a = 3.14 + enddo + ! 2.11.1 parallel-do-clause -> parallel-clause | ! do-clause diff --git a/flang/test/Semantics/OpenMP/declarative-directive.f90 b/flang/test/Semantics/OpenMP/declarative-directive01.f90 similarity index 100% rename from flang/test/Semantics/OpenMP/declarative-directive.f90 rename to flang/test/Semantics/OpenMP/declarative-directive01.f90 diff --git a/flang/test/Semantics/OpenMP/declarative-directive02.f90 b/flang/test/Semantics/OpenMP/declarative-directive02.f90 new file mode 100644 index 0000000000000000000000000000000000000000..dcde963689eb0dd1b9bd35e95c6c840be05312f5 --- /dev/null +++ b/flang/test/Semantics/OpenMP/declarative-directive02.f90 @@ -0,0 +1,56 @@ +! RUN: %flang -fsyntax-only -fopenmp %s 2>&1 + +! Check that OpenMP declarative directives can be used with objects that have +! an incomplete type. + +subroutine test_decl + ! OMPv5.2 5.2 threadprivate + ! OMPv5.2 6.5 allocate + implicit none + save :: x1, y1 + !$omp threadprivate(x1) + !$omp allocate(y1) + integer :: x1, y1 + + ! OMPv5.2 7.7 declare-simd + external :: simd_func + !$omp declare simd(simd_func) + logical :: simd_func + + ! OMPv5.2 7.8.1 declare-target + allocatable :: j + !$omp declare target(j) + save :: j + real(kind=8) :: j(:) + + ! 
OMPv5.2 5.5.11 declare-reduction - crashes + !external :: my_add_red + !!$omp declare reduction(my_add_red : integer : my_add_red(omp_out, omp_in)) & + !!$omp& initializer(omp_priv=0) + !integer :: my_add_red +end subroutine + +subroutine test_decl2 + save x1, y1 + !$omp threadprivate(x1) + !$omp allocate(y1) + integer :: x1, y1 + + ! implicit decl + !$omp threadprivate(x2) + !$omp allocate(y2) + save x2, y2 +end subroutine + +module m1 + ! implicit decl + !$omp threadprivate(x, y, z) + integer :: y + real :: z + +contains + subroutine sub + !$omp parallel copyin(x, y, z) + !$omp end parallel + end subroutine +end module diff --git a/flang/test/Semantics/OpenMP/declare-target06.f90 b/flang/test/Semantics/OpenMP/declare-target06.f90 index 9abcfcecb681ab51a93d8156593af22c5c3ef58a..7df0a73123094b09c8dc88a7dbe5797d79715d1d 100644 --- a/flang/test/Semantics/OpenMP/declare-target06.f90 +++ b/flang/test/Semantics/OpenMP/declare-target06.f90 @@ -6,21 +6,16 @@ module test_0 implicit none -!ERROR: The given DECLARE TARGET directive clause has an invalid argument !ERROR: No explicit type declared for 'no_implicit_materialization_1' !$omp declare target(no_implicit_materialization_1) -!ERROR: The given DECLARE TARGET directive clause has an invalid argument !ERROR: No explicit type declared for 'no_implicit_materialization_2' !$omp declare target link(no_implicit_materialization_2) -!ERROR: The given DECLARE TARGET directive clause has an invalid argument !WARNING: The usage of TO clause on DECLARE TARGET directive has been deprecated. Use ENTER clause instead. !ERROR: No explicit type declared for 'no_implicit_materialization_3' !$omp declare target to(no_implicit_materialization_3) -!ERROR: The given DECLARE TARGET directive clause has an invalid argument -!ERROR: No explicit type declared for 'no_implicit_materialization_3' !$omp declare target enter(no_implicit_materialization_3) INTEGER :: data_int = 10 diff --git a/flang/test/Semantics/OpenMP/do-collapse.f90 b/flang/test/Semantics/OpenMP/do-collapse.f90 index 4f2512937ace4e00e92562e048d4461f0fd7256f..480bd45b79b8391b1b0ce597112bb91cea9ef039 100644 --- a/flang/test/Semantics/OpenMP/do-collapse.f90 +++ b/flang/test/Semantics/OpenMP/do-collapse.f90 @@ -30,5 +30,11 @@ program omp_doCollapse do end do end do -end program omp_doCollapse + !ERROR: At most one COLLAPSE clause can appear on the SIMD directive + !$omp simd collapse(2) collapse(1) + do i = 1, 4 + j = j + i + 1 + end do + !$omp end simd +end program omp_doCollapse diff --git a/flang/test/Semantics/OpenMP/loop-association.f90 b/flang/test/Semantics/OpenMP/loop-association.f90 index d2167663c5ddeae37e5695fb0c350cc5e75331b8..9fac508e6128a7dfe589e1d8c7ab40955c7d309c 100644 --- a/flang/test/Semantics/OpenMP/loop-association.f90 +++ b/flang/test/Semantics/OpenMP/loop-association.f90 @@ -131,4 +131,10 @@ !$omp end parallel do simd !ERROR: The END PARALLEL DO SIMD directive must follow the DO loop associated with the loop construct !$omp end parallel do simd + + !ERROR: A DO loop must follow the SIMD directive + !$omp simd + a = i + 1 + !ERROR: The END SIMD directive must follow the DO loop associated with the loop construct + !$omp end simd end diff --git a/flang/test/Semantics/OpenMP/workshare02.f90 b/flang/test/Semantics/OpenMP/workshare02.f90 index 11f33d63a3eb800b51900d468f21b149b9ef1f1e..dddaa354fff9faa89bdba20fe1b8fb2a2417c072 100644 --- a/flang/test/Semantics/OpenMP/workshare02.f90 +++ b/flang/test/Semantics/OpenMP/workshare02.f90 @@ -9,6 +9,14 @@ module my_mod integer function my_func() 
my_func = 10 end function my_func + + impure integer function impure_my_func() + impure_my_func = 20 + end function impure_my_func + + impure elemental integer function impure_ele_my_func() + impure_ele_my_func = 20 + end function impure_ele_my_func end module my_mod subroutine workshare(aa, bb, cc, dd, ee, ff, n) @@ -61,6 +69,16 @@ subroutine workshare(aa, bb, cc, dd, ee, ff, n) j = j - my_func() !$omp end atomic + !ERROR: User defined IMPURE, non-ELEMENTAL function 'impure_my_func' is not allowed in a WORKSHARE construct + cc = impure_my_func() + !ERROR: User defined IMPURE function 'impure_ele_my_func' is not allowed in a WORKSHARE construct + aa(1) = impure_ele_my_func() + !$omp end workshare + !$omp workshare + j = j + 1 + !ERROR: At most one NOWAIT clause can appear on the END WORKSHARE directive + !$omp end workshare nowait nowait + end subroutine workshare diff --git a/flang/test/Semantics/c_f_pointer.f90 b/flang/test/Semantics/c_f_pointer.f90 index c2529201ee26597558e9280986bdeb4d85b496de..0cd0161b1fb0064ddce3b2368a90197a25bbaf75 100644 --- a/flang/test/Semantics/c_f_pointer.f90 +++ b/flang/test/Semantics/c_f_pointer.f90 @@ -18,6 +18,7 @@ program test end type type(notBindCType), pointer :: notBindC character(2), pointer :: c2ptr + character(1,4), pointer :: unicodePtr rankTwoArray = reshape([1, 2, 3, 4], shape(rankTwoArray)) call c_f_pointer(scalarC, scalarIntF) ! ok call c_f_pointer(scalarC, arrayIntF, [1_8]) ! ok @@ -48,6 +49,8 @@ program test call c_f_pointer(scalarC, unlimited) !WARNING: FPTR= argument to C_F_POINTER() should not have a derived type that is not BIND(C) call c_f_pointer(scalarC, notBindC) - !WARNING: FPTR= argument to C_F_POINTER() should not have the non-interoperable intrinsic type CHARACTER(KIND=1,LEN=2_8) + !WARNING: FPTR= argument to C_F_POINTER() should not have the non-interoperable character length CHARACTER(KIND=1,LEN=2_8) call c_f_pointer(scalarC, c2ptr) + !WARNING: FPTR= argument to C_F_POINTER() should not have the non-interoperable intrinsic type or kind CHARACTER(KIND=4,LEN=1_8) + call c_f_pointer(scalarC, unicodePtr) end program diff --git a/flang/test/Semantics/c_loc01.f90 b/flang/test/Semantics/c_loc01.f90 index 9155ff4f47354df66e3584b6d76cb8cd5ac68d49..abae1e263e2e216b4a2b0b9d6189853ddf5df85f 100644 --- a/flang/test/Semantics/c_loc01.f90 +++ b/flang/test/Semantics/c_loc01.f90 @@ -21,6 +21,7 @@ module m type(hasLen(*)), target :: nclen integer, intent(in) :: n character(2), target :: ch + character(1,4), target :: unicode real :: arr1(purefun1(c_loc(targ))) ! ok real :: arr2(purefun2(c_funloc(subr))) ! ok character(:), allocatable, target :: deferred @@ -40,8 +41,10 @@ module m cp = c_loc(nclen) !ERROR: C_LOC() argument may not be zero-length character cp = c_loc(ch(2:1)) - !WARNING: C_LOC() argument has non-interoperable intrinsic type, kind, or length + !WARNING: C_LOC() argument has non-interoperable character length cp = c_loc(ch) + !WARNING: C_LOC() argument has non-interoperable intrinsic type or kind + cp = c_loc(unicode) cp = c_loc(ch(1:1)) ! ok cp = c_loc(deferred) ! ok cp = c_loc(p2ch) ! 
ok diff --git a/flang/test/Semantics/rewrite02.f90 b/flang/test/Semantics/rewrite02.f90 new file mode 100644 index 0000000000000000000000000000000000000000..2393498e65d2919d534b3dd830bd7bcf4554ace5 --- /dev/null +++ b/flang/test/Semantics/rewrite02.f90 @@ -0,0 +1,8 @@ +!RUN: %flang_fc1 -fdebug-unparse -pedantic %s 2>&1 | FileCheck %s +!Test rewrite of "PRINT namelistname" into "WRITE(*,NML=namelistname)" +!CHECK: nonstandard: namelist in PRINT statement +namelist /nml/x +x = 123. +!CHECK: WRITE (*, NML=nml) +print nml +end diff --git a/flang/test/Semantics/smp-def01.f90 b/flang/test/Semantics/smp-def01.f90 new file mode 100644 index 0000000000000000000000000000000000000000..7169bba4509990f0b4519b286eef71da7b438485 --- /dev/null +++ b/flang/test/Semantics/smp-def01.f90 @@ -0,0 +1,23 @@ +!RUN: %flang -fsyntax-only %s 2>&1 | FileCheck --allow-empty %s +!Ensure no bogus error message about incompatible character length +!CHECK-NOT: error + +module m1 + integer :: n = 1 +end + +module m2 + interface + module subroutine s(a,b) + use m1 + character(n) :: a + character(n) :: b + end + end interface +end + +submodule(m2) m2s1 + contains + module procedure s + end +end diff --git a/flang/tools/fir-opt/fir-opt.cpp b/flang/tools/fir-opt/fir-opt.cpp index f75fba27c68f089df8fb8891338b42c96032bbae..84a74770cf030306f0fc21c1bf761d35609cae70 100644 --- a/flang/tools/fir-opt/fir-opt.cpp +++ b/flang/tools/fir-opt/fir-opt.cpp @@ -42,6 +42,7 @@ int main(int argc, char **argv) { #endif DialectRegistry registry; fir::support::registerDialects(registry); + registry.insert(); fir::support::addFIRExtensions(registry); return failed(MlirOptMain(argc, argv, "FIR modular optimizer driver\n", registry)); diff --git a/libc/cmake/modules/LLVMLibCArchitectures.cmake b/libc/cmake/modules/LLVMLibCArchitectures.cmake index 7711127c1a81e14f98d8b00f3004f19722c33528..1e5ed723194a268d4698163faf8b78a1a4d680e4 100644 --- a/libc/cmake/modules/LLVMLibCArchitectures.cmake +++ b/libc/cmake/modules/LLVMLibCArchitectures.cmake @@ -84,7 +84,7 @@ if(NOT (libc_compiler_info_result EQUAL "0")) message(FATAL_ERROR "libc build: error querying compiler info from the " "compiler: ${libc_compiler_info}") endif() -string(REGEX MATCH "Target: [-_a-z0-9.]+[ \r\n]+" +string(REGEX MATCH "Target: [-_a-zA-Z0-9.]+[ \r\n]+" libc_compiler_target_info ${libc_compiler_info}) if(NOT libc_compiler_target_info) message(FATAL_ERROR "libc build: could not read compiler target info from:\n" diff --git a/libc/config/gpu/entrypoints.txt b/libc/config/gpu/entrypoints.txt index b4cfe47f4505fd0ea5cd63afa037f14696ad3e3f..4bb81f5d3b2de171468bad8bb7c9ef0029df02de 100644 --- a/libc/config/gpu/entrypoints.txt +++ b/libc/config/gpu/entrypoints.txt @@ -521,7 +521,9 @@ if(LIBC_TYPES_HAS_FLOAT16) libc.src.math.canonicalizef16 libc.src.math.ceilf16 libc.src.math.copysignf16 + libc.src.math.coshf16 libc.src.math.exp10f16 + libc.src.math.exp10m1f16 libc.src.math.exp2f16 libc.src.math.expf16 libc.src.math.f16add @@ -584,6 +586,7 @@ if(LIBC_TYPES_HAS_FLOAT16) libc.src.math.scalbnf16 libc.src.math.setpayloadf16 libc.src.math.setpayloadsigf16 + libc.src.math.sinhf16 libc.src.math.totalorderf16 libc.src.math.totalordermagf16 libc.src.math.truncf16 diff --git a/libc/config/linux/aarch64/entrypoints.txt b/libc/config/linux/aarch64/entrypoints.txt index 32c0d199489317d96e1ee6cc904524a8568a2fd8..885827d304efe3f4815c3e78a96d4e1fdedb3f6b 100644 --- a/libc/config/linux/aarch64/entrypoints.txt +++ b/libc/config/linux/aarch64/entrypoints.txt @@ -679,6 +679,7 @@ if(LIBC_TYPES_HAS_FLOAT16) 
libc.src.math.scalbnf16 libc.src.math.setpayloadf16 libc.src.math.setpayloadsigf16 + libc.src.math.sinpif16 libc.src.math.totalorderf16 libc.src.math.totalordermagf16 libc.src.math.truncf16 diff --git a/libc/config/linux/x86_64/entrypoints.txt b/libc/config/linux/x86_64/entrypoints.txt index 1cd817171de4be4ac51d42a5a3f85a9bff44228a..39f451d6b5fc0eed44ed35245f4e6bf28828ca5c 100644 --- a/libc/config/linux/x86_64/entrypoints.txt +++ b/libc/config/linux/x86_64/entrypoints.txt @@ -610,7 +610,9 @@ if(LIBC_TYPES_HAS_FLOAT16) libc.src.math.canonicalizef16 libc.src.math.ceilf16 libc.src.math.copysignf16 + libc.src.math.coshf16 libc.src.math.exp10f16 + libc.src.math.exp10m1f16 libc.src.math.exp2f16 libc.src.math.exp2m1f16 libc.src.math.expf16 @@ -677,6 +679,8 @@ if(LIBC_TYPES_HAS_FLOAT16) libc.src.math.scalbnf16 libc.src.math.setpayloadf16 libc.src.math.setpayloadsigf16 + libc.src.math.sinhf16 + libc.src.math.sinpif16 libc.src.math.totalorderf16 libc.src.math.totalordermagf16 libc.src.math.truncf16 diff --git a/libc/docs/math/index.rst b/libc/docs/math/index.rst index 806dcc64bb93962ab4b7222e92f356715ea4c5a5..902645c9e00178ffd813124cc06d3563c170d281 100644 --- a/libc/docs/math/index.rst +++ b/libc/docs/math/index.rst @@ -278,7 +278,7 @@ Higher Math Functions +-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ | cos | |check| | |check| | | | | 7.12.4.5 | F.10.1.5 | +-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ -| cosh | |check| | | | | | 7.12.5.4 | F.10.2.4 | +| cosh | |check| | | | |check| | | 7.12.5.4 | F.10.2.4 | +-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ | cospi | |check| | | | | | 7.12.4.12 | F.10.1.12 | +-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ @@ -292,7 +292,7 @@ Higher Math Functions +-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ | exp10 | |check| | |check| | | |check| | | 7.12.6.2 | F.10.3.2 | +-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ -| exp10m1 | | | | | | 7.12.6.3 | F.10.3.3 | +| exp10m1 | | | | |check| | | 7.12.6.3 | F.10.3.3 | +-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ | exp2 | |check| | |check| | | |check| | | 7.12.6.4 | F.10.3.4 | +-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ @@ -340,9 +340,9 @@ Higher Math Functions +-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ | sincos | |check| | |check| | | | | | | 
+-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ -| sinh | |check| | | | | | 7.12.5.5 | F.10.2.5 | +| sinh | |check| | | | |check| | | 7.12.5.5 | F.10.2.5 | +-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ -| sinpi | |check| | | | | | 7.12.4.13 | F.10.1.13 | +| sinpi | |check| | | | |check| | | 7.12.4.13 | F.10.1.13 | +-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ | sqrt | |check| | |check| | |check| | | |check| | 7.12.7.10 | F.10.4.10 | +-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ diff --git a/libc/newhdrgen/yaml/math.yaml b/libc/newhdrgen/yaml/math.yaml index d8b810b542cb9f3d8a2a602a143f2c95718b8357..98ea1a0d25fbb7747b78c41c97e8d306d75f389e 100644 --- a/libc/newhdrgen/yaml/math.yaml +++ b/libc/newhdrgen/yaml/math.yaml @@ -2297,6 +2297,13 @@ functions: return_type: float arguments: - type: float + - name: sinpif16 + standards: + - stdc + return_type: _Float16 + arguments: + - type: _Float16 + guard: LIBC_TYPES_HAS_FLOAT16 - name: sqrt standards: - stdc diff --git a/libc/spec/stdc.td b/libc/spec/stdc.td index 1b255690c2e453eb04b0ad5d46a48dbed4d2fbb9..e4e46e7e13a586c2886af1fd82208f04d96334b6 100644 --- a/libc/spec/stdc.td +++ b/libc/spec/stdc.td @@ -692,6 +692,8 @@ def StdC : StandardSpec<"stdc"> { FunctionSpec<"exp10f", RetValSpec, [ArgSpec]>, GuardedFunctionSpec<"exp10f16", RetValSpec, [ArgSpec], "LIBC_TYPES_HAS_FLOAT16">, + GuardedFunctionSpec<"exp10m1f16", RetValSpec, [ArgSpec], "LIBC_TYPES_HAS_FLOAT16">, + FunctionSpec<"remainder", RetValSpec, [ArgSpec, ArgSpec]>, FunctionSpec<"remainderf", RetValSpec, [ArgSpec, ArgSpec]>, FunctionSpec<"remainderl", RetValSpec, [ArgSpec, ArgSpec]>, @@ -790,7 +792,11 @@ def StdC : StandardSpec<"stdc"> { FunctionSpec<"pow", RetValSpec, [ArgSpec, ArgSpec]>, FunctionSpec<"coshf", RetValSpec, [ArgSpec]>, + GuardedFunctionSpec<"coshf16", RetValSpec, [ArgSpec], "LIBC_TYPES_HAS_FLOAT16">, + FunctionSpec<"sinhf", RetValSpec, [ArgSpec]>, + GuardedFunctionSpec<"sinhf16", RetValSpec, [ArgSpec], "LIBC_TYPES_HAS_FLOAT16">, + FunctionSpec<"tanhf", RetValSpec, [ArgSpec]>, FunctionSpec<"acosf", RetValSpec, [ArgSpec]>, diff --git a/libc/src/__support/GPU/allocator.cpp b/libc/src/__support/GPU/allocator.cpp index 01273e16a9387a271f4493d8fd2de769a5d3d029..f98e610104797fda51d9b85cfa984dafeb9050ae 100644 --- a/libc/src/__support/GPU/allocator.cpp +++ b/libc/src/__support/GPU/allocator.cpp @@ -18,17 +18,18 @@ namespace { void *rpc_allocate(uint64_t size) { void *ptr = nullptr; rpc::Client::Port port = rpc::client.open(); - port.send_and_recv([=](rpc::Buffer *buffer) { buffer->data[0] = size; }, - [&](rpc::Buffer *buffer) { - ptr = reinterpret_cast(buffer->data[0]); - }); + port.send_and_recv( + [=](rpc::Buffer *buffer, uint32_t) { buffer->data[0] = size; }, + [&](rpc::Buffer *buffer, uint32_t) { + ptr = reinterpret_cast(buffer->data[0]); + }); port.close(); return ptr; } void rpc_free(void *ptr) { rpc::Client::Port port = rpc::client.open(); - port.send([=](rpc::Buffer *buffer) { + port.send([=](rpc::Buffer *buffer, uint32_t) { buffer->data[0] = 
reinterpret_cast(ptr); }); port.close(); diff --git a/libc/src/__support/OSUtil/gpu/exit.cpp b/libc/src/__support/OSUtil/gpu/exit.cpp index 360bcca1c6da3359546ea82f04729793dfa0dc30..8aaa41b4e3eefc55b117b294105c512aa1ba0398 100644 --- a/libc/src/__support/OSUtil/gpu/exit.cpp +++ b/libc/src/__support/OSUtil/gpu/exit.cpp @@ -18,8 +18,9 @@ namespace internal { [[noreturn]] void exit(int status) { // We want to first make sure the server is listening before we exit. rpc::Client::Port port = rpc::client.open(); - port.send_and_recv([](rpc::Buffer *) {}, [](rpc::Buffer *) {}); - port.send([&](rpc::Buffer *buffer) { + port.send_and_recv([](rpc::Buffer *, uint32_t) {}, + [](rpc::Buffer *, uint32_t) {}); + port.send([&](rpc::Buffer *buffer, uint32_t) { reinterpret_cast(buffer->data)[0] = status; }); port.close(); diff --git a/libc/src/__support/OSUtil/gpu/io.cpp b/libc/src/__support/OSUtil/gpu/io.cpp index f3000bd0f48b169d2cadcf582dc574646f993079..f70c2e798cfe158ab8980acd5fb12acbcc43b760 100644 --- a/libc/src/__support/OSUtil/gpu/io.cpp +++ b/libc/src/__support/OSUtil/gpu/io.cpp @@ -17,7 +17,7 @@ namespace LIBC_NAMESPACE_DECL { void write_to_stderr(cpp::string_view msg) { rpc::Client::Port port = rpc::client.open(); port.send_n(msg.data(), msg.size()); - port.recv([](rpc::Buffer *) { /* void */ }); + port.recv([](rpc::Buffer *, uint32_t) { /* void */ }); port.close(); } diff --git a/libc/src/__support/RPC/rpc.h b/libc/src/__support/RPC/rpc.h index a94b11902c11905c3e54100e10ca49b7811180a4..c421dd82b2945031d696b0f6dcfa5b9d15d01686 100644 --- a/libc/src/__support/RPC/rpc.h +++ b/libc/src/__support/RPC/rpc.h @@ -21,7 +21,6 @@ #include "rpc_util.h" #include "src/__support/CPP/algorithm.h" // max #include "src/__support/CPP/atomic.h" -#include "src/__support/CPP/functional.h" #include "src/__support/CPP/optional.h" #include "src/__support/GPU/utils.h" #include "src/__support/macros/config.h" @@ -266,22 +265,9 @@ template struct Process { }; /// Invokes a function accross every active buffer across the total lane size. -static LIBC_INLINE void invoke_rpc(cpp::function fn, - uint32_t lane_size, uint64_t lane_mask, - Buffer *slot) { - if constexpr (is_process_gpu()) { - fn(&slot[gpu::get_lane_id()]); - } else { - for (uint32_t i = 0; i < lane_size; i += gpu::get_lane_size()) - if (lane_mask & (1ul << i)) - fn(&slot[i]); - } -} - -/// Alternate version that also provides the index of the current lane. 
-static LIBC_INLINE void invoke_rpc(cpp::function fn, - uint32_t lane_size, uint64_t lane_mask, - Buffer *slot) { +template +LIBC_INLINE static void invoke_rpc(F &&fn, uint32_t lane_size, + uint64_t lane_mask, Buffer *slot) { if constexpr (is_process_gpu()) { fn(&slot[gpu::get_lane_id()], gpu::get_lane_id()); } else { @@ -444,7 +430,7 @@ template template LIBC_INLINE void Port::recv_and_send(W work) { recv(work); - send([](Buffer *) { /* no-op */ }); + send([](Buffer *, uint32_t) { /* no-op */ }); } /// Helper routine to simplify the interface when sending from the GPU using diff --git a/libc/src/__support/big_int.h b/libc/src/__support/big_int.h index 681782d57319e5e1b7b7875e43214419596cbcce..246b89f08f2ff95f0af5ba342a23bb290c89f45e 100644 --- a/libc/src/__support/big_int.h +++ b/libc/src/__support/big_int.h @@ -14,7 +14,7 @@ #include "src/__support/CPP/limits.h" #include "src/__support/CPP/optional.h" #include "src/__support/CPP/type_traits.h" -#include "src/__support/macros/attributes.h" // LIBC_INLINE +#include "src/__support/macros/attributes.h" // LIBC_INLINE #include "src/__support/macros/config.h" #include "src/__support/macros/optimization.h" // LIBC_UNLIKELY #include "src/__support/macros/properties/compiler.h" // LIBC_COMPILER_IS_CLANG @@ -361,17 +361,94 @@ public: LIBC_INLINE constexpr BigInt(const BigInt &other) = default; - template + template LIBC_INLINE constexpr BigInt( - const BigInt &other) { - if (OtherBits >= Bits) { // truncate - for (size_t i = 0; i < WORD_COUNT; ++i) - val[i] = other[i]; - } else { // zero or sign extend - size_t i = 0; - for (; i < OtherBits / WORD_SIZE; ++i) - val[i] = other[i]; - extend(i, Signed && other.is_neg()); + const BigInt &other) { + using BigIntOther = BigInt; + const bool should_sign_extend = Signed && other.is_neg(); + + static_assert(!(Bits == OtherBits && WORD_SIZE != BigIntOther::WORD_SIZE) && + "This is currently untested for casting between bigints with " + "the same bit width but different word sizes."); + + if constexpr (BigIntOther::WORD_SIZE < WORD_SIZE) { + // OtherWordType is smaller + constexpr size_t WORD_SIZE_RATIO = WORD_SIZE / BigIntOther::WORD_SIZE; + static_assert( + (WORD_SIZE % BigIntOther::WORD_SIZE) == 0 && + "Word types must be multiples of each other for correct conversion."); + if constexpr (OtherBits >= Bits) { // truncate + // for each big word + for (size_t i = 0; i < WORD_COUNT; ++i) { + WordType cur_word = 0; + // combine WORD_SIZE_RATIO small words into a big word + for (size_t j = 0; j < WORD_SIZE_RATIO; ++j) + cur_word |= static_cast(other[(i * WORD_SIZE_RATIO) + j]) + << (BigIntOther::WORD_SIZE * j); + + val[i] = cur_word; + } + } else { // zero or sign extend + size_t i = 0; + WordType cur_word = 0; + // for each small word + for (; i < BigIntOther::WORD_COUNT; ++i) { + // combine WORD_SIZE_RATIO small words into a big word + cur_word |= static_cast(other[i]) + << (BigIntOther::WORD_SIZE * (i % WORD_SIZE_RATIO)); + // if we've completed a big word, copy it into place and reset + if ((i % WORD_SIZE_RATIO) == WORD_SIZE_RATIO - 1) { + val[i / WORD_SIZE_RATIO] = cur_word; + cur_word = 0; + } + } + // Pretend there are extra words of the correct sign extension as needed + + const WordType extension_bits = + should_sign_extend ? cpp::numeric_limits::max() + : cpp::numeric_limits::min(); + if ((i % WORD_SIZE_RATIO) != 0) { + cur_word |= static_cast(extension_bits) + << (BigIntOther::WORD_SIZE * (i % WORD_SIZE_RATIO)); + } + // Copy the last word into place. 
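+      // Illustrative walk-through (editor's sketch; the instantiation is
+      // hypothetical): casting BigInt<96, false, uint32_t> to
+      // BigInt<192, false, uint64_t> has WORD_SIZE_RATIO = 2. Small words 0
+      // and 1 fuse into val[0]; small word 2 fills only the low half of
+      // cur_word, the high half is topped up with extension_bits above, the
+      // store below writes val[1], and extend() then zero- or sign-fills
+      // val[2].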
+ val[(i / WORD_SIZE_RATIO)] = cur_word; + extend((i / WORD_SIZE_RATIO) + 1, should_sign_extend); + } + } else if constexpr (BigIntOther::WORD_SIZE == WORD_SIZE) { + if constexpr (OtherBits >= Bits) { // truncate + for (size_t i = 0; i < WORD_COUNT; ++i) + val[i] = other[i]; + } else { // zero or sign extend + size_t i = 0; + for (; i < BigIntOther::WORD_COUNT; ++i) + val[i] = other[i]; + extend(i, should_sign_extend); + } + } else { + // OtherWordType is bigger. + constexpr size_t WORD_SIZE_RATIO = BigIntOther::WORD_SIZE / WORD_SIZE; + static_assert( + (BigIntOther::WORD_SIZE % WORD_SIZE) == 0 && + "Word types must be multiples of each other for correct conversion."); + if constexpr (OtherBits >= Bits) { // truncate + // for each small word + for (size_t i = 0; i < WORD_COUNT; ++i) { + // split each big word into WORD_SIZE_RATIO small words + val[i] = static_cast(other[i / WORD_SIZE_RATIO] >> + ((i % WORD_SIZE_RATIO) * WORD_SIZE)); + } + } else { // zero or sign extend + size_t i = 0; + // for each big word + for (; i < BigIntOther::WORD_COUNT; ++i) { + // split each big word into WORD_SIZE_RATIO small words + for (size_t j = 0; j < WORD_SIZE_RATIO; ++j) + val[(i * WORD_SIZE_RATIO) + j] = + static_cast(other[i] >> (j * WORD_SIZE)); + } + extend(i * WORD_SIZE_RATIO, should_sign_extend); + } } } diff --git a/libc/src/gpu/rpc_host_call.cpp b/libc/src/gpu/rpc_host_call.cpp index f21fadc319c61580caa0766b8dc88857e326dad4..1181e9554d16e23c4cc8732fd54575bceb531df1 100644 --- a/libc/src/gpu/rpc_host_call.cpp +++ b/libc/src/gpu/rpc_host_call.cpp @@ -21,11 +21,11 @@ LLVM_LIBC_FUNCTION(unsigned long long, rpc_host_call, (void *fn, void *data, size_t size)) { rpc::Client::Port port = rpc::client.open(); port.send_n(data, size); - port.send([=](rpc::Buffer *buffer) { + port.send([=](rpc::Buffer *buffer, uint32_t) { buffer->data[0] = reinterpret_cast(fn); }); unsigned long long ret; - port.recv([&](rpc::Buffer *buffer) { + port.recv([&](rpc::Buffer *buffer, uint32_t) { ret = static_cast(buffer->data[0]); }); port.close(); diff --git a/libc/src/math/CMakeLists.txt b/libc/src/math/CMakeLists.txt index 53907e47323e9e2e55d22779986c59b4a6f60aff..2f76b57d19e99d28d0f42dd9733679733b861cae 100644 --- a/libc/src/math/CMakeLists.txt +++ b/libc/src/math/CMakeLists.txt @@ -89,8 +89,11 @@ add_math_entrypoint_object(copysignf128) add_math_entrypoint_object(cos) add_math_entrypoint_object(cosf) + add_math_entrypoint_object(cosh) add_math_entrypoint_object(coshf) +add_math_entrypoint_object(coshf16) + add_math_entrypoint_object(cospif) add_math_entrypoint_object(daddl) @@ -127,6 +130,8 @@ add_math_entrypoint_object(exp10) add_math_entrypoint_object(exp10f) add_math_entrypoint_object(exp10f16) +add_math_entrypoint_object(exp10m1f16) + add_math_entrypoint_object(expm1) add_math_entrypoint_object(expm1f) add_math_entrypoint_object(expm1f16) @@ -475,9 +480,11 @@ add_math_entrypoint_object(sincosf) add_math_entrypoint_object(sin) add_math_entrypoint_object(sinf) add_math_entrypoint_object(sinpif) +add_math_entrypoint_object(sinpif16) add_math_entrypoint_object(sinh) add_math_entrypoint_object(sinhf) +add_math_entrypoint_object(sinhf16) add_math_entrypoint_object(sqrt) add_math_entrypoint_object(sqrtf) diff --git a/libc/src/math/coshf16.h b/libc/src/math/coshf16.h new file mode 100644 index 0000000000000000000000000000000000000000..55c9d4941d4ae11c61de76cd026cbc643060b416 --- /dev/null +++ b/libc/src/math/coshf16.h @@ -0,0 +1,21 @@ +//===-- Implementation header for coshf16 -----------------------*- C++ -*-===// +// +// Part 
of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_MATH_COSHF16_H +#define LLVM_LIBC_SRC_MATH_COSHF16_H + +#include "src/__support/macros/config.h" +#include "src/__support/macros/properties/types.h" + +namespace LIBC_NAMESPACE_DECL { + +float16 coshf16(float16 x); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_MATH_COSHF16_H diff --git a/libc/src/math/exp10m1f16.h b/libc/src/math/exp10m1f16.h new file mode 100644 index 0000000000000000000000000000000000000000..e195bc431f2e140f038091ca6f41d59b3826af70 --- /dev/null +++ b/libc/src/math/exp10m1f16.h @@ -0,0 +1,21 @@ +//===-- Implementation header for exp10m1f16 --------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_MATH_EXP10M1F16_H +#define LLVM_LIBC_SRC_MATH_EXP10M1F16_H + +#include "src/__support/macros/config.h" +#include "src/__support/macros/properties/types.h" + +namespace LIBC_NAMESPACE_DECL { + +float16 exp10m1f16(float16 x); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_MATH_EXP10M1F16_H diff --git a/libc/src/math/generic/CMakeLists.txt b/libc/src/math/generic/CMakeLists.txt index b58935abdf056d0709733b414ff9b439139996bf..4a3de8f0400d6296d114b89d6f01b0d7e41630c0 100644 --- a/libc/src/math/generic/CMakeLists.txt +++ b/libc/src/math/generic/CMakeLists.txt @@ -528,6 +528,25 @@ add_entrypoint_object( -O3 ) +add_entrypoint_object( + sinpif16 + SRCS + sinpif16.cpp + HDRS + ../sinpif16.h + DEPENDS + libc.src.__support.common + libc.src.__support.FPUtil.cast + libc.src.__support.FPUtil.fenv_impl + libc.src.__support.FPUtil.fp_bits + libc.src.__support.FPUtil.multiply_add + libc.src.__support.FPUtil.nearest_integer + libc.src.__support.FPUtil.polyeval + libc.src.__support.macros.properties.types + COMPILE_OPTIONS + -O3 +) + add_entrypoint_object( tan SRCS @@ -1637,6 +1656,29 @@ add_entrypoint_object( -O3 ) +add_entrypoint_object( + exp10m1f16 + SRCS + exp10m1f16.cpp + HDRS + ../exp10m1f16.h + DEPENDS + .expxf16 + libc.hdr.errno_macros + libc.hdr.fenv_macros + libc.src.__support.FPUtil.cast + libc.src.__support.FPUtil.except_value_utils + libc.src.__support.FPUtil.fenv_impl + libc.src.__support.FPUtil.fp_bits + libc.src.__support.FPUtil.multiply_add + libc.src.__support.FPUtil.polyeval + libc.src.__support.FPUtil.rounding_mode + libc.src.__support.macros.optimization + libc.src.__support.macros.properties.cpu_features + COMPILE_OPTIONS + -O3 +) + add_entrypoint_object( expm1 SRCS @@ -4176,6 +4218,25 @@ add_entrypoint_object( -O3 ) +add_entrypoint_object( + coshf16 + SRCS + coshf16.cpp + HDRS + ../coshf16.h + DEPENDS + .expxf16 + libc.hdr.errno_macros + libc.hdr.fenv_macros + libc.src.__support.FPUtil.except_value_utils + libc.src.__support.FPUtil.fenv_impl + libc.src.__support.FPUtil.fp_bits + libc.src.__support.FPUtil.rounding_mode + libc.src.__support.macros.optimization + COMPILE_OPTIONS + -O3 +) + add_entrypoint_object( sinhf SRCS @@ -4191,6 +4252,25 @@ add_entrypoint_object( -O3 ) +add_entrypoint_object( + sinhf16 + SRCS + sinhf16.cpp + HDRS + ../sinhf16.h 
+ DEPENDS + .expxf16 + libc.hdr.errno_macros + libc.hdr.fenv_macros + libc.src.__support.FPUtil.except_value_utils + libc.src.__support.FPUtil.fenv_impl + libc.src.__support.FPUtil.fp_bits + libc.src.__support.FPUtil.rounding_mode + libc.src.__support.macros.optimization + COMPILE_OPTIONS + -O3 +) + add_entrypoint_object( tanhf SRCS @@ -5255,6 +5335,7 @@ add_header_library( expxf16.h DEPENDS libc.src.__support.CPP.array + libc.src.__support.FPUtil.cast libc.src.__support.FPUtil.fp_bits libc.src.__support.FPUtil.multiply_add libc.src.__support.FPUtil.nearest_integer diff --git a/libc/src/math/generic/coshf16.cpp b/libc/src/math/generic/coshf16.cpp new file mode 100644 index 0000000000000000000000000000000000000000..cca7581c70e0e3603c91e72ae97290ce8460da18 --- /dev/null +++ b/libc/src/math/generic/coshf16.cpp @@ -0,0 +1,103 @@ +//===-- Half-precision cosh(x) function -----------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/coshf16.h" +#include "expxf16.h" +#include "hdr/errno_macros.h" +#include "hdr/fenv_macros.h" +#include "src/__support/FPUtil/FEnvImpl.h" +#include "src/__support/FPUtil/FPBits.h" +#include "src/__support/FPUtil/except_value_utils.h" +#include "src/__support/FPUtil/rounding_mode.h" +#include "src/__support/common.h" +#include "src/__support/macros/config.h" +#include "src/__support/macros/optimization.h" + +namespace LIBC_NAMESPACE_DECL { + +static constexpr fputil::ExceptValues COSHF16_EXCEPTS_POS = {{ + // x = 0x1.6ap-5, coshf16(x) = 0x1p+0 (RZ) + {0x29a8U, 0x3c00U, 1U, 0U, 1U}, + // x = 0x1.8c4p+0, coshf16(x) = 0x1.3a8p+1 (RZ) + {0x3e31U, 0x40eaU, 1U, 0U, 0U}, + // x = 0x1.994p+0, coshf16(x) = 0x1.498p+1 (RZ) + {0x3e65U, 0x4126U, 1U, 0U, 0U}, + // x = 0x1.b6p+0, coshf16(x) = 0x1.6d8p+1 (RZ) + {0x3ed8U, 0x41b6U, 1U, 0U, 1U}, + // x = 0x1.aap+1, coshf16(x) = 0x1.be8p+3 (RZ) + {0x42a8U, 0x4afaU, 1U, 0U, 1U}, + // x = 0x1.cc4p+1, coshf16(x) = 0x1.23cp+4 (RZ) + {0x4331U, 0x4c8fU, 1U, 0U, 0U}, + // x = 0x1.288p+2, coshf16(x) = 0x1.9b4p+5 (RZ) + {0x44a2U, 0x526dU, 1U, 0U, 0U}, + // x = 0x1.958p+2, coshf16(x) = 0x1.1a4p+8 (RZ) + {0x4656U, 0x5c69U, 1U, 0U, 0U}, + // x = 0x1.5fp+3, coshf16(x) = 0x1.c54p+14 (RZ) + {0x497cU, 0x7715U, 1U, 0U, 1U}, +}}; + +static constexpr fputil::ExceptValues COSHF16_EXCEPTS_NEG = {{ + // x = -0x1.6ap-5, coshf16(x) = 0x1p+0 (RZ) + {0xa9a8U, 0x3c00U, 1U, 0U, 1U}, + // x = -0x1.b6p+0, coshf16(x) = 0x1.6d8p+1 (RZ) + {0xbed8U, 0x41b6U, 1U, 0U, 1U}, + // x = -0x1.288p+2, coshf16(x) = 0x1.9b4p+5 (RZ) + {0xc4a2U, 0x526dU, 1U, 0U, 0U}, + // x = -0x1.5fp+3, coshf16(x) = 0x1.c54p+14 (RZ) + {0xc97cU, 0x7715U, 1U, 0U, 1U}, +}}; + +LLVM_LIBC_FUNCTION(float16, coshf16, (float16 x)) { + using FPBits = fputil::FPBits; + FPBits x_bits(x); + + uint16_t x_u = x_bits.uintval(); + uint16_t x_abs = x_u & 0x7fffU; + + // When |x| >= acosh(2^16), or x is NaN. + if (LIBC_UNLIKELY(x_abs >= 0x49e5U)) { + // cosh(NaN) = NaN + if (x_bits.is_nan()) { + if (x_bits.is_signaling_nan()) { + fputil::raise_except_if_required(FE_INVALID); + return FPBits::quiet_nan().get_val(); + } + + return x; + } + + // When |x| >= acosh(2^16). 
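+  // (Here 0x49e5 encodes 0x1.794p+3 = 11.7890625, the smallest float16 above
+  // acosh(2^16) ~= 11.7835; for such |x|, cosh(x) ~= e^|x| / 2 exceeds the
+  // float16 maximum of 65504, so only the overflow paths below remain.)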
+ if (x_abs >= 0x49e5U) { + // cosh(+/-inf) = +inf + if (x_bits.is_inf()) + return FPBits::inf().get_val(); + + switch (fputil::quick_get_round()) { + case FE_TONEAREST: + case FE_UPWARD: + fputil::set_errno_if_required(ERANGE); + fputil::raise_except_if_required(FE_OVERFLOW | FE_INEXACT); + return FPBits::inf().get_val(); + default: + return FPBits::max_normal().get_val(); + } + } + } + + if (x_bits.is_pos()) { + if (auto r = COSHF16_EXCEPTS_POS.lookup(x_u); LIBC_UNLIKELY(r.has_value())) + return r.value(); + } else { + if (auto r = COSHF16_EXCEPTS_NEG.lookup(x_u); LIBC_UNLIKELY(r.has_value())) + return r.value(); + } + + return eval_sinh_or_cosh(x); +} + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/math/generic/exp10f16.cpp b/libc/src/math/generic/exp10f16.cpp index 1c5966c1f1c1262a5628df8ade249ca4fe221bf8..f7a8ee3245eda6e603a7dfffa29c62e09bc3be13 100644 --- a/libc/src/math/generic/exp10f16.cpp +++ b/libc/src/math/generic/exp10f16.cpp @@ -54,16 +54,6 @@ static constexpr fputil::ExceptValues #endif }}; -// Generated by Sollya with the following commands: -// > display = hexadecimal; -// > round(log2(10), SG, RN); -static constexpr float LOG2F_10 = 0x1.a934fp+1f; - -// Generated by Sollya with the following commands: -// > display = hexadecimal; -// > round(log10(2), SG, RN); -static constexpr float LOG10F_2 = 0x1.344136p-2f; - LLVM_LIBC_FUNCTION(float16, exp10f16, (float16 x)) { using FPBits = fputil::FPBits; FPBits x_bits(x); @@ -132,40 +122,9 @@ LLVM_LIBC_FUNCTION(float16, exp10f16, (float16 x)) { if (auto r = EXP10F16_EXCEPTS.lookup(x_u); LIBC_UNLIKELY(r.has_value())) return r.value(); - // For -8 < x < 5, to compute 10^x, we perform the following range reduction: - // find hi, mid, lo, such that: - // x = (hi + mid) * log2(10) + lo, in which - // hi is an integer, - // mid * 2^3 is an integer, - // -2^(-4) <= lo < 2^(-4). - // In particular, - // hi + mid = round(x * 2^3) * 2^(-3). - // Then, - // 10^x = 10^(hi + mid + lo) = 2^((hi + mid) * log2(10)) + 10^lo - // We store 2^mid in the lookup table EXP2_MID_BITS, and compute 2^hi * 2^mid - // by adding hi to the exponent field of 2^mid. 10^lo is computed using a - // degree-4 minimax polynomial generated by Sollya. 
- - float xf = x; - float kf = fputil::nearest_integer(xf * (LOG2F_10 * 0x1.0p+3f)); - int x_hi_mid = static_cast(kf); - int x_hi = x_hi_mid >> 3; - int x_mid = x_hi_mid & 0x7; - // lo = x - (hi + mid) = round(x * 2^3 * log2(10)) * log10(2) * (-2^(-3)) + x - float lo = fputil::multiply_add(kf, LOG10F_2 * -0x1.0p-3f, xf); - - uint32_t exp2_hi_mid_bits = - EXP2_MID_BITS[x_mid] + - static_cast(x_hi << fputil::FPBits::FRACTION_LEN); - float exp2_hi_mid = fputil::FPBits(exp2_hi_mid_bits).get_val(); - // Degree-4 minimax polynomial generated by Sollya with the following - // commands: - // > display = hexadecimal; - // > P = fpminimax((10^x - 1)/x, 3, [|SG...|], [-2^-4, 2^-4]); - // > 1 + x * P; - float exp10_lo = fputil::polyeval(lo, 0x1p+0f, 0x1.26bb14p+1f, 0x1.53526p+1f, - 0x1.04b434p+1f, 0x1.2bcf9ep+0f); - return fputil::cast(exp2_hi_mid * exp10_lo); + // 10^x = 2^((hi + mid) * log2(10)) * 10^lo + auto [exp2_hi_mid, exp10_lo] = exp10_range_reduction(x); + return static_cast(exp2_hi_mid * exp10_lo); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/math/generic/exp10m1f16.cpp b/libc/src/math/generic/exp10m1f16.cpp new file mode 100644 index 0000000000000000000000000000000000000000..9f2c1959fa5ec99dc695c1df335efde65f98c354 --- /dev/null +++ b/libc/src/math/generic/exp10m1f16.cpp @@ -0,0 +1,163 @@ +//===-- Half-precision 10^x - 1 function ----------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/exp10m1f16.h" +#include "expxf16.h" +#include "hdr/errno_macros.h" +#include "hdr/fenv_macros.h" +#include "src/__support/FPUtil/FEnvImpl.h" +#include "src/__support/FPUtil/FPBits.h" +#include "src/__support/FPUtil/PolyEval.h" +#include "src/__support/FPUtil/cast.h" +#include "src/__support/FPUtil/except_value_utils.h" +#include "src/__support/FPUtil/multiply_add.h" +#include "src/__support/FPUtil/rounding_mode.h" +#include "src/__support/common.h" +#include "src/__support/macros/config.h" +#include "src/__support/macros/optimization.h" +#include "src/__support/macros/properties/cpu_features.h" + +namespace LIBC_NAMESPACE_DECL { + +static constexpr fputil::ExceptValues EXP10M1F16_EXCEPTS_LO = {{ + // (input, RZ output, RU offset, RD offset, RN offset) + // x = 0x1.5c4p-4, exp10m1f16(x) = 0x1.bacp-3 (RZ) + {0x2d71U, 0x32ebU, 1U, 0U, 0U}, + // x = -0x1.5ep-13, exp10m1f16(x) = -0x1.92cp-12 (RZ) + {0x8978U, 0x8e4bU, 0U, 1U, 0U}, + // x = -0x1.e2p-10, exp10m1f16(x) = -0x1.14cp-8 (RZ) + {0x9788U, 0x9c53U, 0U, 1U, 0U}, +}}; + +#ifdef LIBC_TARGET_CPU_HAS_FMA +static constexpr size_t N_EXP10M1F16_EXCEPTS_HI = 3; +#else +static constexpr size_t N_EXP10M1F16_EXCEPTS_HI = 6; +#endif + +static constexpr fputil::ExceptValues + EXP10M1F16_EXCEPTS_HI = {{ + // (input, RZ output, RU offset, RD offset, RN offset) + // x = 0x1.8f4p-2, exp10m1f16(x) = 0x1.744p+0 (RZ) + {0x363dU, 0x3dd1U, 1U, 0U, 0U}, + // x = 0x1.95cp-2, exp10m1f16(x) = 0x1.7d8p+0 (RZ) + {0x3657U, 0x3df6U, 1U, 0U, 0U}, + // x = 0x1.d04p-2, exp10m1f16(x) = 0x1.d7p+0 (RZ) + {0x3741U, 0x3f5cU, 1U, 0U, 1U}, +#ifndef LIBC_TARGET_CPU_HAS_FMA + // x = 0x1.0cp+1, exp10m1f16(x) = 0x1.ec4p+6 (RZ) + {0x4030U, 0x57b1U, 1U, 0U, 1U}, + // x = 0x1.1b8p+1, exp10m1f16(x) = 0x1.45cp+7 (RZ) + {0x406eU, 0x5917U, 1U, 0U, 1U}, + // x = 0x1.2f4p+2, exp10m1f16(x) = 0x1.ab8p+15 
(RZ) + {0x44bdU, 0x7aaeU, 1U, 0U, 1U}, +#endif + }}; + +LLVM_LIBC_FUNCTION(float16, exp10m1f16, (float16 x)) { + using FPBits = fputil::FPBits; + FPBits x_bits(x); + + uint16_t x_u = x_bits.uintval(); + uint16_t x_abs = x_u & 0x7fffU; + + // When |x| <= 2^(-3), or |x| >= 11 * log10(2), or x is NaN. + if (LIBC_UNLIKELY(x_abs <= 0x3000U || x_abs >= 0x429fU)) { + // exp10m1(NaN) = NaN + if (x_bits.is_nan()) { + if (x_bits.is_signaling_nan()) { + fputil::raise_except_if_required(FE_INVALID); + return FPBits::quiet_nan().get_val(); + } + + return x; + } + + // When x >= 16 * log10(2). + if (x_u >= 0x44d1U && x_bits.is_pos()) { + // exp10m1(+inf) = +inf + if (x_bits.is_inf()) + return FPBits::inf().get_val(); + + switch (fputil::quick_get_round()) { + case FE_TONEAREST: + case FE_UPWARD: + fputil::set_errno_if_required(ERANGE); + fputil::raise_except_if_required(FE_OVERFLOW | FE_INEXACT); + return FPBits::inf().get_val(); + default: + return FPBits::max_normal().get_val(); + } + } + + // When x < -11 * log10(2). + if (x_u > 0xc29fU) { + // exp10m1(-inf) = -1 + if (x_bits.is_inf()) + return FPBits::one(Sign::NEG).get_val(); + + // When x >= -0x1.ce4p+1, round(10^x - 1, HP, RN) = -0x1.ffcp-1. + if (x_u <= 0xc339U) { + return fputil::round_result_slightly_down( + fputil::cast(-0x1.ffcp-1)); + } + + // When x < -0x1.ce4p+1, round(10^x - 1, HP, RN) = -1. + switch (fputil::quick_get_round()) { + case FE_TONEAREST: + case FE_DOWNWARD: + return FPBits::one(Sign::NEG).get_val(); + default: + return fputil::cast(-0x1.ffcp-1); + } + } + + // When |x| <= 2^(-3). + if (x_abs <= 0x3000U) { + if (auto r = EXP10M1F16_EXCEPTS_LO.lookup(x_u); + LIBC_UNLIKELY(r.has_value())) + return r.value(); + + float xf = x; + // Degree-5 minimax polynomial generated by Sollya with the following + // commands: + // > display = hexadecimal; + // > P = fpminimax((10^x - 1)/x, 4, [|SG...|], [-2^-3, 2^-3]); + // > x * P; + return fputil::cast( + xf * fputil::polyeval(xf, 0x1.26bb1cp+1f, 0x1.5351c8p+1f, + 0x1.04704p+1f, 0x1.2ce084p+0f, 0x1.14a6bep-1f)); + } + } + + // When x is 1, 2, or 3. These are hard-to-round cases with exact results. + // 10^4 - 1 = 9'999 is not exactly representable as a float16, but luckily the + // polynomial approximation gives the correct result for x = 4 in all + // rounding modes. 
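+  // For instance, 10^2 - 1 = 99 = 0x1.8cp+6 is exact in float16, so the cast
+  // below is correct in every rounding mode, while 10^4 - 1 = 9999 is not
+  // representable (the float16 ULP at 2^13 is 8); x = 4 therefore has no case
+  // in the switch and falls through to the polynomial path.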
+  if (LIBC_UNLIKELY((x_u & ~(0x3c00U | 0x4000U | 0x4200U | 0x4400U)) == 0)) {
+    switch (x_u) {
+    case 0x3c00U: // x = 1.0f16
+      return fputil::cast<float16>(9.0);
+    case 0x4000U: // x = 2.0f16
+      return fputil::cast<float16>(99.0);
+    case 0x4200U: // x = 3.0f16
+      return fputil::cast<float16>(999.0);
+    }
+  }
+
+  if (auto r = EXP10M1F16_EXCEPTS_HI.lookup(x_u); LIBC_UNLIKELY(r.has_value()))
+    return r.value();
+
+  // exp10(x) = exp2(hi + mid) * exp10(lo)
+  auto [exp2_hi_mid, exp10_lo] = exp10_range_reduction(x);
+  // exp10m1(x) = exp2(hi + mid) * exp10(lo) - 1
+  return fputil::cast<float16>(
+      fputil::multiply_add(exp2_hi_mid, exp10_lo, -1.0f));
+}
+
+} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/math/generic/expxf16.h b/libc/src/math/generic/expxf16.h
index 35294130a15007be410c05934e693bb091a3321c..7202b1b113190e6f64db5874511fa54e8b5c8d4a 100644
--- a/libc/src/math/generic/expxf16.h
+++ b/libc/src/math/generic/expxf16.h
@@ -12,6 +12,7 @@
 #include "src/__support/CPP/array.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/__support/FPUtil/PolyEval.h"
+#include "src/__support/FPUtil/cast.h"
 #include "src/__support/FPUtil/multiply_add.h"
 #include "src/__support/FPUtil/nearest_integer.h"
 #include "src/__support/macros/attributes.h"
@@ -108,8 +109,8 @@ LIBC_INLINE ExpRangeReduction exp2_range_reduction(float16 x) {
   float xf = x;
   float kf = fputil::nearest_integer(xf * 0x1.0p+3f);
   int x_hi_mid = static_cast<int>(kf);
-  int x_hi = x_hi_mid >> 3;
-  int x_mid = x_hi_mid & 0x7;
+  unsigned x_hi = static_cast<unsigned>(x_hi_mid) >> 3;
+  unsigned x_mid = static_cast<unsigned>(x_hi_mid) & 0x7;
   // lo = x - (hi + mid) = round(x * 2^3) * (-2^(-3)) + x
   float lo = fputil::multiply_add(kf, -0x1.0p-3f, xf);
 
@@ -127,6 +128,166 @@ LIBC_INLINE ExpRangeReduction exp2_range_reduction(float16 x) {
   return {exp2_hi_mid, exp2_lo};
 }
 
+// Generated by Sollya with the following commands:
+// > display = hexadecimal;
+// > round(log2(10), SG, RN);
+static constexpr float LOG2F_10 = 0x1.a934fp+1f;
+
+// Generated by Sollya with the following commands:
+// > display = hexadecimal;
+// > round(log10(2), SG, RN);
+static constexpr float LOG10F_2 = 0x1.344136p-2f;
+
+LIBC_INLINE ExpRangeReduction exp10_range_reduction(float16 x) {
+  // For -8 < x < 5, to compute 10^x, we perform the following range reduction:
+  // find hi, mid, lo, such that:
+  //   x = (hi + mid) * log10(2) + lo, in which
+  //     hi is an integer,
+  //     mid * 2^3 is an integer,
+  //     -2^(-4) <= lo < 2^(-4).
+  // In particular,
+  //   hi + mid = round(x * 2^3 * log2(10)) * 2^(-3).
+  // Then,
+  //   10^x = 10^((hi + mid) * log10(2) + lo) = 2^(hi + mid) * 10^lo.
+  // We store 2^mid in the lookup table EXP2_MID_BITS, and compute 2^hi * 2^mid
+  // by adding hi to the exponent field of 2^mid. 10^lo is computed using a
+  // degree-4 minimax polynomial generated by Sollya.
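+  //
+  // Worked example: for x = 1, kf = round(8 * log2(10)) = 27, so
+  // hi + mid = 27/8 = 3.375 (hi = 3, mid = 3/8) and
+  // lo = 1 - 3.375 * log10(2) ~= -0.016, giving
+  // 2^3.375 * 10^(-0.016) ~= 10.3747 * 0.9639 ~= 10, as expected.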
+
+  float xf = x;
+  float kf = fputil::nearest_integer(xf * (LOG2F_10 * 0x1.0p+3f));
+  int x_hi_mid = static_cast<int>(kf);
+  unsigned x_hi = static_cast<unsigned>(x_hi_mid) >> 3;
+  unsigned x_mid = static_cast<unsigned>(x_hi_mid) & 0x7;
+  // lo = x - (hi + mid) * log10(2)
+  //    = round(x * 2^3 * log2(10)) * log10(2) * (-2^(-3)) + x
+  float lo = fputil::multiply_add(kf, LOG10F_2 * -0x1.0p-3f, xf);
+
+  uint32_t exp2_hi_mid_bits =
+      EXP2_MID_BITS[x_mid] +
+      static_cast<uint32_t>(x_hi << fputil::FPBits<float>::FRACTION_LEN);
+  float exp2_hi_mid = fputil::FPBits<float>(exp2_hi_mid_bits).get_val();
+  // Degree-4 minimax polynomial generated by Sollya with the following
+  // commands:
+  // > display = hexadecimal;
+  // > P = fpminimax((10^x - 1)/x, 3, [|SG...|], [-2^-4, 2^-4]);
+  // > 1 + x * P;
+  float exp10_lo = fputil::polyeval(lo, 0x1p+0f, 0x1.26bb14p+1f, 0x1.53526p+1f,
+                                    0x1.04b434p+1f, 0x1.2bcf9ep+0f);
+  return {exp2_hi_mid, exp10_lo};
+}
+
+// Generated by Sollya with the following commands:
+// > display = hexadecimal;
+// > round(log2(exp(1)), SG, RN);
+static constexpr float LOG2F_E = 0x1.715476p+0f;
+
+// Generated by Sollya with the following commands:
+// > display = hexadecimal;
+// > round(log(2), SG, RN);
+static constexpr float LOGF_2 = 0x1.62e43p-1f;
+
+// Generated by Sollya with the following commands:
+// > display = hexadecimal;
+// > for i from 0 to 31 do printsingle(round(2^(i * 2^-5), SG, RN));
+static constexpr cpp::array<uint32_t, 32> EXP2_MID_5_BITS = {
+    0x3f80'0000U, 0x3f82'cd87U, 0x3f85'aac3U, 0x3f88'980fU, 0x3f8b'95c2U,
+    0x3f8e'a43aU, 0x3f91'c3d3U, 0x3f94'f4f0U, 0x3f98'37f0U, 0x3f9b'8d3aU,
+    0x3f9e'f532U, 0x3fa2'7043U, 0x3fa5'fed7U, 0x3fa9'a15bU, 0x3fad'583fU,
+    0x3fb1'23f6U, 0x3fb5'04f3U, 0x3fb8'fbafU, 0x3fbd'08a4U, 0x3fc1'2c4dU,
+    0x3fc5'672aU, 0x3fc9'b9beU, 0x3fce'248cU, 0x3fd2'a81eU, 0x3fd7'44fdU,
+    0x3fdb'fbb8U, 0x3fe0'ccdfU, 0x3fe5'b907U, 0x3fea'c0c7U, 0x3fef'e4baU,
+    0x3ff5'257dU, 0x3ffa'83b3U,
+};
+
+// This function calculates sinh(x) and cosh(x) by computing exp(x) and exp(-x)
+// simultaneously.
+// To compute e^x, we perform the following range reduction:
+// find hi, mid, lo such that:
+//   x = (hi + mid) * log(2) + lo, in which
+//     hi is an integer,
+//     mid * 2^5 is an integer with 0 <= mid * 2^5 < 32,
+//     -2^(-5) <= lo * log2(e) <= 2^-5.
+// In particular,
+//   hi + mid = round(x * log2(e) * 2^5) * 2^(-5).
+// Then,
+//   e^x = 2^(hi + mid) * e^lo = 2^hi * 2^mid * e^lo.
+// We store 2^mid in the lookup table EXP2_MID_5_BITS, and compute 2^hi * 2^mid
+// by adding hi to the exponent field of 2^mid.
+// e^lo is computed using a degree-3 minimax polynomial generated by Sollya:
+//   e^lo ~ P(lo)
+//        = 1 + lo + c2 * lo^2 + c3 * lo^3
+//        = (1 + c2 * lo^2) + lo * (1 + c3 * lo^2)
+//        = P_even + lo * P_odd
+// To compute e^(-x), notice that:
+//   e^(-x) = 2^(-(hi + mid)) * e^(-lo)
+//          ~ 2^(-(hi + mid)) * P(-lo)
+//          = 2^(-(hi + mid)) * (P_even - lo * P_odd)
+// So:
+//   sinh(x) = (e^x - e^(-x)) / 2
+//           ~ 0.5 * (2^(hi + mid) * (P_even + lo * P_odd) -
+//                    2^(-(hi + mid)) * (P_even - lo * P_odd))
+//           = 0.5 * (P_even * (2^(hi + mid) - 2^(-(hi + mid))) +
+//                    lo * P_odd * (2^(hi + mid) + 2^(-(hi + mid))))
+// And similarly:
+//   cosh(x) = (e^x + e^(-x)) / 2
+//           ~ 0.5 * (P_even * (2^(hi + mid) + 2^(-(hi + mid))) +
+//                    lo * P_odd * (2^(hi + mid) - 2^(-(hi + mid))))
+// The main point of these formulas is that the expensive part of calculating
+// the polynomials approximating the lower parts of e^x and e^(-x) is shared
+// and only done once.
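+//
+// Worked example: at x = 1, kf = round(32 * log2(e)) = 46, so hi + mid =
+// 46/32, 2^(hi + mid) ~= 2.7085, 2^(-(hi + mid)) ~= 0.3692, and
+// lo = 1 - (46/32) * log(2) ~= 0.0036; a single evaluation of P_even and
+// P_odd then yields both sinh(1) ~= 1.1752 and cosh(1) ~= 1.5431.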
+template LIBC_INLINE float16 eval_sinh_or_cosh(float16 x) { + float xf = x; + float kf = fputil::nearest_integer(xf * (LOG2F_E * 0x1.0p+5f)); + int x_hi_mid_p = static_cast(kf); + int x_hi_mid_m = -x_hi_mid_p; + + unsigned x_hi_p = static_cast(x_hi_mid_p) >> 5; + unsigned x_hi_m = static_cast(x_hi_mid_m) >> 5; + unsigned x_mid_p = static_cast(x_hi_mid_p) & 0x1f; + unsigned x_mid_m = static_cast(x_hi_mid_m) & 0x1f; + + uint32_t exp2_hi_mid_bits_p = + EXP2_MID_5_BITS[x_mid_p] + + static_cast(x_hi_p << fputil::FPBits::FRACTION_LEN); + uint32_t exp2_hi_mid_bits_m = + EXP2_MID_5_BITS[x_mid_m] + + static_cast(x_hi_m << fputil::FPBits::FRACTION_LEN); + // exp2_hi_mid_p = 2^(hi + mid) + float exp2_hi_mid_p = fputil::FPBits(exp2_hi_mid_bits_p).get_val(); + // exp2_hi_mid_m = 2^(-(hi + mid)) + float exp2_hi_mid_m = fputil::FPBits(exp2_hi_mid_bits_m).get_val(); + + // exp2_hi_mid_sum = 2^(hi + mid) + 2^(-(hi + mid)) + float exp2_hi_mid_sum = exp2_hi_mid_p + exp2_hi_mid_m; + // exp2_hi_mid_diff = 2^(hi + mid) - 2^(-(hi + mid)) + float exp2_hi_mid_diff = exp2_hi_mid_p - exp2_hi_mid_m; + + // lo = x - (hi + mid) = round(x * log2(e) * 2^5) * log(2) * (-2^(-5)) + x + float lo = fputil::multiply_add(kf, LOGF_2 * -0x1.0p-5f, xf); + float lo_sq = lo * lo; + + // Degree-3 minimax polynomial generated by Sollya with the following + // commands: + // > display = hexadecimal; + // > P = fpminimax(expm1(x)/x, 2, [|SG...|], [-2^-5, 2^-5]); + // > 1 + x * P; + constexpr cpp::array COEFFS = {0x1p+0f, 0x1p+0f, 0x1.0004p-1f, + 0x1.555778p-3f}; + float half_p_odd = + fputil::polyeval(lo_sq, COEFFS[1] * 0.5f, COEFFS[3] * 0.5f); + float half_p_even = + fputil::polyeval(lo_sq, COEFFS[0] * 0.5f, COEFFS[2] * 0.5f); + + // sinh(x) = lo * (0.5 * P_odd * (2^(hi + mid) + 2^(-(hi + mid)))) + + // (0.5 * P_even * (2^(hi + mid) - 2^(-(hi + mid)))) + if constexpr (IsSinh) + return fputil::cast(fputil::multiply_add( + lo, half_p_odd * exp2_hi_mid_sum, half_p_even * exp2_hi_mid_diff)); + // cosh(x) = lo * (0.5 * P_odd * (2^(hi + mid) - 2^(-(hi + mid)))) + + // (0.5 * P_even * (2^(hi + mid) + 2^(-(hi + mid)))) + return fputil::cast(fputil::multiply_add( + lo, half_p_odd * exp2_hi_mid_diff, half_p_even * exp2_hi_mid_sum)); +} + } // namespace LIBC_NAMESPACE_DECL #endif // LLVM_LIBC_SRC_MATH_GENERIC_EXPXF16_H diff --git a/libc/src/math/generic/sinhf16.cpp b/libc/src/math/generic/sinhf16.cpp new file mode 100644 index 0000000000000000000000000000000000000000..e2dd009dc72c6dab21add2867a570ca21b4116cb --- /dev/null +++ b/libc/src/math/generic/sinhf16.cpp @@ -0,0 +1,144 @@ +//===-- Half-precision sinh(x) function -----------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/sinhf16.h" +#include "expxf16.h" +#include "hdr/errno_macros.h" +#include "hdr/fenv_macros.h" +#include "src/__support/FPUtil/FEnvImpl.h" +#include "src/__support/FPUtil/FPBits.h" +#include "src/__support/FPUtil/except_value_utils.h" +#include "src/__support/FPUtil/rounding_mode.h" +#include "src/__support/common.h" +#include "src/__support/macros/config.h" +#include "src/__support/macros/optimization.h" + +namespace LIBC_NAMESPACE_DECL { + +static constexpr fputil::ExceptValues SINHF16_EXCEPTS_POS = {{ + // x = 0x1.714p-5, sinhf16(x) = 0x1.714p-5 (RZ) + {0x29c5U, 0x29c5U, 1U, 0U, 1U}, + // x = 0x1.25p-4, sinhf16(x) = 0x1.25p-4 (RZ) + {0x2c94U, 0x2c94U, 1U, 0U, 1U}, + // x = 0x1.f5p-4, sinhf16(x) = 0x1.f64p-4 (RZ) + {0x2fd4U, 0x2fd9U, 1U, 0U, 0U}, + // x = 0x1.b1cp-3, sinhf16(x) = 0x1.b4cp-3 (RZ) + {0x32c7U, 0x32d3U, 1U, 0U, 1U}, + // x = 0x1.6e8p-2, sinhf16(x) = 0x1.764p-2 (RZ) + {0x35baU, 0x35d9U, 1U, 0U, 1U}, + // x = 0x1.6b4p-1, sinhf16(x) = 0x1.8a4p-1 (RZ) + {0x39adU, 0x3a29U, 1U, 0U, 1U}, + // x = 0x1.a58p-1, sinhf16(x) = 0x1.d68p-1 (RZ) + {0x3a96U, 0x3b5aU, 1U, 0U, 1U}, + // x = 0x1.574p+0, sinhf16(x) = 0x1.c78p+0 (RZ) + {0x3d5dU, 0x3f1eU, 1U, 0U, 1U}, + // x = 0x1.648p+1, sinhf16(x) = 0x1.024p+3 (RZ) + {0x4192U, 0x4809U, 1U, 0U, 0U}, + // x = 0x1.cdcp+1, sinhf16(x) = 0x1.26cp+4 (RZ) + {0x4337U, 0x4c9bU, 1U, 0U, 0U}, + // x = 0x1.d0cp+1, sinhf16(x) = 0x1.2d8p+4 (RZ) + {0x4343U, 0x4cb6U, 1U, 0U, 1U}, + // x = 0x1.018p+2, sinhf16(x) = 0x1.bfp+4 (RZ) + {0x4406U, 0x4efcU, 1U, 0U, 0U}, + // x = 0x1.2fcp+2, sinhf16(x) = 0x1.cc4p+5 (RZ) + {0x44bfU, 0x5331U, 1U, 0U, 1U}, + // x = 0x1.4ecp+2, sinhf16(x) = 0x1.75cp+6 (RZ) + {0x453bU, 0x55d7U, 1U, 0U, 0U}, + // x = 0x1.8a4p+2, sinhf16(x) = 0x1.d94p+7 (RZ) + {0x4629U, 0x5b65U, 1U, 0U, 1U}, + // x = 0x1.5fp+3, sinhf16(x) = 0x1.c54p+14 (RZ) + {0x497cU, 0x7715U, 1U, 0U, 1U}, +}}; + +static constexpr fputil::ExceptValues SINHF16_EXCEPTS_NEG = {{ + // x = -0x1.714p-5, sinhf16(x) = -0x1.714p-5 (RZ) + {0xa9c5U, 0xa9c5U, 0U, 1U, 1U}, + // x = -0x1.25p-4, sinhf16(x) = -0x1.25p-4 (RZ) + {0xac94U, 0xac94U, 0U, 1U, 1U}, + // x = -0x1.f5p-4, sinhf16(x) = -0x1.f64p-4 (RZ) + {0xafd4U, 0xafd9U, 0U, 1U, 0U}, + // x = -0x1.6e8p-2, sinhf16(x) = -0x1.764p-2 (RZ) + {0xb5baU, 0xb5d9U, 0U, 1U, 1U}, + // x = -0x1.a58p-1, sinhf16(x) = -0x1.d68p-1 (RZ) + {0xba96U, 0xbb5aU, 0U, 1U, 1U}, + // x = -0x1.cdcp+1, sinhf16(x) = -0x1.26cp+4 (RZ) + {0xc337U, 0xcc9bU, 0U, 1U, 0U}, + // x = -0x1.d0cp+1, sinhf16(x) = -0x1.2d8p+4 (RZ) + {0xc343U, 0xccb6U, 0U, 1U, 1U}, + // x = -0x1.018p+2, sinhf16(x) = -0x1.bfp+4 (RZ) + {0xc406U, 0xcefcU, 0U, 1U, 0U}, + // x = -0x1.2fcp+2, sinhf16(x) = -0x1.cc4p+5 (RZ) + {0xc4bfU, 0xd331U, 0U, 1U, 1U}, + // x = -0x1.4ecp+2, sinhf16(x) = -0x1.75cp+6 (RZ) + {0xc53bU, 0xd5d7U, 0U, 1U, 0U}, + // x = -0x1.8a4p+2, sinhf16(x) = -0x1.d94p+7 (RZ) + {0xc629U, 0xdb65U, 0U, 1U, 1U}, + // x = -0x1.5fp+3, sinhf16(x) = -0x1.c54p+14 (RZ) + {0xc97cU, 0xf715U, 0U, 1U, 1U}, +}}; + +LLVM_LIBC_FUNCTION(float16, sinhf16, (float16 x)) { + using FPBits = fputil::FPBits; + FPBits x_bits(x); + + uint16_t x_u = x_bits.uintval(); + uint16_t x_abs = x_u & 0x7fffU; + + // When |x| = 0, or -2^(-14) <= x <= -2^(-9), or |x| >= asinh(2^16), or x is + // NaN. 
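+  // (0x49e5 encodes 11.7890625, the smallest float16 above asinh(2^16) ~=
+  // 11.7835: beyond it |sinh(x)| ~= e^|x| / 2 no longer fits in float16, so
+  // only the overflow handling below can be reached.)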
+  if (LIBC_UNLIKELY(x_abs == 0U || (x_u >= 0x8400U && x_u <= 0xa400U) ||
+                    x_abs >= 0x49e5U)) {
+    // sinh(NaN) = NaN
+    if (x_bits.is_nan()) {
+      if (x_bits.is_signaling_nan()) {
+        fputil::raise_except_if_required(FE_INVALID);
+        return FPBits::quiet_nan().get_val();
+      }
+
+      return x;
+    }
+
+    // sinh(+/-0) = +/-0
+    if (x_abs == 0U)
+      return FPBits::zero(x_bits.sign()).get_val();
+
+    // When |x| >= asinh(2^16).
+    if (x_abs >= 0x49e5U) {
+      // sinh(+/-inf) = +/-inf
+      if (x_bits.is_inf())
+        return FPBits::inf(x_bits.sign()).get_val();
+
+      int rounding_mode = fputil::quick_get_round();
+      if (rounding_mode == FE_TONEAREST ||
+          (x_bits.is_pos() && rounding_mode == FE_UPWARD) ||
+          (x_bits.is_neg() && rounding_mode == FE_DOWNWARD)) {
+        fputil::set_errno_if_required(ERANGE);
+        fputil::raise_except_if_required(FE_OVERFLOW | FE_INEXACT);
+        return FPBits::inf(x_bits.sign()).get_val();
+      }
+      return FPBits::max_normal(x_bits.sign()).get_val();
+    }
+
+    // When -2^(-14) <= x <= -2^(-9).
+    if (fputil::fenv_is_round_down())
+      return FPBits(static_cast<uint16_t>(x_u + 1)).get_val();
+    return FPBits(static_cast<uint16_t>(x_u)).get_val();
+  }
+
+  if (x_bits.is_pos()) {
+    if (auto r = SINHF16_EXCEPTS_POS.lookup(x_u); LIBC_UNLIKELY(r.has_value()))
+      return r.value();
+  } else {
+    if (auto r = SINHF16_EXCEPTS_NEG.lookup(x_u); LIBC_UNLIKELY(r.has_value()))
+      return r.value();
+  }
+
+  return eval_sinh_or_cosh</*IsSinh=*/true>(x);
+}
+
+} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/math/generic/sinpif16.cpp b/libc/src/math/generic/sinpif16.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..17cca583e0c0ec7e9a76d72d72cd8aee2e89c9b8
--- /dev/null
+++ b/libc/src/math/generic/sinpif16.cpp
@@ -0,0 +1,136 @@
+//===-- Half-precision sinpif function ------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/sinpif16.h"
+#include "src/__support/FPUtil/FEnvImpl.h"
+#include "src/__support/FPUtil/FPBits.h"
+#include "src/__support/FPUtil/PolyEval.h"
+#include "src/__support/FPUtil/cast.h"
+#include "src/__support/FPUtil/multiply_add.h"
+#include "src/__support/FPUtil/nearest_integer.h"
+#include "src/__support/common.h"
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+// Lookup table for sin(k * pi / 32) with k = 0, ..., 63.
+// Table is generated with Sollya as follows:
+// > display = hexadecimal;
+// > for k from 0 to 63 do { round(sin(k * pi/32), SG, RN); };
+static constexpr float SIN_K_PI_OVER_32[64] = {
+    0x0.0p0,        0x1.917a6cp-4,  0x1.8f8b84p-3,  0x1.294062p-2,
+    0x1.87de2ap-2,  0x1.e2b5d4p-2,  0x1.1c73b4p-1,  0x1.44cf32p-1,
+    0x1.6a09e6p-1,  0x1.8bc806p-1,  0x1.a9b662p-1,  0x1.c38b3p-1,
+    0x1.d906bcp-1,  0x1.e9f416p-1,  0x1.f6297cp-1,  0x1.fd88dap-1,
+    0x1p0,          0x1.fd88dap-1,  0x1.f6297cp-1,  0x1.e9f416p-1,
+    0x1.d906bcp-1,  0x1.c38b3p-1,   0x1.a9b662p-1,  0x1.8bc806p-1,
+    0x1.6a09e6p-1,  0x1.44cf32p-1,  0x1.1c73b4p-1,  0x1.e2b5d4p-2,
+    0x1.87de2ap-2,  0x1.294062p-2,  0x1.8f8b84p-3,  0x1.917a6cp-4,
+    0x0.0p0,        -0x1.917a6cp-4, -0x1.8f8b84p-3, -0x1.294062p-2,
+    -0x1.87de2ap-2, -0x1.e2b5d4p-2, -0x1.1c73b4p-1, -0x1.44cf32p-1,
+    -0x1.6a09e6p-1, -0x1.8bc806p-1, -0x1.a9b662p-1, -0x1.c38b3p-1,
+    -0x1.d906bcp-1, -0x1.e9f416p-1, -0x1.f6297cp-1, -0x1.fd88dap-1,
+    -0x1p0,         -0x1.fd88dap-1, -0x1.f6297cp-1, -0x1.e9f416p-1,
+    -0x1.d906bcp-1, -0x1.c38b3p-1,  -0x1.a9b662p-1, -0x1.8bc806p-1,
+    -0x1.6a09e6p-1, -0x1.44cf32p-1, -0x1.1c73b4p-1, -0x1.e2b5d4p-2,
+    -0x1.87de2ap-2, -0x1.294062p-2, -0x1.8f8b84p-3, -0x1.917a6cp-4};
+
+static LIBC_INLINE int32_t range_reduction(float x, float &y) {
+  float kf = fputil::nearest_integer(x * 32);
+  y = fputil::multiply_add(x, 32.0, -kf);
+
+  return static_cast<int32_t>(kf);
+}
+
+LLVM_LIBC_FUNCTION(float16, sinpif16, (float16 x)) {
+  using FPBits = typename fputil::FPBits<float16>;
+  FPBits xbits(x);
+
+  uint16_t x_u = xbits.uintval();
+  uint16_t x_abs = x_u & 0x7fff;
+
+  // Range reduction:
+  // For |x| > 1/32, we perform range reduction as follows:
+  // Find k and y such that:
+  //   x = (k + y) * 1/32
+  //   k is an integer
+  //   |y| < 0.5
+  //
+  // This is done by performing:
+  //   k = round(x * 32)
+  //   y = x * 32 - k
+  //
+  // Once k and y are computed, we then deduce the answer by the sine of sum
+  // formula:
+  //   sin(x * pi) = sin((k + y) * pi/32)
+  //               = sin(k * pi/32) * cos(y * pi/32) +
+  //                 sin(y * pi/32) * cos(k * pi/32)
+  // The values of sin(k * pi/32) and cos(k * pi/32) for k = 0...63 are
+  // precomputed and stored using a vector of 64 single precision floats.
+  // sin(y * pi/32) and cos(y * pi/32) are computed using the degree-6 minimax
+  // polynomials below, generated by Sollya.
+
+  // For signed zeros
+  if (LIBC_UNLIKELY(x_abs == 0U))
+    return x;
+
+  // Numbers greater or equal to 2^10 are integers, or infinity, or NaN
+  if (LIBC_UNLIKELY(x_abs >= 0x6400)) {
+    // Check for NaN or infinity values
+    if (LIBC_UNLIKELY(x_abs >= 0x7c00)) {
+      // If value is equal to infinity
+      if (x_abs == 0x7c00) {
+        fputil::set_errno_if_required(EDOM);
+        fputil::raise_except_if_required(FE_INVALID);
+      }
+
+      return x + FPBits::quiet_nan().get_val();
+    }
+    return FPBits::zero(xbits.sign()).get_val();
+  }
+
+  float f32 = x;
+  float y;
+  int32_t k = range_reduction(f32, y);
+
+  float sin_k = SIN_K_PI_OVER_32[k & 63];
+  float cos_k = SIN_K_PI_OVER_32[(k + 16) & 63];
+
+  // Recall:
+  //   sin(x * pi) = sin((k + y) * pi/32)
+  //               = sin(y * pi/32) * cos(k * pi/32) +
+  //                 cos(y * pi/32) * sin(k * pi/32)
+  // Recall, after range reduction, -0.5 <= y <= 0.5. For very small values of
+  // y, calculating sin(y * pi/32) can be inaccurate. Generating a polynomial
+  // for sin(y * pi/32)/y instead significantly reduces the relative errors.
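+  //
+  // Worked example: x = 0.3 gives kf = round(9.6) = 10, y = -0.4, so
+  //   sin(0.3 * pi) = sin(10 * pi/32) * cos(-0.4 * pi/32) +
+  //                   sin(-0.4 * pi/32) * cos(10 * pi/32),
+  // with sin(10 * pi/32) = SIN_K_PI_OVER_32[10] and
+  // cos(10 * pi/32) = SIN_K_PI_OVER_32[26] taken from the table above.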
+  float ysq = y * y;
+
+  // Degree-6 minimax even polynomial for sin(y*pi/32)/y generated by Sollya
+  // with:
+  // > Q = fpminimax(sin(y*pi/32)/y, [|0, 2, 4, 6|], [|SG...|], [0, 0.5]);
+  float sin_y = y * fputil::polyeval(ysq, 0x1.921fb6p-4f, -0x1.4aeabcp-13f,
+                                     0x1.a03354p-21f, -0x1.ad02d2p-20f);
+
+  // Note that cosm1_y = cos(y*pi/32) - 1 = cos_y - 1
+  // Derivation:
+  //   sin(x * pi) = sin((k + y) * pi/32)
+  //               = sin_y * cos_k + cos_y * sin_k
+  //               = cos_k * sin_y + sin_k * (1 + cos_y - 1)
+  // Degree-6 minimax even polynomial for cos(y*pi/32) generated by Sollya
+  // with:
+  // > P = fpminimax(cos(y*pi/32), [|0, 2, 4, 6|], [|1, SG...|], [0, 0.5]);
+  float cosm1_y = ysq * fputil::polyeval(ysq, -0x1.3bd3ccp-8f, 0x1.03a61ap-18f,
+                                         0x1.a6f7a2p-29f);
+
+  if (LIBC_UNLIKELY(sin_y == 0 && sin_k == 0))
+    return FPBits::zero(xbits.sign()).get_val();
+
+  // Since cosm1_y = cos_y - 1, we have:
+  //   sin(x * pi) = cos_k * sin_y + sin_k + (cosm1_y * sin_k)
+  return fputil::cast<float16>(fputil::multiply_add(
+      sin_y, cos_k, fputil::multiply_add(cosm1_y, sin_k, sin_k)));
+}
+} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/math/sinhf16.h b/libc/src/math/sinhf16.h
new file mode 100644
index 0000000000000000000000000000000000000000..8b8c1b64e7ec8d652ce3f32e73ee38f123fb483f
--- /dev/null
+++ b/libc/src/math/sinhf16.h
@@ -0,0 +1,21 @@
+//===-- Implementation header for sinhf16 -----------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_MATH_SINHF16_H
+#define LLVM_LIBC_SRC_MATH_SINHF16_H
+
+#include "src/__support/macros/config.h"
+#include "src/__support/macros/properties/types.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+float16 sinhf16(float16 x);
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC_MATH_SINHF16_H
diff --git a/libc/src/math/sinpif16.h b/libc/src/math/sinpif16.h
new file mode 100644
index 0000000000000000000000000000000000000000..33a0ae265840128f00b41398238421340003b6d7
--- /dev/null
+++ b/libc/src/math/sinpif16.h
@@ -0,0 +1,21 @@
+//===-- Implementation header for sinpif16 ---------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===---------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_MATH_SINPIF16_H
+#define LLVM_LIBC_SRC_MATH_SINPIF16_H
+
+#include "src/__support/macros/config.h"
+#include "src/__support/macros/properties/types.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+float16 sinpif16(float16 x);
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC_MATH_SINPIF16_H
diff --git a/libc/src/setjmp/longjmp.h b/libc/src/setjmp/longjmp.h
index 7cb12b3392ae16d3bec05d1788f997e583c4d879..9b7db29717216bb9491bb4d1ed3ff5309de7ef35 100644
--- a/libc/src/setjmp/longjmp.h
+++ b/libc/src/setjmp/longjmp.h
@@ -11,9 +11,22 @@
 #include "hdr/types/jmp_buf.h"
 #include "src/__support/macros/config.h"
+#include "src/__support/macros/properties/compiler.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
+// TODO(https://github.com/llvm/llvm-project/issues/112427)
+// Some of the architecture-specific definitions are marked `naked`, which in
+// GCC implies `nothrow`.
+// +// Right now, our aliases aren't marked `nothrow`, so we wind up in a situation +// where clang will emit -Wmissing-exception-spec if we add `nothrow` here, but +// GCC will emit -Wmissing-attributes here without `nothrow`. We need to update +// LLVM_LIBC_FUNCTION to denote when a function throws or not. + +#ifdef LIBC_COMPILER_IS_GCC +[[gnu::nothrow]] +#endif void longjmp(jmp_buf buf, int val); } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/setjmp/setjmp_impl.h b/libc/src/setjmp/setjmp_impl.h index 4175a7397ae18748180877864aa6ebfebbe8b41d..d035409e5819545c110748d5acfde60daf4be03b 100644 --- a/libc/src/setjmp/setjmp_impl.h +++ b/libc/src/setjmp/setjmp_impl.h @@ -13,9 +13,22 @@ // public header setjmp.h which is also included. here. #include "hdr/types/jmp_buf.h" #include "src/__support/macros/config.h" +#include "src/__support/macros/properties/compiler.h" namespace LIBC_NAMESPACE_DECL { +// TODO(https://github.com/llvm/llvm-project/issues/112427) +// Some of the architecture-specific definitions are marked `naked`, which in +// GCC implies `nothrow`. +// +// Right now, our aliases aren't marked `nothrow`, so we wind up in a situation +// where clang will emit -Wmissing-exception-spec if we add `nothrow` here, but +// GCC will emit -Wmissing-attributes here without `nothrow`. We need to update +// LLVM_LIBC_FUNCTION to denote when a function throws or not. + +#ifdef LIBC_COMPILER_IS_GCC +[[gnu::nothrow]] +#endif int setjmp(jmp_buf buf); } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/setjmp/x86_64/CMakeLists.txt b/libc/src/setjmp/x86_64/CMakeLists.txt index c789e5def7fe79ced4d7068cd1894f7dd74281b4..b5b0d9ba65599c6450c5c611ba6cea4e169c69bd 100644 --- a/libc/src/setjmp/x86_64/CMakeLists.txt +++ b/libc/src/setjmp/x86_64/CMakeLists.txt @@ -8,12 +8,6 @@ add_entrypoint_object( libc.hdr.types.jmp_buf COMPILE_OPTIONS -O3 - -fno-omit-frame-pointer - # TODO: Remove once one of these lands: - # https://github.com/llvm/llvm-project/pull/87837 - # https://github.com/llvm/llvm-project/pull/88054 - # https://github.com/llvm/llvm-project/pull/88157 - -ftrivial-auto-var-init=uninitialized ) add_entrypoint_object( diff --git a/libc/src/setjmp/x86_64/longjmp.cpp b/libc/src/setjmp/x86_64/longjmp.cpp index d4b55565cb21877913f75a33c92c95d1550b674b..c293c55a6f9fb2bec416f4c238e57be2d57825aa 100644 --- a/libc/src/setjmp/x86_64/longjmp.cpp +++ b/libc/src/setjmp/x86_64/longjmp.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "src/setjmp/longjmp.h" +#include "include/llvm-libc-macros/offsetof-macro.h" #include "src/__support/common.h" #include "src/__support/macros/config.h" @@ -16,30 +17,26 @@ namespace LIBC_NAMESPACE_DECL { -LLVM_LIBC_FUNCTION(void, longjmp, (jmp_buf buf, int val)) { - register __UINT64_TYPE__ rbx __asm__("rbx"); - register __UINT64_TYPE__ rbp __asm__("rbp"); - register __UINT64_TYPE__ r12 __asm__("r12"); - register __UINT64_TYPE__ r13 __asm__("r13"); - register __UINT64_TYPE__ r14 __asm__("r14"); - register __UINT64_TYPE__ r15 __asm__("r15"); - register __UINT64_TYPE__ rsp __asm__("rsp"); - register __UINT64_TYPE__ rax __asm__("rax"); +[[gnu::naked]] +LLVM_LIBC_FUNCTION(void, longjmp, (jmp_buf, int)) { + asm(R"( + cmpl $0x1, %%esi + adcl $0x0, %%esi + movq %%rsi, %%rax - // ABI requires that the return value should be stored in rax. So, we store - // |val| in rax. Note that this has to happen before we restore the registers - // from values in |buf|. 
Otherwise, once rsp and rbp are updated, we cannot - // read |val|. - val = val == 0 ? 1 : val; - LIBC_INLINE_ASM("mov %1, %0\n\t" : "=r"(rax) : "m"(val) :); - LIBC_INLINE_ASM("mov %1, %0\n\t" : "=r"(rbx) : "m"(buf->rbx) :); - LIBC_INLINE_ASM("mov %1, %0\n\t" : "=r"(rbp) : "m"(buf->rbp) :); - LIBC_INLINE_ASM("mov %1, %0\n\t" : "=r"(r12) : "m"(buf->r12) :); - LIBC_INLINE_ASM("mov %1, %0\n\t" : "=r"(r13) : "m"(buf->r13) :); - LIBC_INLINE_ASM("mov %1, %0\n\t" : "=r"(r14) : "m"(buf->r14) :); - LIBC_INLINE_ASM("mov %1, %0\n\t" : "=r"(r15) : "m"(buf->r15) :); - LIBC_INLINE_ASM("mov %1, %0\n\t" : "=r"(rsp) : "m"(buf->rsp) :); - LIBC_INLINE_ASM("jmp *%0\n\t" : : "m"(buf->rip)); + movq %c[rbx](%%rdi), %%rbx + movq %c[rbp](%%rdi), %%rbp + movq %c[r12](%%rdi), %%r12 + movq %c[r13](%%rdi), %%r13 + movq %c[r14](%%rdi), %%r14 + movq %c[r15](%%rdi), %%r15 + movq %c[rsp](%%rdi), %%rsp + jmpq *%c[rip](%%rdi) + )" ::[rbx] "i"(offsetof(__jmp_buf, rbx)), + [rbp] "i"(offsetof(__jmp_buf, rbp)), [r12] "i"(offsetof(__jmp_buf, r12)), + [r13] "i"(offsetof(__jmp_buf, r13)), [r14] "i"(offsetof(__jmp_buf, r14)), + [r15] "i"(offsetof(__jmp_buf, r15)), [rsp] "i"(offsetof(__jmp_buf, rsp)), + [rip] "i"(offsetof(__jmp_buf, rip))); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/setjmp/x86_64/setjmp.cpp b/libc/src/setjmp/x86_64/setjmp.cpp index 62d9c13c68e4b6682fe02591b08c5c7c07c6e7cc..f6e82642edd7da68d2eec1a9dda83ef59d496eca 100644 --- a/libc/src/setjmp/x86_64/setjmp.cpp +++ b/libc/src/setjmp/x86_64/setjmp.cpp @@ -6,6 +6,7 @@ // //===----------------------------------------------------------------------===// +#include "include/llvm-libc-macros/offsetof-macro.h" #include "src/__support/common.h" #include "src/__support/macros/config.h" #include "src/setjmp/setjmp_impl.h" @@ -16,42 +17,29 @@ namespace LIBC_NAMESPACE_DECL { +[[gnu::naked]] LLVM_LIBC_FUNCTION(int, setjmp, (jmp_buf buf)) { - register __UINT64_TYPE__ rbx __asm__("rbx"); - register __UINT64_TYPE__ r12 __asm__("r12"); - register __UINT64_TYPE__ r13 __asm__("r13"); - register __UINT64_TYPE__ r14 __asm__("r14"); - register __UINT64_TYPE__ r15 __asm__("r15"); - - // We want to store the register values as is. So, we will suppress the - // compiler warnings about the uninitialized variables declared above. -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wuninitialized" - LIBC_INLINE_ASM("mov %1, %0\n\t" : "=m"(buf->rbx) : "r"(rbx) :); - LIBC_INLINE_ASM("mov %1, %0\n\t" : "=m"(buf->r12) : "r"(r12) :); - LIBC_INLINE_ASM("mov %1, %0\n\t" : "=m"(buf->r13) : "r"(r13) :); - LIBC_INLINE_ASM("mov %1, %0\n\t" : "=m"(buf->r14) : "r"(r14) :); - LIBC_INLINE_ASM("mov %1, %0\n\t" : "=m"(buf->r15) : "r"(r15) :); -#pragma GCC diagnostic pop - - // We want the rbp of the caller, which is what __builtin_frame_address(1) - // should return. But, compilers generate a warning that calling - // __builtin_frame_address with non-zero argument is unsafe. So, we use - // the knowledge of the x86_64 ABI to fetch the callers rbp. As per the ABI, - // the rbp of the caller is pushed on to the stack and then new top is saved - // in this function's rbp. So, we fetch it from location at which this - // functions's rbp is pointing. 
-  buf->rbp = *reinterpret_cast<__UINTPTR_TYPE__ *>(__builtin_frame_address(0));
-
-  // The callers stack address is exactly 2 pointer widths ahead of the current
-  // frame pointer - between the current frame pointer and the rsp of the caller
-  // are the return address (pushed by the x86_64 call instruction) and the
-  // previous stack pointer as required by the x86_64 ABI.
-  // The stack pointer is ahead because the stack grows down on x86_64.
-  buf->rsp = reinterpret_cast<__UINTPTR_TYPE__>(__builtin_frame_address(0)) +
-             sizeof(__UINTPTR_TYPE__) * 2;
-  buf->rip = reinterpret_cast<__UINTPTR_TYPE__>(__builtin_return_address(0));
-  return 0;
+  asm(R"(
+      mov %%rbx, %c[rbx](%%rdi)
+      mov %%rbp, %c[rbp](%%rdi)
+      mov %%r12, %c[r12](%%rdi)
+      mov %%r13, %c[r13](%%rdi)
+      mov %%r14, %c[r14](%%rdi)
+      mov %%r15, %c[r15](%%rdi)
+
+      lea 8(%%rsp), %%rax
+      mov %%rax, %c[rsp](%%rdi)
+
+      mov (%%rsp), %%rax
+      mov %%rax, %c[rip](%%rdi)
+
+      xorl %%eax, %%eax
+      retq)" ::[rbx] "i"(offsetof(__jmp_buf, rbx)),
+      [rbp] "i"(offsetof(__jmp_buf, rbp)), [r12] "i"(offsetof(__jmp_buf, r12)),
+      [r13] "i"(offsetof(__jmp_buf, r13)), [r14] "i"(offsetof(__jmp_buf, r14)),
+      [r15] "i"(offsetof(__jmp_buf, r15)), [rsp] "i"(offsetof(__jmp_buf, rsp)),
+      [rip] "i"(offsetof(__jmp_buf, rip))
+      : "rax");
 }
 
 } // namespace LIBC_NAMESPACE_DECL
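A note on the two naked routines above: in longjmp, the cmpl/adcl pair is a branchless form of "if (val == 0) val = 1;", since cmpl $1 sets the carry flag exactly when %esi is zero and adcl $0 adds that carry back. The following self-contained snippet demonstrates the guarantee this preserves; it is plain standard C++, independent of this patch.

#include <csetjmp>
#include <cstdio>

static std::jmp_buf buf;

int main() {
  // ISO C requires longjmp(buf, 0) to make setjmp return 1, never 0, which
  // is exactly the coercion the cmpl/adcl pair implements.
  if (setjmp(buf) == 1) { // taken only on the return through longjmp
    std::puts("setjmp returned 1 even though longjmp was passed 0");
    return 0;
  }
  std::longjmp(buf, 0); // the zero is coerced to 1
}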
diff --git a/libc/src/stdio/gpu/clearerr.cpp b/libc/src/stdio/gpu/clearerr.cpp
index 5826a7bcb95fb74acda97175489442305c9e78a4..4c631b9f946f3fa41f9cdcd1a306a7560a9147ef 100644
--- a/libc/src/stdio/gpu/clearerr.cpp
+++ b/libc/src/stdio/gpu/clearerr.cpp
@@ -17,8 +17,10 @@ namespace LIBC_NAMESPACE_DECL {
 LLVM_LIBC_FUNCTION(void, clearerr, (::FILE * stream)) {
   rpc::Client::Port port = rpc::client.open();
   port.send_and_recv(
-      [=](rpc::Buffer *buffer) { buffer->data[0] = file::from_stream(stream); },
-      [&](rpc::Buffer *) {});
+      [=](rpc::Buffer *buffer, uint32_t) {
+        buffer->data[0] = file::from_stream(stream);
+      },
+      [&](rpc::Buffer *, uint32_t) {});
   port.close();
 }
diff --git a/libc/src/stdio/gpu/fclose.cpp b/libc/src/stdio/gpu/fclose.cpp
index 78caccd90c6931119078209bcc94b371d79510ea..683e0548495d1666c1b00efc2128a0ed257f114c 100644
--- a/libc/src/stdio/gpu/fclose.cpp
+++ b/libc/src/stdio/gpu/fclose.cpp
@@ -19,8 +19,9 @@ LLVM_LIBC_FUNCTION(int, fclose, (::FILE * stream)) {
   uint64_t ret = 0;
   uintptr_t file = reinterpret_cast<uintptr_t>(stream);
   rpc::Client::Port port = rpc::client.open();
-  port.send_and_recv([=](rpc::Buffer *buffer) { buffer->data[0] = file; },
-                     [&](rpc::Buffer *buffer) { ret = buffer->data[0]; });
+  port.send_and_recv(
+      [=](rpc::Buffer *buffer, uint32_t) { buffer->data[0] = file; },
+      [&](rpc::Buffer *buffer, uint32_t) { ret = buffer->data[0]; });
   port.close();
 
   if (ret != 0)
diff --git a/libc/src/stdio/gpu/feof.cpp b/libc/src/stdio/gpu/feof.cpp
index 4a8a17332a0a9215bd8434a8de9c20192d80aa8d..02adb4ce73d681717d86b3185024691c83474ff6 100644
--- a/libc/src/stdio/gpu/feof.cpp
+++ b/libc/src/stdio/gpu/feof.cpp
@@ -18,8 +18,12 @@ LLVM_LIBC_FUNCTION(int, feof, (::FILE * stream)) {
   int ret;
   rpc::Client::Port port = rpc::client.open();
   port.send_and_recv(
-      [=](rpc::Buffer *buffer) { buffer->data[0] = file::from_stream(stream); },
-      [&](rpc::Buffer *buffer) { ret = static_cast<int>(buffer->data[0]); });
+      [=](rpc::Buffer *buffer, uint32_t) {
+        buffer->data[0] = file::from_stream(stream);
+      },
+      [&](rpc::Buffer *buffer, uint32_t) {
+        ret = static_cast<int>(buffer->data[0]);
+      });
   port.close();
   return ret;
 }
diff --git a/libc/src/stdio/gpu/ferror.cpp b/libc/src/stdio/gpu/ferror.cpp
index 1cee96f5ef23b32e5a15f6885de60809b0d3bc7c..ca777131fd1b3e0983e93d749e2393906dd7ee06 100644
--- a/libc/src/stdio/gpu/ferror.cpp
+++ b/libc/src/stdio/gpu/ferror.cpp
@@ -18,8 +18,12 @@ LLVM_LIBC_FUNCTION(int, ferror, (::FILE * stream)) {
   int ret;
   rpc::Client::Port port = rpc::client.open();
   port.send_and_recv(
-      [=](rpc::Buffer *buffer) { buffer->data[0] = file::from_stream(stream); },
-      [&](rpc::Buffer *buffer) { ret = static_cast<int>(buffer->data[0]); });
+      [=](rpc::Buffer *buffer, uint32_t) {
+        buffer->data[0] = file::from_stream(stream);
+      },
+      [&](rpc::Buffer *buffer, uint32_t) {
+        ret = static_cast<int>(buffer->data[0]);
+      });
   port.close();
   return ret;
 }
diff --git a/libc/src/stdio/gpu/fflush.cpp b/libc/src/stdio/gpu/fflush.cpp
index be267a2e9ce129558244465dba19eee82c3d3baa..577325b70c4e706a7a0da3a0d9cd62d433efc692 100644
--- a/libc/src/stdio/gpu/fflush.cpp
+++ b/libc/src/stdio/gpu/fflush.cpp
@@ -18,8 +18,12 @@ LLVM_LIBC_FUNCTION(int, fflush, (::FILE * stream)) {
   int ret;
   rpc::Client::Port port = rpc::client.open();
   port.send_and_recv(
-      [=](rpc::Buffer *buffer) { buffer->data[0] = file::from_stream(stream); },
-      [&](rpc::Buffer *buffer) { ret = static_cast<int>(buffer->data[0]); });
+      [=](rpc::Buffer *buffer, uint32_t) {
+        buffer->data[0] = file::from_stream(stream);
+      },
+      [&](rpc::Buffer *buffer, uint32_t) {
+        ret = static_cast<int>(buffer->data[0]);
+      });
   port.close();
   return ret;
 }
diff --git a/libc/src/stdio/gpu/fgets.cpp b/libc/src/stdio/gpu/fgets.cpp
index 942f6f0ff03bc1ad3f40f637a90890e2f878d267..fbc1b0cf7d1a87802f69f5ae7b66426c8e657c6c 100644
--- a/libc/src/stdio/gpu/fgets.cpp
+++ b/libc/src/stdio/gpu/fgets.cpp
@@ -27,7 +27,7 @@ LLVM_LIBC_FUNCTION(char *, fgets,
   uint64_t recv_size;
   void *buf = nullptr;
   rpc::Client::Port port = rpc::client.open();
-  port.send([=](rpc::Buffer *buffer) {
+  port.send([=](rpc::Buffer *buffer, uint32_t) {
     buffer->data[0] = count;
     buffer->data[1] = file::from_stream(stream);
   });
diff --git a/libc/src/stdio/gpu/file.h b/libc/src/stdio/gpu/file.h
index 0856a3430803ae78627a7afb79b473cd52bc309c..16d64e8f37750182420e5ad3a84bfb326c7bc90a 100644
--- a/libc/src/stdio/gpu/file.h
+++ b/libc/src/stdio/gpu/file.h
@@ -55,13 +55,13 @@ LIBC_INLINE uint64_t write_impl(::FILE *file, const void *data, size_t size) {
   rpc::Client::Port port = rpc::client.open();
 
   if constexpr (opcode == RPC_WRITE_TO_STREAM) {
-    port.send([&](rpc::Buffer *buffer) {
+    port.send([&](rpc::Buffer *buffer, uint32_t) {
       buffer->data[0] = reinterpret_cast<uintptr_t>(file);
     });
   }
 
   port.send_n(data, size);
-  port.recv([&](rpc::Buffer *buffer) {
+  port.recv([&](rpc::Buffer *buffer, uint32_t) {
     ret = reinterpret_cast<uint64_t *>(buffer->data)[0];
   });
   port.close();
@@ -81,12 +81,12 @@ LIBC_INLINE uint64_t read_from_stream(::FILE *file, void *buf, size_t size) {
   uint64_t ret = 0;
   uint64_t recv_size;
   rpc::Client::Port port = rpc::client.open();
-  port.send([=](rpc::Buffer *buffer) {
+  port.send([=](rpc::Buffer *buffer, uint32_t) {
     buffer->data[0] = size;
    buffer->data[1] = from_stream(file);
   });
   port.recv_n(&buf, &recv_size, [&](uint64_t) { return buf; });
-  port.recv([&](rpc::Buffer *buffer) { ret = buffer->data[0]; });
+  port.recv([&](rpc::Buffer *buffer, uint32_t) { ret = buffer->data[0]; });
   port.close();
   return ret;
 }
diff --git a/libc/src/stdio/gpu/fopen.cpp b/libc/src/stdio/gpu/fopen.cpp
index 76daece68ac9d8936abf13180e120a326041eb0c..e165d2acd2109a3acb1f693672e065d4dd6fff20 100644
--- a/libc/src/stdio/gpu/fopen.cpp
+++ b/libc/src/stdio/gpu/fopen.cpp
@@ -21,10 +21,10 @@ LLVM_LIBC_FUNCTION(::FILE *, fopen,
   rpc::Client::Port port = rpc::client.open();
   port.send_n(path, internal::string_length(path) + 1);
   port.send_and_recv(
-      [=](rpc::Buffer *buffer) {
+      [=](rpc::Buffer *buffer, uint32_t) {
         inline_memcpy(buffer->data, mode, internal::string_length(mode) + 1);
       },
-      [&](rpc::Buffer *buffer) { file = buffer->data[0]; });
+      [&](rpc::Buffer *buffer, uint32_t) { file = buffer->data[0]; });
   port.close();
 
   return reinterpret_cast<FILE *>(file);
diff --git a/libc/src/stdio/gpu/fseek.cpp b/libc/src/stdio/gpu/fseek.cpp
index 4f3e9ce6ec024d4e76323b25f75c29befe134a94..37c40bc602d87ef59c4d22729766eb9895997300 100644
--- a/libc/src/stdio/gpu/fseek.cpp
+++ b/libc/src/stdio/gpu/fseek.cpp
@@ -18,12 +18,14 @@ LLVM_LIBC_FUNCTION(int, fseek, (::FILE * stream, long offset, int whence)) {
   int ret;
   rpc::Client::Port port = rpc::client.open();
   port.send_and_recv(
-      [=](rpc::Buffer *buffer) {
+      [=](rpc::Buffer *buffer, uint32_t) {
         buffer->data[0] = file::from_stream(stream);
         buffer->data[1] = static_cast<uint64_t>(offset);
         buffer->data[2] = static_cast<uint64_t>(whence);
       },
-      [&](rpc::Buffer *buffer) { ret = static_cast<int>(buffer->data[0]); });
+      [&](rpc::Buffer *buffer, uint32_t) {
+        ret = static_cast<int>(buffer->data[0]);
+      });
   port.close();
   return ret;
 }
diff --git a/libc/src/stdio/gpu/ftell.cpp b/libc/src/stdio/gpu/ftell.cpp
index 483b1ad4fee0fe62ee4ee58d7466786d174988e1..226aeda2f8dedc3819769db2f45d2e21b488a660 100644
--- a/libc/src/stdio/gpu/ftell.cpp
+++ b/libc/src/stdio/gpu/ftell.cpp
@@ -18,8 +18,12 @@ LLVM_LIBC_FUNCTION(long, ftell, (::FILE * stream)) {
   long ret;
   rpc::Client::Port port = rpc::client.open();
   port.send_and_recv(
-      [=](rpc::Buffer *buffer) { buffer->data[0] = file::from_stream(stream); },
-      [&](rpc::Buffer *buffer) { ret = static_cast<long>(buffer->data[0]); });
+      [=](rpc::Buffer *buffer, uint32_t) {
+        buffer->data[0] = file::from_stream(stream);
+      },
+      [&](rpc::Buffer *buffer, uint32_t) {
+        ret = static_cast<long>(buffer->data[0]);
+      });
   port.close();
   return ret;
 }
diff --git a/libc/src/stdio/gpu/remove.cpp b/libc/src/stdio/gpu/remove.cpp
index 3f21e8aeff5aedc0f00ab286d04f4c6113a26db7..6604be1c31f2b07fc6d55c4596ffb6a55d01ea6e 100644
--- a/libc/src/stdio/gpu/remove.cpp
+++ b/libc/src/stdio/gpu/remove.cpp
@@ -18,8 +18,9 @@ LLVM_LIBC_FUNCTION(int, remove, (const char *path)) {
   int ret;
   rpc::Client::Port port = rpc::client.open();
   port.send_n(path, internal::string_length(path) + 1);
-  port.recv(
-      [&](rpc::Buffer *buffer) { ret = static_cast<int>(buffer->data[0]); });
+  port.recv([&](rpc::Buffer *buffer, uint32_t) {
+    ret = static_cast<int>(buffer->data[0]);
+  });
   port.close();
   return ret;
 }
diff --git a/libc/src/stdio/gpu/rename.cpp b/libc/src/stdio/gpu/rename.cpp
index 1087228835842ebf89f254e4fbb0c7fa38448ee2..e6396e212b8b5c1d3cb95dcc8ad35d5e6eb585c1 100644
--- a/libc/src/stdio/gpu/rename.cpp
+++ b/libc/src/stdio/gpu/rename.cpp
@@ -20,8 +20,9 @@ LLVM_LIBC_FUNCTION(int, rename, (const char *oldpath, const char *newpath)) {
   rpc::Client::Port port = rpc::client.open();
   port.send_n(oldpath, internal::string_length(oldpath) + 1);
   port.send_n(newpath, internal::string_length(newpath) + 1);
-  port.recv(
-      [&](rpc::Buffer *buffer) { ret = static_cast<int>(buffer->data[0]); });
+  port.recv([&](rpc::Buffer *buffer, uint32_t) {
+    ret = static_cast<int>(buffer->data[0]);
+  });
   port.close();
 
   return ret;
diff --git a/libc/src/stdio/gpu/ungetc.cpp b/libc/src/stdio/gpu/ungetc.cpp
index e9232a5e43a2704703b5fac9d0272fcbcf7698b0..dce14391b7de4ab9c90b63ac6773214a7b128062 100644
--- a/libc/src/stdio/gpu/ungetc.cpp
+++ b/libc/src/stdio/gpu/ungetc.cpp
@@ -18,11 +18,13 @@ LLVM_LIBC_FUNCTION(int, ungetc, (int c, ::FILE *stream)) {
   int ret;
   rpc::Client::Port port = rpc::client.open();
   port.send_and_recv(
-      [=](rpc::Buffer *buffer) {
+      [=](rpc::Buffer *buffer, uint32_t) {
         buffer->data[0] = c;
         buffer->data[1] = file::from_stream(stream);
       },
-      [&](rpc::Buffer *buffer) { ret = static_cast<int>(buffer->data[0]); });
+      [&](rpc::Buffer *buffer, uint32_t) {
+        ret = static_cast<int>(buffer->data[0]);
+      });
   port.close();
   return ret;
 }
diff --git a/libc/src/stdio/gpu/vfprintf_utils.h b/libc/src/stdio/gpu/vfprintf_utils.h
index 7c012d139ba5dcb646746f56e7fa3d3f35248617..93ce1649869fc10e3770e430894a815b2d2794bc 100644
--- a/libc/src/stdio/gpu/vfprintf_utils.h
+++ b/libc/src/stdio/gpu/vfprintf_utils.h
@@ -23,14 +23,14 @@ LIBC_INLINE int vfprintf_impl(::FILE *__restrict file,
 
   if constexpr (opcode == RPC_PRINTF_TO_STREAM ||
                 opcode == RPC_PRINTF_TO_STREAM_PACKED) {
-    port.send([&](rpc::Buffer *buffer) {
+    port.send([&](rpc::Buffer *buffer, uint32_t) {
       buffer->data[0] = reinterpret_cast<uintptr_t>(file);
     });
   }
 
   size_t args_size = 0;
   port.send_n(format, format_size);
-  port.recv([&](rpc::Buffer *buffer) {
+  port.recv([&](rpc::Buffer *buffer, uint32_t) {
     args_size = static_cast<size_t>(buffer->data[0]);
   });
   port.send_n(vlist, args_size);
@@ -38,7 +38,7 @@ LIBC_INLINE int vfprintf_impl(::FILE *__restrict file,
   uint32_t ret = 0;
   for (;;) {
     const char *str = nullptr;
-    port.recv([&](rpc::Buffer *buffer) {
+    port.recv([&](rpc::Buffer *buffer, uint32_t) {
       ret = static_cast<uint32_t>(buffer->data[0]);
       str = reinterpret_cast<const char *>(buffer->data[1]);
     });
diff --git a/libc/src/stdlib/gpu/abort.cpp b/libc/src/stdlib/gpu/abort.cpp
index fee198607cc029103fba4a6bc66d1803a4d2b211..cfc7e9b8e228bad1b77f98c4adcc72dfe77327c6 100644
--- a/libc/src/stdlib/gpu/abort.cpp
+++ b/libc/src/stdlib/gpu/abort.cpp
@@ -17,8 +17,9 @@ namespace LIBC_NAMESPACE_DECL {
 LLVM_LIBC_FUNCTION(void, abort, ()) {
   // We want to first make sure the server is listening before we abort.
   rpc::Client::Port port = rpc::client.open();
-  port.send_and_recv([](rpc::Buffer *) {}, [](rpc::Buffer *) {});
-  port.send([&](rpc::Buffer *) {});
+  port.send_and_recv([](rpc::Buffer *, uint32_t) {},
+                     [](rpc::Buffer *, uint32_t) {});
+  port.send([&](rpc::Buffer *, uint32_t) {});
   port.close();
 
   gpu::end_program();
diff --git a/libc/src/stdlib/gpu/system.cpp b/libc/src/stdlib/gpu/system.cpp
index acf3a8c941ffa96e2d7da403b65c15134e6808d0..1890006512de4f4c82803a2f802e52ec3ab25020 100644
--- a/libc/src/stdlib/gpu/system.cpp
+++ b/libc/src/stdlib/gpu/system.cpp
@@ -19,8 +19,9 @@ LLVM_LIBC_FUNCTION(int, system, (const char *command)) {
   int ret;
   rpc::Client::Port port = rpc::client.open();
   port.send_n(command, internal::string_length(command) + 1);
-  port.recv(
-      [&](rpc::Buffer *buffer) { ret = static_cast<int>(buffer->data[0]); });
+  port.recv([&](rpc::Buffer *buffer, uint32_t) {
+    ret = static_cast<int>(buffer->data[0]);
+  });
   port.close();
 
   return ret;
diff --git a/libc/test/integration/startup/gpu/rpc_interface_test.cpp b/libc/test/integration/startup/gpu/rpc_interface_test.cpp
index 674e2cc1ed7499f93271b8afbd81ea7e448f88d8..2dafa911783ffc43bfd9928e590630c8e61664e4 100644
--- a/libc/test/integration/startup/gpu/rpc_interface_test.cpp
+++ b/libc/test/integration/startup/gpu/rpc_interface_test.cpp
@@ -18,19 +18,26 @@ using namespace LIBC_NAMESPACE;
 static void test_interface(bool end_with_send) {
   uint64_t cnt = 0;
   rpc::Client::Port port = rpc::client.open();
-  port.send([&](rpc::Buffer *buffer) { buffer->data[0] = end_with_send; });
-  port.send([&](rpc::Buffer *buffer) { buffer->data[0] = cnt = cnt + 1; });
-  port.recv([&](rpc::Buffer *buffer) { cnt = buffer->data[0]; });
-  port.send([&](rpc::Buffer *buffer) { buffer->data[0] = cnt = cnt + 1; });
-  port.recv([&](rpc::Buffer *buffer) { cnt = buffer->data[0]; });
-  port.send([&](rpc::Buffer *buffer) { buffer->data[0] = cnt = cnt + 1; });
-  port.send([&](rpc::Buffer *buffer) { buffer->data[0] = cnt = cnt + 1; });
-  port.recv([&](rpc::Buffer *buffer) { cnt = buffer->data[0]; });
-  port.recv([&](rpc::Buffer *buffer) { cnt = buffer->data[0]; });
+  port.send(
+      [&](rpc::Buffer *buffer, uint32_t) { buffer->data[0] = end_with_send; });
+  port.send(
+      [&](rpc::Buffer *buffer, uint32_t) { buffer->data[0] = cnt = cnt + 1; });
+  port.recv([&](rpc::Buffer *buffer, uint32_t) { cnt = buffer->data[0]; });
+  port.send(
+      [&](rpc::Buffer *buffer, uint32_t) { buffer->data[0] = cnt = cnt + 1; });
+  port.recv([&](rpc::Buffer *buffer, uint32_t) { cnt = buffer->data[0]; });
+  port.send(
+      [&](rpc::Buffer *buffer, uint32_t) { buffer->data[0] = cnt = cnt + 1; });
+  port.send(
+      [&](rpc::Buffer *buffer, uint32_t) { buffer->data[0] = cnt = cnt + 1; });
+  port.recv([&](rpc::Buffer *buffer, uint32_t) { cnt = buffer->data[0]; });
+  port.recv([&](rpc::Buffer *buffer, uint32_t) { cnt = buffer->data[0]; });
   if (end_with_send)
-    port.send([&](rpc::Buffer *buffer) { buffer->data[0] = cnt = cnt + 1; });
+    port.send([&](rpc::Buffer *buffer, uint32_t) {
+      buffer->data[0] = cnt = cnt + 1;
+    });
   else
-    port.recv([&](rpc::Buffer *buffer) { cnt = buffer->data[0]; });
+    port.recv([&](rpc::Buffer *buffer, uint32_t) { cnt = buffer->data[0]; });
   port.close();
 
   ASSERT_TRUE(cnt == 9 && "Invalid number of increments");
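Before the next file, one pattern worth naming: every RPC port callback in this patch gains a second uint32_t parameter (a lane/slot index) next to rpc::Buffer *, and callbacks that do not need it simply leave the parameter unnamed. A minimal sketch of the shape of this change, using a hypothetical stand-in for the real rpc API (the names rpc_sketch, Buffer, and send below are not from the patch):

#include <cstdint>
#include <utility>

namespace rpc_sketch {
struct Buffer {
  uint64_t data[8]; // stand-in for rpc::Buffer
};

// Stand-in for port.send(): the runtime now hands every callback the
// buffer plus a uint32_t index.
template <typename F> void send(F &&fill) {
  Buffer slot{};
  uint32_t index = 0; // supplied per lane/slot by the real runtime
  std::forward<F>(fill)(&slot, index);
}
} // namespace rpc_sketch

void example() {
  rpc_sketch::send([](rpc_sketch::Buffer *buffer, uint32_t) {
    buffer->data[0] = 42; // body unchanged; only the signature grew
  });
}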
diff --git a/libc/test/integration/startup/gpu/rpc_test.cpp b/libc/test/integration/startup/gpu/rpc_test.cpp
index 4032d890c53ec89c47fa39560e84ae49f7f633a6..bec8171180a0550438b03ea22cc774da4224a02b 100644
--- a/libc/test/integration/startup/gpu/rpc_test.cpp
+++ b/libc/test/integration/startup/gpu/rpc_test.cpp
@@ -20,10 +20,10 @@ static void test_add_simple() {
   for (uint32_t i = 0; i < num_additions; ++i) {
     rpc::Client::Port port = rpc::client.open();
     port.send_and_recv(
-        [=](rpc::Buffer *buffer) {
+        [=](rpc::Buffer *buffer, uint32_t) {
           reinterpret_cast<uint64_t *>(buffer->data)[0] = cnt;
         },
-        [&](rpc::Buffer *buffer) {
+        [&](rpc::Buffer *buffer, uint32_t) {
           cnt = reinterpret_cast<uint64_t *>(buffer->data)[0];
         });
     port.close();
@@ -34,7 +34,7 @@ static void test_add_simple() {
 // Test to ensure that the RPC mechanism doesn't hang on divergence.
 static void test_noop(uint8_t data) {
   rpc::Client::Port port = rpc::client.open();
-  port.send([=](rpc::Buffer *buffer) { buffer->data[0] = data; });
+  port.send([=](rpc::Buffer *buffer, uint32_t) { buffer->data[0] = data; });
   port.close();
 }
diff --git a/libc/test/src/__support/big_int_test.cpp b/libc/test/src/__support/big_int_test.cpp
index a1ce69baaae2906d36d67cccaaccc1b55d1302b3..471ca72a8f6e0c8764049e12a8964beac4eec9c1 100644
--- a/libc/test/src/__support/big_int_test.cpp
+++ b/libc/test/src/__support/big_int_test.cpp
@@ -8,7 +8,7 @@
 #include "src/__support/CPP/optional.h"
 #include "src/__support/big_int.h"
-#include "src/__support/integer_literals.h" // parse_unsigned_bigint
+#include "src/__support/integer_literals.h"  // parse_unsigned_bigint
 #include "src/__support/macros/config.h"
 #include "src/__support/macros/properties/types.h" // LIBC_TYPES_HAS_INT128
@@ -208,6 +208,7 @@ TYPED_TEST(LlvmLibcUIntClassTest, CountBits, Types) {
 }
 
 using LL_UInt16 = UInt<16>;
+using LL_UInt32 = UInt<32>;
 using LL_UInt64 = UInt<64>;
 // We want to test UInt<128> explicitly. So, for
 // convenience, we use a sugar which does not conflict with the UInt128 type
@@ -927,4 +928,143 @@ TEST(LlvmLibcUIntClassTest, OtherWordTypeTests) {
   ASSERT_EQ(static_cast<int>(a >> 64), 1);
 }
 
+TEST(LlvmLibcUIntClassTest, OtherWordTypeCastTests) {
+  using LL_UInt96 = BigInt<96, false, uint32_t>;
+
+  LL_UInt96 a({123, 456, 789});
+
+  ASSERT_EQ(static_cast<int>(a), 123);
+  ASSERT_EQ(static_cast<int>(a >> 32), 456);
+  ASSERT_EQ(static_cast<int>(a >> 64), 789);
+
+  // Smaller word with less bits to bigger word with more bits.
+  LL_UInt128 b(a);
+
+  ASSERT_EQ(static_cast<int>(b), 123);
+  ASSERT_EQ(static_cast<int>(b >> 32), 456);
+  ASSERT_EQ(static_cast<int>(b >> 64), 789);
+  ASSERT_EQ(static_cast<int>(b >> 96), 0);
+
+  b = (b << 32) + 987;
+
+  ASSERT_EQ(static_cast<int>(b), 987);
+  ASSERT_EQ(static_cast<int>(b >> 32), 123);
+  ASSERT_EQ(static_cast<int>(b >> 64), 456);
+  ASSERT_EQ(static_cast<int>(b >> 96), 789);
+
+  // Bigger word with more bits to smaller word with less bits.
+  LL_UInt96 c(b);
+
+  ASSERT_EQ(static_cast<int>(c), 987);
+  ASSERT_EQ(static_cast<int>(c >> 32), 123);
+  ASSERT_EQ(static_cast<int>(c >> 64), 456);
+
+  // Smaller word with more bits to bigger word with less bits.
+  LL_UInt64 d(c);
+
+  ASSERT_EQ(static_cast<int>(d), 987);
+  ASSERT_EQ(static_cast<int>(d >> 32), 123);
+
+  // Bigger word with less bits to smaller word with more bits.
+  LL_UInt96 e(d);
+
+  ASSERT_EQ(static_cast<int>(e), 987);
+  ASSERT_EQ(static_cast<int>(e >> 32), 123);
+
+  e = (e << 32) + 654;
+
+  ASSERT_EQ(static_cast<int>(e), 654);
+  ASSERT_EQ(static_cast<int>(e >> 32), 987);
+  ASSERT_EQ(static_cast<int>(e >> 64), 123);
+}
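The signed test that follows relies on two's-complement arithmetic: negation is bitwise inversion plus one, and widening a negative value fills every new high word with copies of the sign bit. A quick standalone refresher on plain fixed-width integers (standard C++, separate from BigInt itself):

#include <cstdint>

// Negation in two's complement: flip the bits, add one. This is why the
// words of minus_a below are {~0x1234 + 1, ~0x5678, ~0x9ABC}: the +1
// carries only into the lowest word.
static_assert(-0x1234 == ~0x1234 + 1, "negate == invert, then add one");

// Widening a negative value sign-extends: the new high bits are all ones,
// matching the ~0 high words checked in SignedOtherWordTypeCastTests.
static_assert(static_cast<uint64_t>(static_cast<int64_t>(int32_t{-0x1234})) ==
                  0xFFFFFFFFFFFFEDCCULL,
              "upper 32 bits replicate the sign bit");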
+
+TEST(LlvmLibcUIntClassTest, SignedOtherWordTypeCastTests) {
+  using LL_Int64 = BigInt<64, true, uint64_t>;
+  using LL_Int96 = BigInt<96, true, uint32_t>;
+  using LL_Int192 = BigInt<192, true, uint64_t>;
+
+  LL_Int64 zero_64(0);
+  LL_Int96 zero_96(0);
+  LL_Int192 zero_192(0);
+
+  LL_Int96 plus_a({0x1234, 0x5678, 0x9ABC});
+
+  ASSERT_EQ(static_cast<int>(plus_a), 0x1234);
+  ASSERT_EQ(static_cast<int>(plus_a >> 32), 0x5678);
+  ASSERT_EQ(static_cast<int>(plus_a >> 64), 0x9ABC);
+
+  LL_Int96 minus_a(-plus_a);
+
+  // The reason that the numbers are inverted and not negated is that we're
+  // using two's complement. To negate a two's complement number you flip the
+  // bits and add 1, so minus_a is {~0x1234, ~0x5678, ~0x9ABC} + {1,0,0}.
+  ASSERT_EQ(static_cast<int>(minus_a), (~0x1234) + 1);
+  ASSERT_EQ(static_cast<int>(minus_a >> 32), ~0x5678);
+  ASSERT_EQ(static_cast<int>(minus_a >> 64), ~0x9ABC);
+
+  ASSERT_TRUE(plus_a + minus_a == zero_96);
+
+  // 192 so there's an extra block to get sign extended to
+  LL_Int192 bigger_plus_a(plus_a);
+
+  ASSERT_EQ(static_cast<int>(bigger_plus_a), 0x1234);
+  ASSERT_EQ(static_cast<int>(bigger_plus_a >> 32), 0x5678);
+  ASSERT_EQ(static_cast<int>(bigger_plus_a >> 64), 0x9ABC);
+  ASSERT_EQ(static_cast<int>(bigger_plus_a >> 96), 0);
+  ASSERT_EQ(static_cast<int>(bigger_plus_a >> 128), 0);
+  ASSERT_EQ(static_cast<int>(bigger_plus_a >> 160), 0);
+
+  LL_Int192 bigger_minus_a(minus_a);
+
+  ASSERT_EQ(static_cast<int>(bigger_minus_a), (~0x1234) + 1);
+  ASSERT_EQ(static_cast<int>(bigger_minus_a >> 32), ~0x5678);
+  ASSERT_EQ(static_cast<int>(bigger_minus_a >> 64), ~0x9ABC);
+  ASSERT_EQ(static_cast<int>(bigger_minus_a >> 96), ~0);
+  ASSERT_EQ(static_cast<int>(bigger_minus_a >> 128), ~0);
+  ASSERT_EQ(static_cast<int>(bigger_minus_a >> 160), ~0);
+
+  ASSERT_TRUE(bigger_plus_a + bigger_minus_a == zero_192);
+
+  LL_Int64 smaller_plus_a(plus_a);
+
+  ASSERT_EQ(static_cast<int>(smaller_plus_a), 0x1234);
+  ASSERT_EQ(static_cast<int>(smaller_plus_a >> 32), 0x5678);
+
+  LL_Int64 smaller_minus_a(minus_a);
+
+  ASSERT_EQ(static_cast<int>(smaller_minus_a), (~0x1234) + 1);
+  ASSERT_EQ(static_cast<int>(smaller_minus_a >> 32), ~0x5678);
+
+  ASSERT_TRUE(smaller_plus_a + smaller_minus_a == zero_64);
+
+  // Also try going from bigger word size to smaller word size
+  LL_Int96 smaller_back_plus_a(smaller_plus_a);
+
+  ASSERT_EQ(static_cast<int>(smaller_back_plus_a), 0x1234);
+  ASSERT_EQ(static_cast<int>(smaller_back_plus_a >> 32), 0x5678);
+  ASSERT_EQ(static_cast<int>(smaller_back_plus_a >> 64), 0);
+
+  LL_Int96 smaller_back_minus_a(smaller_minus_a);
+
+  ASSERT_EQ(static_cast<int>(smaller_back_minus_a), (~0x1234) + 1);
+  ASSERT_EQ(static_cast<int>(smaller_back_minus_a >> 32), ~0x5678);
+  ASSERT_EQ(static_cast<int>(smaller_back_minus_a >> 64), ~0);
+
+  ASSERT_TRUE(smaller_back_plus_a + smaller_back_minus_a == zero_96);
+
+  LL_Int96 bigger_back_plus_a(bigger_plus_a);
+
+  ASSERT_EQ(static_cast<int>(bigger_back_plus_a), 0x1234);
+  ASSERT_EQ(static_cast<int>(bigger_back_plus_a >> 32), 0x5678);
+  ASSERT_EQ(static_cast<int>(bigger_back_plus_a >> 64), 0x9ABC);
+
+  LL_Int96 bigger_back_minus_a(bigger_minus_a);
+
+  ASSERT_EQ(static_cast<int>(bigger_back_minus_a), (~0x1234) + 1);
+  ASSERT_EQ(static_cast<int>(bigger_back_minus_a >> 32), ~0x5678);
+  ASSERT_EQ(static_cast<int>(bigger_back_minus_a >> 64), ~0x9ABC);
+
+  ASSERT_TRUE(bigger_back_plus_a + bigger_back_minus_a == zero_96);
+}
+
 } // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/test/src/math/CMakeLists.txt b/libc/test/src/math/CMakeLists.txt
index 07a9405081f97d1e65e1ed6c761fe9b6ac523978..381a3f478f3761b466e30c045eed70bceec03f5c 100644
--- a/libc/test/src/math/CMakeLists.txt
+++ b/libc/test/src/math/CMakeLists.txt
@@ -90,6 +90,17 @@ add_fp_unittest(
   libc.src.__support.FPUtil.fp_bits
 )
 
+add_fp_unittest(
+  sinpif16_test
+  NEED_MPFR
+  SUITE
+    libc-math-unittests
+  SRCS
+    sinpif16_test.cpp
+  DEPENDS
+    libc.src.math.sinpif16
+)
+
 add_fp_unittest(
   sin_test
   NEED_MPFR
@@ -1051,6 +1062,17 @@ add_fp_unittest(
   libc.src.math.exp10f16
 )
 
+add_fp_unittest(
+  exp10m1f16_test
+  NEED_MPFR
+  SUITE
+    libc-math-unittests
+  SRCS
+    exp10m1f16_test.cpp
+  DEPENDS
+    libc.src.math.exp10m1f16
+)
+
 add_fp_unittest(
   copysign_test
   SUITE
@@ -1894,6 +1916,17 @@ add_fp_unittest(
   libc.src.__support.FPUtil.fp_bits
 )
 
+add_fp_unittest(
+  coshf16_test
+  NEED_MPFR
+  SUITE
+    libc-math-unittests
+  SRCS
+    coshf16_test.cpp
+  DEPENDS
+    libc.src.math.coshf16
+)
+
 add_fp_unittest(
   sinhf_test
   NEED_MPFR
@@ -1910,6 +1943,17 @@ add_fp_unittest(
   libc.src.__support.FPUtil.fp_bits
 )
 
+add_fp_unittest(
+  sinhf16_test
+  NEED_MPFR
+  SUITE
+    libc-math-unittests
+  SRCS
+    sinhf16_test.cpp
+  DEPENDS
+    libc.src.math.sinhf16
+)
+
 add_fp_unittest(
   tanhf_test
   NEED_MPFR
diff --git a/libc/test/src/math/coshf16_test.cpp b/libc/test/src/math/coshf16_test.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..a0d1fd21104788208aa379582f60a2a7f7b55a50
--- /dev/null
+++ b/libc/test/src/math/coshf16_test.cpp
@@ -0,0 +1,40 @@
+//===-- Exhaustive test for coshf16 ---------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/coshf16.h"
+#include "test/UnitTest/FPMatcher.h"
+#include "test/UnitTest/Test.h"
+#include "utils/MPFRWrapper/MPFRUtils.h"
+
+using LlvmLibcCoshf16Test = LIBC_NAMESPACE::testing::FPTest<float16>;
+
+namespace mpfr = LIBC_NAMESPACE::testing::mpfr;
+
+// Range: [0, Inf];
+static constexpr uint16_t POS_START = 0x0000U;
+static constexpr uint16_t POS_STOP = 0x7c00U;
+
+// Range: [-Inf, 0];
+static constexpr uint16_t NEG_START = 0x8000U;
+static constexpr uint16_t NEG_STOP = 0xfc00U;
+
+TEST_F(LlvmLibcCoshf16Test, PositiveRange) {
+  for (uint16_t v = POS_START; v <= POS_STOP; ++v) {
+    float16 x = FPBits(v).get_val();
+    EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Cosh, x,
+                                   LIBC_NAMESPACE::coshf16(x), 0.5);
+  }
+}
+
+TEST_F(LlvmLibcCoshf16Test, NegativeRange) {
+  for (uint16_t v = NEG_START; v <= NEG_STOP; ++v) {
+    float16 x = FPBits(v).get_val();
+    EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Cosh, x,
+                                   LIBC_NAMESPACE::coshf16(x), 0.5);
+  }
+}
diff --git a/libc/test/src/math/exp10m1f16_test.cpp b/libc/test/src/math/exp10m1f16_test.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..41bb12f7d0973af41b97ebff7ece1a7f4b201e56
--- /dev/null
+++ b/libc/test/src/math/exp10m1f16_test.cpp
@@ -0,0 +1,40 @@
+//===-- Exhaustive test for exp10m1f16 ------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/exp10m1f16.h"
+#include "test/UnitTest/FPMatcher.h"
+#include "test/UnitTest/Test.h"
+#include "utils/MPFRWrapper/MPFRUtils.h"
+
+using LlvmLibcExp10m1f16Test = LIBC_NAMESPACE::testing::FPTest<float16>;
+
+namespace mpfr = LIBC_NAMESPACE::testing::mpfr;
+
+// Range: [0, Inf];
+static constexpr uint16_t POS_START = 0x0000U;
+static constexpr uint16_t POS_STOP = 0x7c00U;
+
+// Range: [-Inf, 0];
+static constexpr uint16_t NEG_START = 0x8000U;
+static constexpr uint16_t NEG_STOP = 0xfc00U;
+
+TEST_F(LlvmLibcExp10m1f16Test, PositiveRange) {
+  for (uint16_t v = POS_START; v <= POS_STOP; ++v) {
+    float16 x = FPBits(v).get_val();
+    EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Exp10m1, x,
+                                   LIBC_NAMESPACE::exp10m1f16(x), 0.5);
+  }
+}
+
+TEST_F(LlvmLibcExp10m1f16Test, NegativeRange) {
+  for (uint16_t v = NEG_START; v <= NEG_STOP; ++v) {
+    float16 x = FPBits(v).get_val();
+    EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Exp10m1, x,
+                                   LIBC_NAMESPACE::exp10m1f16(x), 0.5);
+  }
+}
diff --git a/libc/test/src/math/sinhf16_test.cpp b/libc/test/src/math/sinhf16_test.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..a16ab9279c4576473a16321c728f122115c316a2
--- /dev/null
+++ b/libc/test/src/math/sinhf16_test.cpp
@@ -0,0 +1,40 @@
+//===-- Exhaustive test for sinhf16 ---------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/sinhf16.h"
+#include "test/UnitTest/FPMatcher.h"
+#include "test/UnitTest/Test.h"
+#include "utils/MPFRWrapper/MPFRUtils.h"
+
+using LlvmLibcSinhf16Test = LIBC_NAMESPACE::testing::FPTest<float16>;
+
+namespace mpfr = LIBC_NAMESPACE::testing::mpfr;
+
+// Range: [0, Inf];
+static constexpr uint16_t POS_START = 0x0000U;
+static constexpr uint16_t POS_STOP = 0x7c00U;
+
+// Range: [-Inf, 0];
+static constexpr uint16_t NEG_START = 0x8000U;
+static constexpr uint16_t NEG_STOP = 0xfc00U;
+
+TEST_F(LlvmLibcSinhf16Test, PositiveRange) {
+  for (uint16_t v = POS_START; v <= POS_STOP; ++v) {
+    float16 x = FPBits(v).get_val();
+    EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Sinh, x,
+                                   LIBC_NAMESPACE::sinhf16(x), 0.5);
+  }
+}
+
+TEST_F(LlvmLibcSinhf16Test, NegativeRange) {
+  for (uint16_t v = NEG_START; v <= NEG_STOP; ++v) {
+    float16 x = FPBits(v).get_val();
+    EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Sinh, x,
+                                   LIBC_NAMESPACE::sinhf16(x), 0.5);
+  }
+}
diff --git a/libc/test/src/math/sinpif16_test.cpp b/libc/test/src/math/sinpif16_test.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..8477124b2e6e37442ca6413c14325f5957f557b4
--- /dev/null
+++ b/libc/test/src/math/sinpif16_test.cpp
@@ -0,0 +1,40 @@
+//===-- Exhaustive test for sinpif16 --------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===---------------------------------------------------------------------===//
+
+#include "src/math/sinpif16.h"
+#include "test/UnitTest/FPMatcher.h"
+#include "test/UnitTest/Test.h"
+#include "utils/MPFRWrapper/MPFRUtils.h"
+
+using LlvmLibcSinpif16Test = LIBC_NAMESPACE::testing::FPTest<float16>;
+
+namespace mpfr = LIBC_NAMESPACE::testing::mpfr;
+
+// Range: [0, Inf]
+static constexpr uint16_t POS_START = 0x0000U;
+static constexpr uint16_t POS_STOP = 0x7c00U;
+
+// Range: [-Inf, 0]
+static constexpr uint16_t NEG_START = 0x8000U;
+static constexpr uint16_t NEG_STOP = 0xfc00U;
+
+TEST_F(LlvmLibcSinpif16Test, PositiveRange) {
+  for (uint16_t v = POS_START; v <= POS_STOP; ++v) {
+    float16 x = FPBits(v).get_val();
+    EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Sinpi, x,
+                                   LIBC_NAMESPACE::sinpif16(x), 0.5);
+  }
+}
+
+TEST_F(LlvmLibcSinpif16Test, NegativeRange) {
+  for (uint16_t v = NEG_START; v <= NEG_STOP; ++v) {
+    float16 x = FPBits(v).get_val();
+    EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Sinpi, x,
+                                   LIBC_NAMESPACE::sinpif16(x), 0.5);
+  }
+}
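All four tests above sweep an entire half-range of float16 bit patterns against MPFR, which is feasible only because the type has 2^16 values. The stop constants are the infinity encodings; here is a standalone sanity check of the binary16 field layout behind them (independent of the patch):

#include <cstdint>

// IEEE binary16 layout: 1 sign bit | 5 exponent bits | 10 mantissa bits.
// 0x7c00 is 0|11111|0000000000, i.e. +infinity, and 0xfc00 is -infinity,
// which is why the loops run exactly to POS_STOP and NEG_STOP.
constexpr uint16_t SIGN_BIT = 0x8000U;
constexpr uint16_t EXP_MASK = 0x7c00U;
constexpr uint16_t MAN_MASK = 0x03ffU;
static_assert((SIGN_BIT | EXP_MASK | MAN_MASK) == 0xffffU,
              "the three fields cover all 16 bits");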
diff --git a/libc/test/src/math/smoke/CMakeLists.txt b/libc/test/src/math/smoke/CMakeLists.txt
index c4787229c3ec15288c0c98c4f5b253731c95e159..f713430ee27ce8d18127717d1bbc5847b127c438 100644
--- a/libc/test/src/math/smoke/CMakeLists.txt
+++ b/libc/test/src/math/smoke/CMakeLists.txt
@@ -51,6 +51,17 @@ add_fp_unittest(
   libc.src.__support.FPUtil.fp_bits
 )
 
+add_fp_unittest(
+  sinpif16_test
+  SUITE
+    libc-math-smoke-tests
+  SRCS
+    sinpif16_test.cpp
+  DEPENDS
+    libc.src.errno.errno
+    libc.src.math.sinpif16
+)
+
 add_fp_unittest(
   sincosf_test
   SUITE
@@ -1224,6 +1235,19 @@ add_fp_unittest(
   libc.src.__support.FPUtil.cast
 )
 
+add_fp_unittest(
+  exp10m1f16_test
+  SUITE
+    libc-math-smoke-tests
+  SRCS
+    exp10m1f16_test.cpp
+  DEPENDS
+    libc.hdr.fenv_macros
+    libc.src.errno.errno
+    libc.src.math.exp10m1f16
+    libc.src.__support.FPUtil.cast
+)
+
 add_fp_unittest(
   copysign_test
   SUITE
@@ -3704,6 +3728,19 @@ add_fp_unittest(
   libc.src.__support.FPUtil.fp_bits
 )
 
+add_fp_unittest(
+  coshf16_test
+  SUITE
+    libc-math-smoke-tests
+  SRCS
+    coshf16_test.cpp
+  DEPENDS
+    libc.hdr.fenv_macros
+    libc.src.errno.errno
+    libc.src.math.coshf16
+    libc.src.__support.FPUtil.cast
+)
+
 add_fp_unittest(
   sinhf_test
   SUITE
@@ -3717,6 +3754,19 @@ add_fp_unittest(
   libc.src.__support.FPUtil.fp_bits
 )
 
+add_fp_unittest(
+  sinhf16_test
+  SUITE
+    libc-math-smoke-tests
+  SRCS
+    sinhf16_test.cpp
+  DEPENDS
+    libc.hdr.fenv_macros
+    libc.src.errno.errno
+    libc.src.math.sinhf16
+    libc.src.__support.FPUtil.cast
+)
+
 add_fp_unittest(
   tanhf_test
   SUITE
diff --git a/libc/test/src/math/smoke/coshf16_test.cpp b/libc/test/src/math/smoke/coshf16_test.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..08d05ecce86baa4cc129495dfef9b9721ec8df0a
--- /dev/null
+++ b/libc/test/src/math/smoke/coshf16_test.cpp
@@ -0,0 +1,90 @@
+//===-- Unittests for coshf16 ---------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "hdr/fenv_macros.h"
+#include "src/__support/FPUtil/cast.h"
+#include "src/errno/libc_errno.h"
+#include "src/math/coshf16.h"
+#include "test/UnitTest/FPMatcher.h"
+#include "test/UnitTest/Test.h"
+
+using LlvmLibcCoshf16Test = LIBC_NAMESPACE::testing::FPTest<float16>;
+
+TEST_F(LlvmLibcCoshf16Test, SpecialNumbers) {
+  LIBC_NAMESPACE::libc_errno = 0;
+
+  EXPECT_FP_EQ_ALL_ROUNDING(aNaN, LIBC_NAMESPACE::coshf16(aNaN));
+  EXPECT_MATH_ERRNO(0);
+
+  EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::coshf16(sNaN), FE_INVALID);
+  EXPECT_MATH_ERRNO(0);
+
+  EXPECT_FP_EQ_ALL_ROUNDING(inf, LIBC_NAMESPACE::coshf16(inf));
+  EXPECT_MATH_ERRNO(0);
+
+  EXPECT_FP_EQ_ALL_ROUNDING(inf, LIBC_NAMESPACE::coshf16(neg_inf));
+  EXPECT_MATH_ERRNO(0);
+
+  EXPECT_FP_EQ_ALL_ROUNDING(LIBC_NAMESPACE::fputil::cast<float16>(1.0),
+                            LIBC_NAMESPACE::coshf16(zero));
+  EXPECT_MATH_ERRNO(0);
+
+  EXPECT_FP_EQ_ALL_ROUNDING(LIBC_NAMESPACE::fputil::cast<float16>(1.0),
+                            LIBC_NAMESPACE::coshf16(neg_zero));
+  EXPECT_MATH_ERRNO(0);
+}
+
+TEST_F(LlvmLibcCoshf16Test, Overflow) {
+  LIBC_NAMESPACE::libc_errno = 0;
+
+  EXPECT_FP_EQ_WITH_EXCEPTION(inf, LIBC_NAMESPACE::coshf16(max_normal),
+                              FE_OVERFLOW | FE_INEXACT);
+  EXPECT_MATH_ERRNO(ERANGE);
+
+  EXPECT_FP_EQ_WITH_EXCEPTION(inf, LIBC_NAMESPACE::coshf16(neg_max_normal),
+                              FE_OVERFLOW | FE_INEXACT);
+  EXPECT_MATH_ERRNO(ERANGE);
+
+  // round(acosh(2^16), HP, RU);
+  float16 x = LIBC_NAMESPACE::fputil::cast<float16>(0x1.794p+3);
+
+  EXPECT_FP_EQ_WITH_EXCEPTION_ROUNDING_NEAREST(inf, LIBC_NAMESPACE::coshf16(x),
+                                               FE_OVERFLOW | FE_INEXACT);
+  EXPECT_MATH_ERRNO(ERANGE);
+
+  EXPECT_FP_EQ_WITH_EXCEPTION_ROUNDING_UPWARD(inf, LIBC_NAMESPACE::coshf16(x),
+                                              FE_OVERFLOW | FE_INEXACT);
+  EXPECT_MATH_ERRNO(ERANGE);
+
+  EXPECT_FP_EQ_WITH_EXCEPTION_ROUNDING_DOWNWARD(
+      max_normal, LIBC_NAMESPACE::coshf16(x), FE_INEXACT);
+  EXPECT_MATH_ERRNO(0);
+
+  EXPECT_FP_EQ_WITH_EXCEPTION_ROUNDING_TOWARD_ZERO(
+      max_normal, LIBC_NAMESPACE::coshf16(x), FE_INEXACT);
+  EXPECT_MATH_ERRNO(0);
+
+  // round(-acosh(2^16), HP, RD);
+  x = LIBC_NAMESPACE::fputil::cast<float16>(-0x1.794p+3);
+
+  EXPECT_FP_EQ_WITH_EXCEPTION_ROUNDING_NEAREST(inf, LIBC_NAMESPACE::coshf16(x),
+                                               FE_OVERFLOW | FE_INEXACT);
+  EXPECT_MATH_ERRNO(ERANGE);
+
+  EXPECT_FP_EQ_WITH_EXCEPTION_ROUNDING_UPWARD(inf, LIBC_NAMESPACE::coshf16(x),
+                                              FE_OVERFLOW | FE_INEXACT);
+  EXPECT_MATH_ERRNO(ERANGE);
+
+  EXPECT_FP_EQ_WITH_EXCEPTION_ROUNDING_DOWNWARD(
+      max_normal, LIBC_NAMESPACE::coshf16(x), FE_INEXACT);
+  EXPECT_MATH_ERRNO(0);
+
+  EXPECT_FP_EQ_WITH_EXCEPTION_ROUNDING_TOWARD_ZERO(
+      max_normal, LIBC_NAMESPACE::coshf16(x), FE_INEXACT);
+  EXPECT_MATH_ERRNO(0);
+}
diff --git a/libc/test/src/math/smoke/exp10m1f16_test.cpp b/libc/test/src/math/smoke/exp10m1f16_test.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..dfa7fa477d3d1174d5a7dd87f22dc5499c9282fa
--- /dev/null
+++ b/libc/test/src/math/smoke/exp10m1f16_test.cpp
@@ -0,0 +1,113 @@
+//===-- Unittests for exp10m1f16 ------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "hdr/fenv_macros.h"
+#include "src/__support/FPUtil/cast.h"
+#include "src/errno/libc_errno.h"
+#include "src/math/exp10m1f16.h"
+#include "test/UnitTest/FPMatcher.h"
+#include "test/UnitTest/Test.h"
+
+using LlvmLibcExp10m1f16Test = LIBC_NAMESPACE::testing::FPTest<float16>;
+
+TEST_F(LlvmLibcExp10m1f16Test, SpecialNumbers) {
+  LIBC_NAMESPACE::libc_errno = 0;
+
+  EXPECT_FP_EQ_ALL_ROUNDING(aNaN, LIBC_NAMESPACE::exp10m1f16(aNaN));
+  EXPECT_MATH_ERRNO(0);
+
+  EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::exp10m1f16(sNaN),
+                              FE_INVALID);
+  EXPECT_MATH_ERRNO(0);
+
+  EXPECT_FP_EQ_ALL_ROUNDING(inf, LIBC_NAMESPACE::exp10m1f16(inf));
+  EXPECT_MATH_ERRNO(0);
+
+  EXPECT_FP_EQ_ALL_ROUNDING(LIBC_NAMESPACE::fputil::cast<float16>(-1.0),
+                            LIBC_NAMESPACE::exp10m1f16(neg_inf));
+  EXPECT_MATH_ERRNO(0);
+
+  EXPECT_FP_EQ_ALL_ROUNDING(zero, LIBC_NAMESPACE::exp10m1f16(zero));
+  EXPECT_MATH_ERRNO(0);
+
+  EXPECT_FP_EQ_ALL_ROUNDING(neg_zero, LIBC_NAMESPACE::exp10m1f16(neg_zero));
+  EXPECT_MATH_ERRNO(0);
+}
+
+TEST_F(LlvmLibcExp10m1f16Test, Overflow) {
+  LIBC_NAMESPACE::libc_errno = 0;
+
+  EXPECT_FP_EQ_WITH_EXCEPTION(inf, LIBC_NAMESPACE::exp10m1f16(max_normal),
+                              FE_OVERFLOW | FE_INEXACT);
+  EXPECT_MATH_ERRNO(ERANGE);
+
+  // round(16 * log10(2), HP, RN);
+  float16 x = LIBC_NAMESPACE::fputil::cast<float16>(0x1.344p+2);
+
+  EXPECT_FP_EQ_WITH_EXCEPTION_ROUNDING_NEAREST(
+      inf, LIBC_NAMESPACE::exp10m1f16(x), FE_OVERFLOW | FE_INEXACT);
+  EXPECT_MATH_ERRNO(ERANGE);
+
+  EXPECT_FP_EQ_WITH_EXCEPTION_ROUNDING_UPWARD(
+      inf, LIBC_NAMESPACE::exp10m1f16(x), FE_OVERFLOW | FE_INEXACT);
+  EXPECT_MATH_ERRNO(ERANGE);
+
+  EXPECT_FP_EQ_WITH_EXCEPTION_ROUNDING_DOWNWARD(
+      max_normal, LIBC_NAMESPACE::exp10m1f16(x), FE_INEXACT);
+  EXPECT_MATH_ERRNO(0);
+
+  EXPECT_FP_EQ_WITH_EXCEPTION_ROUNDING_TOWARD_ZERO(
+      max_normal, LIBC_NAMESPACE::exp10m1f16(x), FE_INEXACT);
+  EXPECT_MATH_ERRNO(0);
+}
+
+TEST_F(LlvmLibcExp10m1f16Test, ResultNearNegOne) {
+  LIBC_NAMESPACE::libc_errno = 0;
+
+  EXPECT_FP_EQ_WITH_EXCEPTION(LIBC_NAMESPACE::fputil::cast<float16>(-1.0),
+                              LIBC_NAMESPACE::exp10m1f16(neg_max_normal),
+                              FE_INEXACT);
+
+  // round(-11 * log10(2), HP, RD);
+  float16 x = LIBC_NAMESPACE::fputil::cast<float16>(-0x1.a8p+1);
+
+  EXPECT_FP_EQ_WITH_EXCEPTION_ROUNDING_NEAREST(
+      LIBC_NAMESPACE::fputil::cast<float16>(-0x1.ffcp-1),
+      LIBC_NAMESPACE::exp10m1f16(x), FE_INEXACT);
+
+  EXPECT_FP_EQ_WITH_EXCEPTION_ROUNDING_UPWARD(
+      LIBC_NAMESPACE::fputil::cast<float16>(-0x1.ffcp-1),
+      LIBC_NAMESPACE::exp10m1f16(x), FE_INEXACT);
+
+  EXPECT_FP_EQ_WITH_EXCEPTION_ROUNDING_DOWNWARD(
+      LIBC_NAMESPACE::fputil::cast<float16>(-1.0),
+      LIBC_NAMESPACE::exp10m1f16(x), FE_INEXACT);
+
+  EXPECT_FP_EQ_WITH_EXCEPTION_ROUNDING_TOWARD_ZERO(
+      LIBC_NAMESPACE::fputil::cast<float16>(-0x1.ffcp-1),
+      LIBC_NAMESPACE::exp10m1f16(x), FE_INEXACT);
+
+  // Next float16 value below -0x1.ce4p+1.
+  x = LIBC_NAMESPACE::fputil::cast<float16>(-0x1.ce8p+1);
+
+  EXPECT_FP_EQ_WITH_EXCEPTION_ROUNDING_NEAREST(
+      LIBC_NAMESPACE::fputil::cast<float16>(-1.0),
+      LIBC_NAMESPACE::exp10m1f16(x), FE_INEXACT);
+
+  EXPECT_FP_EQ_WITH_EXCEPTION_ROUNDING_UPWARD(
+      LIBC_NAMESPACE::fputil::cast<float16>(-0x1.ffcp-1),
+      LIBC_NAMESPACE::exp10m1f16(x), FE_INEXACT);
+
+  EXPECT_FP_EQ_WITH_EXCEPTION_ROUNDING_DOWNWARD(
+      LIBC_NAMESPACE::fputil::cast<float16>(-1.0),
+      LIBC_NAMESPACE::exp10m1f16(x), FE_INEXACT);
+
+  EXPECT_FP_EQ_WITH_EXCEPTION_ROUNDING_TOWARD_ZERO(
+      LIBC_NAMESPACE::fputil::cast<float16>(-0x1.ffcp-1),
+      LIBC_NAMESPACE::exp10m1f16(x), FE_INEXACT);
+}
diff --git a/libc/test/src/math/smoke/sinhf16_test.cpp b/libc/test/src/math/smoke/sinhf16_test.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..4f21d33ba78e0c714ff7ff6ec3955049d8321d15
--- /dev/null
+++ b/libc/test/src/math/smoke/sinhf16_test.cpp
@@ -0,0 +1,88 @@
+//===-- Unittests for sinhf16 ---------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "hdr/fenv_macros.h"
+#include "src/__support/FPUtil/cast.h"
+#include "src/errno/libc_errno.h"
+#include "src/math/sinhf16.h"
+#include "test/UnitTest/FPMatcher.h"
+#include "test/UnitTest/Test.h"
+
+using LlvmLibcSinhf16Test = LIBC_NAMESPACE::testing::FPTest<float16>;
+
+TEST_F(LlvmLibcSinhf16Test, SpecialNumbers) {
+  LIBC_NAMESPACE::libc_errno = 0;
+
+  EXPECT_FP_EQ_ALL_ROUNDING(aNaN, LIBC_NAMESPACE::sinhf16(aNaN));
+  EXPECT_MATH_ERRNO(0);
+
+  EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::sinhf16(sNaN), FE_INVALID);
+  EXPECT_MATH_ERRNO(0);
+
+  EXPECT_FP_EQ_ALL_ROUNDING(inf, LIBC_NAMESPACE::sinhf16(inf));
+  EXPECT_MATH_ERRNO(0);
+
+  EXPECT_FP_EQ_ALL_ROUNDING(neg_inf, LIBC_NAMESPACE::sinhf16(neg_inf));
+  EXPECT_MATH_ERRNO(0);
+
+  EXPECT_FP_EQ_ALL_ROUNDING(zero, LIBC_NAMESPACE::sinhf16(zero));
+  EXPECT_MATH_ERRNO(0);
+
+  EXPECT_FP_EQ_ALL_ROUNDING(neg_zero, LIBC_NAMESPACE::sinhf16(neg_zero));
+  EXPECT_MATH_ERRNO(0);
+}
+
+TEST_F(LlvmLibcSinhf16Test, Overflow) {
+  LIBC_NAMESPACE::libc_errno = 0;
+
+  EXPECT_FP_EQ_WITH_EXCEPTION(inf, LIBC_NAMESPACE::sinhf16(max_normal),
+                              FE_OVERFLOW | FE_INEXACT);
+  EXPECT_MATH_ERRNO(ERANGE);
+
+  EXPECT_FP_EQ_WITH_EXCEPTION(neg_inf, LIBC_NAMESPACE::sinhf16(neg_max_normal),
+                              FE_OVERFLOW | FE_INEXACT);
+  EXPECT_MATH_ERRNO(ERANGE);
+
+  // round(asinh(2^16), HP, RU);
+  float16 x = LIBC_NAMESPACE::fputil::cast<float16>(0x1.794p+3);
+
+  EXPECT_FP_EQ_WITH_EXCEPTION_ROUNDING_NEAREST(inf, LIBC_NAMESPACE::sinhf16(x),
+                                               FE_OVERFLOW | FE_INEXACT);
+  EXPECT_MATH_ERRNO(ERANGE);
+
+  EXPECT_FP_EQ_WITH_EXCEPTION_ROUNDING_UPWARD(inf, LIBC_NAMESPACE::sinhf16(x),
+                                              FE_OVERFLOW | FE_INEXACT);
+  EXPECT_MATH_ERRNO(ERANGE);
+
+  EXPECT_FP_EQ_WITH_EXCEPTION_ROUNDING_DOWNWARD(
+      max_normal, LIBC_NAMESPACE::sinhf16(x), FE_INEXACT);
+  EXPECT_MATH_ERRNO(0);
+
+  EXPECT_FP_EQ_WITH_EXCEPTION_ROUNDING_TOWARD_ZERO(
+      max_normal, LIBC_NAMESPACE::sinhf16(x), FE_INEXACT);
+  EXPECT_MATH_ERRNO(0);
+
+  // round(asinh(-2^16), HP, RD);
+  x = LIBC_NAMESPACE::fputil::cast<float16>(-0x1.794p+3);
+
+  EXPECT_FP_EQ_WITH_EXCEPTION_ROUNDING_NEAREST(
+      neg_inf, LIBC_NAMESPACE::sinhf16(x), FE_OVERFLOW | FE_INEXACT);
+  EXPECT_MATH_ERRNO(ERANGE);
+
+  EXPECT_FP_EQ_WITH_EXCEPTION_ROUNDING_UPWARD(
+      neg_max_normal, LIBC_NAMESPACE::sinhf16(x), FE_INEXACT);
+  EXPECT_MATH_ERRNO(0);
+
+  EXPECT_FP_EQ_WITH_EXCEPTION_ROUNDING_DOWNWARD(
+      neg_inf, LIBC_NAMESPACE::sinhf16(x), FE_OVERFLOW | FE_INEXACT);
+  EXPECT_MATH_ERRNO(ERANGE);
+
+  EXPECT_FP_EQ_WITH_EXCEPTION_ROUNDING_TOWARD_ZERO(
+      neg_max_normal, LIBC_NAMESPACE::sinhf16(x), FE_INEXACT);
+  EXPECT_MATH_ERRNO(0);
+}
diff --git a/libc/test/src/math/smoke/sinpif16_test.cpp b/libc/test/src/math/smoke/sinpif16_test.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..0bcd38a60d84993d7129e0dedcb2dcbe97e996ad
--- /dev/null
+++ b/libc/test/src/math/smoke/sinpif16_test.cpp
@@ -0,0 +1,42 @@
+//===-- Unittests for sinpif16 --------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/errno/libc_errno.h"
+#include "src/math/sinpif16.h"
+#include "test/UnitTest/FPMatcher.h"
+#include "test/UnitTest/Test.h"
+
+using LlvmLibcSinpif16Test = LIBC_NAMESPACE::testing::FPTest<float16>;
+
+TEST_F(LlvmLibcSinpif16Test, SpecialNumbers) {
+  LIBC_NAMESPACE::libc_errno = 0;
+
+  EXPECT_FP_EQ(aNaN, LIBC_NAMESPACE::sinpif16(aNaN));
+  EXPECT_MATH_ERRNO(0);
+
+  EXPECT_FP_EQ(zero, LIBC_NAMESPACE::sinpif16(zero));
+  EXPECT_MATH_ERRNO(0);
+
+  EXPECT_FP_EQ(neg_zero, LIBC_NAMESPACE::sinpif16(neg_zero));
+  EXPECT_MATH_ERRNO(0);
+
+  EXPECT_FP_EQ(aNaN, LIBC_NAMESPACE::sinpif16(inf));
+  EXPECT_MATH_ERRNO(EDOM);
+
+  EXPECT_FP_EQ(aNaN, LIBC_NAMESPACE::sinpif16(neg_inf));
+  EXPECT_MATH_ERRNO(EDOM);
+}
+
+TEST_F(LlvmLibcSinpif16Test, Integers) {
+  EXPECT_FP_EQ(neg_zero, LIBC_NAMESPACE::sinpif16(-0x420));
+  EXPECT_FP_EQ(neg_zero, LIBC_NAMESPACE::sinpif16(-0x1p+10));
+  EXPECT_FP_EQ(neg_zero, LIBC_NAMESPACE::sinpif16(-0x1.4p+14));
+  EXPECT_FP_EQ(zero, LIBC_NAMESPACE::sinpif16(0x420));
+  EXPECT_FP_EQ(zero, LIBC_NAMESPACE::sinpif16(0x1.cp+15));
+  EXPECT_FP_EQ(zero, LIBC_NAMESPACE::sinpif16(0x1.cp+7));
+}
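The MPFRUtils change that follows adds an exp10m1 reference operation; on MPFR builds older than 4.2.0 it falls back to the identity 10^x - 1 = expm1(x * ln 10) evaluated at elevated precision. A standalone double-precision check of that identity (illustrative only, not the MPFR code path; exp10m1_ref is a hypothetical name):

#include <cassert>
#include <cmath>

// 10^x - 1 == expm1(x * ln(10)). Using expm1 keeps accuracy when the
// result is near zero, exactly where computing pow(10, x) - 1.0 would
// cancel catastrophically.
double exp10m1_ref(double x) { return std::expm1(x * std::log(10.0)); }

int main() {
  assert(std::fabs(exp10m1_ref(2.0) - 99.0) < 1e-9);   // 10^2 - 1
  assert(std::fabs(exp10m1_ref(-1.0) + 0.9) < 1e-12);  // 10^-1 - 1
  return 0;
}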
diff --git a/libc/utils/MPFRWrapper/MPFRUtils.cpp b/libc/utils/MPFRWrapper/MPFRUtils.cpp
index 27ff1f7190ef95308992fa443fd33bfa76ce57b4..bd4fbe294a622d3b66fbf18aadb32c202233a4cb 100644
--- a/libc/utils/MPFRWrapper/MPFRUtils.cpp
+++ b/libc/utils/MPFRWrapper/MPFRUtils.cpp
@@ -334,6 +334,29 @@ public:
     return result;
   }
 
+  MPFRNumber exp10m1() const {
+    // TODO: Only use mpfr_exp10m1 once CI and buildbots get MPFR >= 4.2.0.
+#if MPFR_VERSION_MAJOR > 4 ||                                                  \
+    (MPFR_VERSION_MAJOR == 4 && MPFR_VERSION_MINOR >= 2)
+    MPFRNumber result(*this);
+    mpfr_exp10m1(result.value, value, mpfr_rounding);
+    return result;
+#else
+    unsigned int prec = mpfr_precision * 3;
+    MPFRNumber result(*this, prec);
+
+    MPFRNumber ln10(10.0f, prec);
+    // log(10)
+    mpfr_log(ln10.value, ln10.value, mpfr_rounding);
+    // x * log(10)
+    mpfr_mul(result.value, value, ln10.value, mpfr_rounding);
+    // e^(x * log(10)) - 1
+    int ex = mpfr_expm1(result.value, result.value, mpfr_rounding);
+    mpfr_subnormalize(result.value, ex, mpfr_rounding);
+    return result;
+#endif
+  }
+
   MPFRNumber expm1() const {
     MPFRNumber result(*this);
     mpfr_expm1(result.value, value, mpfr_rounding);
@@ -488,14 +511,28 @@ public:
     (MPFR_VERSION_MAJOR == 4 && MPFR_VERSION_MINOR >= 2)
     mpfr_sinpi(result.value, value, mpfr_rounding);
+    return result;
 #else
+    if (mpfr_integer_p(value)) {
+      mpfr_set_si(result.value, 0, mpfr_rounding);
+      return result;
+    }
+
+    MPFRNumber value_mul_two(*this);
+    mpfr_mul_si(value_mul_two.value, value, 2, MPFR_RNDN);
+
+    if (mpfr_integer_p(value_mul_two.value)) {
+      auto d = mpfr_get_si(value, MPFR_RNDD);
+      mpfr_set_si(result.value, (d & 1) ? -1 : 1, mpfr_rounding);
+      return result;
+    }
+
     MPFRNumber value_pi(0.0, 1280);
     mpfr_const_pi(value_pi.value, MPFR_RNDN);
     mpfr_mul(value_pi.value, value_pi.value, value, MPFR_RNDN);
     mpfr_sin(result.value, value_pi.value, mpfr_rounding);
-#endif
-    return result;
+    return result;
+#endif
   }
 
   MPFRNumber sinh() const {
@@ -730,6 +767,8 @@ unary_operation(Operation op, InputType input, unsigned int precision,
     return mpfrInput.exp2m1();
   case Operation::Exp10:
     return mpfrInput.exp10();
+  case Operation::Exp10m1:
+    return mpfrInput.exp10m1();
   case Operation::Expm1:
     return mpfrInput.expm1();
   case Operation::Floor:
diff --git a/libc/utils/MPFRWrapper/MPFRUtils.h b/libc/utils/MPFRWrapper/MPFRUtils.h
index 8d51fa4e4772674d40c8a713a86ca7ddb0644984..9fc12a6adefb56e773f29f301674b5067e220a7e 100644
--- a/libc/utils/MPFRWrapper/MPFRUtils.h
+++ b/libc/utils/MPFRWrapper/MPFRUtils.h
@@ -42,6 +42,7 @@ enum class Operation : int {
   Exp2,
   Exp2m1,
   Exp10,
+  Exp10m1,
   Expm1,
   Floor,
   Log,
diff --git a/libc/utils/gpu/server/rpc_server.cpp b/libc/utils/gpu/server/rpc_server.cpp
index ca10e67509ae631532f925e43b2b0d4891594c4c..11b6d0e27ab9481d23f4cbd65fdadd145b459ccd 100644
--- a/libc/utils/gpu/server/rpc_server.cpp
+++ b/libc/utils/gpu/server/rpc_server.cpp
@@ -302,8 +302,8 @@ rpc_status_t handle_server_impl(
   }
   case RPC_EXIT: {
     // Send a response to the client to signal that we are ready to exit.
-    port->recv_and_send([](rpc::Buffer *) {});
-    port->recv([](rpc::Buffer *buffer) {
+    port->recv_and_send([](rpc::Buffer *, uint32_t) {});
+    port->recv([](rpc::Buffer *buffer, uint32_t) {
       int status = 0;
       std::memcpy(&status, buffer->data, sizeof(int));
       exit(status);
@@ -312,8 +312,8 @@ rpc_status_t handle_server_impl(
   }
   case RPC_ABORT: {
     // Send a response to the client to signal that we are ready to abort.
-    port->recv_and_send([](rpc::Buffer *) {});
-    port->recv([](rpc::Buffer *) {});
+    port->recv_and_send([](rpc::Buffer *, uint32_t) {});
+    port->recv([](rpc::Buffer *, uint32_t) {});
     abort();
     break;
   }
@@ -334,25 +334,25 @@ rpc_status_t handle_server_impl(
     break;
   }
   case RPC_FEOF: {
-    port->recv_and_send([](rpc::Buffer *buffer) {
+    port->recv_and_send([](rpc::Buffer *buffer, uint32_t) {
      buffer->data[0] = feof(file::to_stream(buffer->data[0]));
     });
     break;
   }
   case RPC_FERROR: {
-    port->recv_and_send([](rpc::Buffer *buffer) {
+    port->recv_and_send([](rpc::Buffer *buffer, uint32_t) {
       buffer->data[0] = ferror(file::to_stream(buffer->data[0]));
     });
     break;
   }
   case RPC_CLEARERR: {
-    port->recv_and_send([](rpc::Buffer *buffer) {
+    port->recv_and_send([](rpc::Buffer *buffer, uint32_t) {
       clearerr(file::to_stream(buffer->data[0]));
     });
     break;
   }
   case RPC_FSEEK: {
-    port->recv_and_send([](rpc::Buffer *buffer) {
+    port->recv_and_send([](rpc::Buffer *buffer, uint32_t) {
       buffer->data[0] = fseek(file::to_stream(buffer->data[0]),
                               static_cast<long>(buffer->data[1]),
                               static_cast<int>(buffer->data[2]));
@@ -360,19 +360,19 @@ rpc_status_t handle_server_impl(
     break;
   }
   case RPC_FTELL: {
-    port->recv_and_send([](rpc::Buffer *buffer) {
+    port->recv_and_send([](rpc::Buffer *buffer, uint32_t) {
       buffer->data[0] = ftell(file::to_stream(buffer->data[0]));
     });
     break;
   }
   case RPC_FFLUSH: {
-    port->recv_and_send([](rpc::Buffer *buffer) {
+    port->recv_and_send([](rpc::Buffer *buffer, uint32_t) {
       buffer->data[0] = fflush(file::to_stream(buffer->data[0]));
     });
     break;
   }
   case RPC_UNGETC: {
-    port->recv_and_send([](rpc::Buffer *buffer) {
+    port->recv_and_send([](rpc::Buffer *buffer, uint32_t) {
       buffer->data[0] = ungetc(static_cast<int>(buffer->data[0]),
                                file::to_stream(buffer->data[1]));
     });
@@ -429,7 +429,7 @@ rpc_status_t handle_server_impl(
     break;
   }
   case RPC_NOOP: {
-    port->recv([](rpc::Buffer *) {});
+    port->recv([](rpc::Buffer *, uint32_t) {});
     break;
   }
   default: {
@@ -552,7 +552,7 @@ uint64_t rpc_get_client_size() { return sizeof(rpc::Client); }
 
 void rpc_send(rpc_port_t ref, rpc_port_callback_ty callback, void *data) {
   auto port = reinterpret_cast<rpc::Server::Port *>(ref.handle);
-  port->send([=](rpc::Buffer *buffer) {
+  port->send([=](rpc::Buffer *buffer, uint32_t) {
     callback(reinterpret_cast<rpc_buffer_t *>(buffer), data);
   });
 }
@@ -564,7 +564,7 @@ void rpc_send_n(rpc_port_t ref, const void *const *src, uint64_t *size) {
 
 void rpc_recv(rpc_port_t ref, rpc_port_callback_ty callback, void *data) {
   auto port = reinterpret_cast<rpc::Server::Port *>(ref.handle);
-  port->recv([=](rpc::Buffer *buffer) {
+  port->recv([=](rpc::Buffer *buffer, uint32_t) {
     callback(reinterpret_cast<rpc_buffer_t *>(buffer), data);
   });
 }
@@ -579,7 +579,7 @@ void rpc_recv_n(rpc_port_t ref, void **dst, uint64_t *size, rpc_alloc_ty alloc,
 void rpc_recv_and_send(rpc_port_t ref, rpc_port_callback_ty callback,
                        void *data) {
   auto port = reinterpret_cast<rpc::Server::Port *>(ref.handle);
-  port->recv_and_send([=](rpc::Buffer *buffer) {
+  port->recv_and_send([=](rpc::Buffer *buffer, uint32_t) {
     callback(reinterpret_cast<rpc_buffer_t *>(buffer), data);
   });
 }
diff --git a/libcxx/CMakeLists.txt b/libcxx/CMakeLists.txt
index 75c926f5432aeae688983cecf73b5e597bbc0bc3..574b262018cd3a5c92342b222a55f9c6030487f0 100644
--- a/libcxx/CMakeLists.txt
+++ b/libcxx/CMakeLists.txt
@@ -19,7 +19,6 @@ set(CMAKE_FOLDER "libc++")
 set(LIBCXX_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR})
 set(LIBCXX_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR})
-set(LIBCXX_BINARY_INCLUDE_DIR "${LIBCXX_BINARY_DIR}/include/c++build")
 
 include(GNUInstallDirs)
 include(WarningFlags)
@@ -443,8 +442,6 @@ else()
    "Path where target-specific libc++
headers should be installed.") endif() -file(MAKE_DIRECTORY "${LIBCXX_BINARY_INCLUDE_DIR}") - set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${LIBCXX_LIBRARY_DIR}) set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${LIBCXX_LIBRARY_DIR}) set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${LIBCXX_LIBRARY_DIR}) @@ -459,7 +456,9 @@ set(LIBCXX_COMPILE_FLAGS "") set(LIBCXX_LINK_FLAGS "") set(LIBCXX_LIBRARIES "") set(LIBCXX_ADDITIONAL_COMPILE_FLAGS "" CACHE STRING - "Additional Compile only flags which can be provided in cache") + "Additional compile flags to use when building libc++. This should be a CMake ;-delimited list of individual + compiler options to use. For options that must be passed as-is to the compiler without deduplication (e.g. + `-Xclang -foo` option groups), consider using `SHELL:` (https://cmake.org/cmake/help/latest/command/add_compile_options.html#option-de-duplication).") set(LIBCXX_ADDITIONAL_LIBRARIES "" CACHE STRING "Additional libraries libc++ is linked to which can be provided in cache") diff --git a/libcxx/appveyor-reqs-install.cmd b/libcxx/appveyor-reqs-install.cmd deleted file mode 100644 index e3bd018dd3742eb5eb613985bab70bed9be866e0..0000000000000000000000000000000000000000 --- a/libcxx/appveyor-reqs-install.cmd +++ /dev/null @@ -1,53 +0,0 @@ -@echo on - -if NOT EXIST C:\projects\deps ( - mkdir C:\projects\deps -) -cd C:\projects\deps - -::########################################################################### -:: Setup Compiler -::########################################################################### -if NOT EXIST llvm-installer.exe ( - appveyor DownloadFile https://prereleases.llvm.org/win-snapshots/LLVM-9.0.0-r357435-win32.exe -FileName llvm-installer.exe -) -if "%CLANG_VERSION%"=="ToT" ( - START /WAIT llvm-installer.exe /S /D=C:\"Program Files\LLVM" -) -if DEFINED CLANG_VERSION @set PATH="C:\Program Files\LLVM\bin";%PATH% -if DEFINED CLANG_VERSION clang-cl -v - -if DEFINED MINGW_PATH rename "C:\Program Files\Git\usr\bin\sh.exe" "sh-ignored.exe" -if DEFINED MINGW_PATH @set "PATH=%PATH:C:\Program Files (x86)\Git\bin=%" -if DEFINED MINGW_PATH @set "PATH=%PATH%;%MINGW_PATH%" -if DEFINED MINGW_PATH g++ -v - -::########################################################################### -:: Install a recent CMake -::########################################################################### -if NOT EXIST cmake ( - appveyor DownloadFile https://cmake.org/files/v3.7/cmake-3.7.2-win64-x64.zip -FileName cmake.zip - 7z x cmake.zip -oC:\projects\deps > nul - move C:\projects\deps\cmake-* C:\projects\deps\cmake - rm cmake.zip -) -@set PATH=C:\projects\deps\cmake\bin;%PATH% -cmake --version - -::########################################################################### -:: Install Ninja -::########################################################################### -if NOT EXIST ninja ( - appveyor DownloadFile https://github.com/ninja-build/ninja/releases/download/v1.6.0/ninja-win.zip -FileName ninja.zip - 7z x ninja.zip -oC:\projects\deps\ninja > nul - rm ninja.zip -) -@set PATH=C:\projects\deps\ninja;%PATH% -ninja --version - -::########################################################################### -:: Setup the cached copy of LLVM -::########################################################################### -git clone --depth=1 http://llvm.org/git/llvm.git - -@echo off diff --git a/libcxx/appveyor.yml b/libcxx/appveyor.yml deleted file mode 100644 index 8a69cb9e7dde0e6e821907d3e5719be6c4d5a9cb..0000000000000000000000000000000000000000 --- a/libcxx/appveyor.yml +++ /dev/null @@ -1,71 +0,0 @@ 
-version: '{build}' - -shallow_clone: true - -build: - verbosity: detailed - -configuration: - - Debug - -environment: - matrix: - - APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2017 - CMAKE_OPTIONS: -DCMAKE_C_COMPILER=clang-cl.exe -DCMAKE_CXX_COMPILER=clang-cl.exe - CLANG_VERSION: ToT - MSVC_SETUP_PATH: C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\VC\Auxiliary\Build\vcvarsall.bat - MSVC_SETUP_ARG: x86 - GENERATOR: Ninja - MAKE_PROGRAM: ninja - APPVEYOR_SAVE_CACHE_ON_ERROR: true -# TODO: Maybe re-enable this configuration? Do we want to support MSVC 2015's runtime? -# - APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2015 -# MINGW_PATH: C:\mingw-w64\i686-6.3.0-posix-dwarf-rt_v5-rev1\mingw32\bin -# GENERATOR: MinGW Makefiles -# MAKE_PROGRAM: mingw32-make -# APPVEYOR_SAVE_CACHE_ON_ERROR: true - -install: - ############################################################################ - # All external dependencies are installed in C:\projects\deps - ############################################################################ - - call "%APPVEYOR_BUILD_FOLDER%\\appveyor-reqs-install.cmd" - -before_build: - - if DEFINED MSVC_SETUP_PATH call "%MSVC_SETUP_PATH%" %MSVC_SETUP_ARG% - - cd %APPVEYOR_BUILD_FOLDER% - -build_script: - - md C:\projects\build-libcxx - - cd C:\projects\build-libcxx - - echo %configuration% - - ############################################################################# - # Configuration Step - ############################################################################# - - cmake -G "%GENERATOR%" %CMAKE_OPTIONS% - "-DCMAKE_BUILD_TYPE=%configuration%" - "-DLLVM_PATH=C:\projects\deps\llvm" - -DLLVM_LIT_ARGS="-v --show-xfail --show-unsupported" - %APPVEYOR_BUILD_FOLDER% - - ############################################################################# - # Build Step - ############################################################################# - - "%MAKE_PROGRAM%" - -test_script: - - "%MAKE_PROGRAM% check-cxx" - -on_failure: - - appveyor PushArtifact CMakeFiles/CMakeOutput.log - - appveyor PushArtifact CMakeFiles/CMakeError.log - -artifacts: - - path: '_build/CMakeFiles/*.log' - name: logs - -cache: - - C:\projects\deps\ninja - - C:\projects\deps\cmake - - C:\projects\deps\llvm-installer.exe diff --git a/libcxx/cmake/caches/AMDGPU.cmake b/libcxx/cmake/caches/AMDGPU.cmake index c7d6afc854a54c7b24032fbb2861a61e38358b7a..1a6bfd85a50be3c77c664049bda0bbf1042fd945 100644 --- a/libcxx/cmake/caches/AMDGPU.cmake +++ b/libcxx/cmake/caches/AMDGPU.cmake @@ -29,7 +29,7 @@ set(LIBCXXABI_USE_LLVM_UNWINDER OFF CACHE BOOL "") # Necessary compile flags for AMDGPU. set(LIBCXX_ADDITIONAL_COMPILE_FLAGS - "-nogpulib;-flto;-fconvergent-functions;-Xclang;-mcode-object-version=none" CACHE STRING "") + "-nogpulib;-flto;-fconvergent-functions;SHELL:-Xclang -mcode-object-version=none" CACHE STRING "") set(LIBCXXABI_ADDITIONAL_COMPILE_FLAGS - "-nogpulib;-flto;-fconvergent-functions;-Xclang;-mcode-object-version=none" CACHE STRING "") + "-nogpulib;-flto;-fconvergent-functions;SHELL:-Xclang -mcode-object-version=none" CACHE STRING "") set(CMAKE_REQUIRED_FLAGS "-nogpulib" CACHE STRING "") diff --git a/libcxx/docs/ReleaseNotes/20.rst b/libcxx/docs/ReleaseNotes/20.rst index 3a66aecaf57cb2d41a8fc70ad7e0fc42c3c8baf0..abd6764579e52afbdb9802214231170a4bd6d724 100644 --- a/libcxx/docs/ReleaseNotes/20.rst +++ b/libcxx/docs/ReleaseNotes/20.rst @@ -82,6 +82,9 @@ Deprecations and Removals were private but could cause ambiguity in name lookup. 
Code that expects such ambiguity will possibly not compile in LLVM 20. +- The function ``__libcpp_verbose_abort()`` is now ``noexcept``, to match ``std::terminate()``. (The combination of + ``noexcept`` and ``[[noreturn]]`` has special significance for function effects analysis.) + Upcoming Deprecations and Removals ---------------------------------- diff --git a/libcxx/docs/Status/Cxx23Issues.csv b/libcxx/docs/Status/Cxx23Issues.csv index 63e4176ecba1d75c78a628d3fb93414262cc1eb8..cfa721230e5fb8e5e6f76754396177cb50be185c 100644 --- a/libcxx/docs/Status/Cxx23Issues.csv +++ b/libcxx/docs/Status/Cxx23Issues.csv @@ -168,7 +168,7 @@ "`LWG3672 `__","``common_iterator::operator->()`` should return by value","2022-07 (Virtual)","|Complete|","19.0","" "`LWG3683 `__","``operator==`` for ``polymorphic_allocator`` cannot deduce template argument in common cases","2022-07 (Virtual)","|Complete|","20.0","" "`LWG3687 `__","``expected`` move constructor should move","2022-07 (Virtual)","|Complete|","16.0","" -"`LWG3692 `__","``zip_view::iterator``'s ``operator<=>`` is overconstrained","2022-07 (Virtual)","","","" +"`LWG3692 `__","``zip_view::iterator``'s ``operator<=>`` is overconstrained","2022-07 (Virtual)","|Complete|","20.0","" "`LWG3701 `__","Make ``formatter, charT>`` requirement explicit","2022-07 (Virtual)","|Complete|","15.0","" "`LWG3702 `__","Should ``zip_transform_view::iterator`` remove ``operator<``","2022-07 (Virtual)","","","" "`LWG3703 `__","Missing requirements for ``expected`` requires ``is_void``","2022-07 (Virtual)","|Complete|","16.0","" diff --git a/libcxx/docs/Status/Cxx23Papers.csv b/libcxx/docs/Status/Cxx23Papers.csv index da7b5881877135e0dfbf8e74a317f9d079f2d95c..c64f1c4171fce1ac29e2e2922915de10247faf78 100644 --- a/libcxx/docs/Status/Cxx23Papers.csv +++ b/libcxx/docs/Status/Cxx23Papers.csv @@ -60,7 +60,7 @@ "`P1642R11 `__","Freestanding ``[utilities]``, ``[ranges]``, and ``[iterators]``","2022-07 (Virtual)","","","" "`P1899R3 `__","``stride_view``","2022-07 (Virtual)","","","" "`P2093R14 `__","Formatted output","2022-07 (Virtual)","|Complete|","18.0","" -"`P2165R4 `__","Compatibility between ``tuple``, ``pair`` and ``tuple-like`` objects","2022-07 (Virtual)","","","" +"`P2165R4 `__","Compatibility between ``tuple``, ``pair`` and ``tuple-like`` objects","2022-07 (Virtual)","|Partial|","","Only the part for ``zip_view`` is implemented." "`P2278R4 `__","``cbegin`` should always return a constant iterator","2022-07 (Virtual)","","","" "`P2286R8 `__","Formatting Ranges","2022-07 (Virtual)","|Complete|","16.0","" "`P2291R3 `__","Add Constexpr Modifiers to Functions ``to_chars`` and ``from_chars`` for Integral Types in ```` Header","2022-07 (Virtual)","|Complete|","16.0","" diff --git a/libcxx/docs/VendorDocumentation.rst b/libcxx/docs/VendorDocumentation.rst index 3a3d1cdb1ea7ff89222d053889ea165569cd1fd5..3795381264c93b49cc99d3a29e24de66cce1f42d 100644 --- a/libcxx/docs/VendorDocumentation.rst +++ b/libcxx/docs/VendorDocumentation.rst @@ -213,11 +213,13 @@ General purpose options Output name for the shared libc++ runtime library. -.. option:: LIBCXX_ADDITIONAL_COMPILE_FLAGS:STRING +.. option:: {LIBCXX,LIBCXXABI,LIBUNWIND}_ADDITIONAL_COMPILE_FLAGS:STRING **Default**: ``""`` - Additional Compile only flags which can be provided in cache. + Additional compile flags to use when building the runtimes. This should be a CMake ``;``-delimited list of individual + compiler options to use. For options that must be passed as-is to the compiler without deduplication (e.g. 
+ ``-Xclang -foo`` option groups), consider using ``SHELL:`` as `documented here `_. .. option:: LIBCXX_ADDITIONAL_LIBRARIES:STRING @@ -346,12 +348,6 @@ The following options allow building libc++ for a different ABI version. Build and use the LLVM unwinder. Note: This option can only be used when libc++abi is the C++ ABI library used. -.. option:: LIBCXXABI_ADDITIONAL_COMPILE_FLAGS:STRING - - **Default**: ``""`` - - Additional Compile only flags which can be provided in cache. - .. option:: LIBCXXABI_ADDITIONAL_LIBRARIES:STRING **Default**: ``""`` diff --git a/libcxx/include/__ranges/zip_view.h b/libcxx/include/__ranges/zip_view.h index fe3c87a9306fe9fde38c6c8dea2149dbddb29e92..835e23cb23af1be3fa1d538194d7140230b91d02 100644 --- a/libcxx/include/__ranges/zip_view.h +++ b/libcxx/include/__ranges/zip_view.h @@ -36,7 +36,6 @@ #include <__utility/forward.h> #include <__utility/integer_sequence.h> #include <__utility/move.h> -#include <__utility/pair.h> #include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) @@ -58,22 +57,11 @@ concept __zip_is_common = (!(bidirectional_range<_Ranges> && ...) && (common_range<_Ranges> && ...)) || ((random_access_range<_Ranges> && ...) && (sized_range<_Ranges> && ...)); -template -auto __tuple_or_pair_test() -> pair<_Tp, _Up>; - -template - requires(sizeof...(_Types) != 2) -auto __tuple_or_pair_test() -> tuple<_Types...>; - -template -using __tuple_or_pair = decltype(__tuple_or_pair_test<_Types...>()); - template _LIBCPP_HIDE_FROM_ABI constexpr auto __tuple_transform(_Fun&& __f, _Tuple&& __tuple) { return std::apply( [&](_Types&&... __elements) { - return __tuple_or_pair...>( - std::invoke(__f, std::forward<_Types>(__elements))...); + return tuple...>(std::invoke(__f, std::forward<_Types>(__elements))...); }, std::forward<_Tuple>(__tuple)); } @@ -88,7 +76,7 @@ _LIBCPP_HIDE_FROM_ABI constexpr void __tuple_for_each(_Fun&& __f, _Tuple&& __tup } template -_LIBCPP_HIDE_FROM_ABI constexpr __tuple_or_pair< +_LIBCPP_HIDE_FROM_ABI constexpr tuple< invoke_result_t<_Fun&, typename tuple_element<_Indices, remove_cvref_t<_Tuple1>>::type, typename tuple_element<_Indices, remove_cvref_t<_Tuple2>>::type>...> @@ -250,10 +238,9 @@ template requires(view<_Views> && ...) 
&& (sizeof...(_Views) > 0) template class zip_view<_Views...>::__iterator : public __zip_view_iterator_category_base<_Const, _Views...> { - __tuple_or_pair>...> __current_; + tuple>...> __current_; - _LIBCPP_HIDE_FROM_ABI constexpr explicit __iterator( - __tuple_or_pair>...> __current) + _LIBCPP_HIDE_FROM_ABI constexpr explicit __iterator(tuple>...> __current) : __current_(std::move(__current)) {} template @@ -266,7 +253,7 @@ class zip_view<_Views...>::__iterator : public __zip_view_iterator_category_base public: using iterator_concept = decltype(__get_zip_view_iterator_tag<_Const, _Views...>()); - using value_type = __tuple_or_pair>...>; + using value_type = tuple>...>; using difference_type = common_type_t>...>; _LIBCPP_HIDE_FROM_ABI __iterator() = default; @@ -340,33 +327,8 @@ public: } } - _LIBCPP_HIDE_FROM_ABI friend constexpr bool operator<(const __iterator& __x, const __iterator& __y) - requires __zip_all_random_access<_Const, _Views...> - { - return __x.__current_ < __y.__current_; - } - - _LIBCPP_HIDE_FROM_ABI friend constexpr bool operator>(const __iterator& __x, const __iterator& __y) - requires __zip_all_random_access<_Const, _Views...> - { - return __y < __x; - } - - _LIBCPP_HIDE_FROM_ABI friend constexpr bool operator<=(const __iterator& __x, const __iterator& __y) - requires __zip_all_random_access<_Const, _Views...> - { - return !(__y < __x); - } - - _LIBCPP_HIDE_FROM_ABI friend constexpr bool operator>=(const __iterator& __x, const __iterator& __y) - requires __zip_all_random_access<_Const, _Views...> - { - return !(__x < __y); - } - _LIBCPP_HIDE_FROM_ABI friend constexpr auto operator<=>(const __iterator& __x, const __iterator& __y) - requires __zip_all_random_access<_Const, _Views...> && - (three_way_comparable>> && ...) + requires __zip_all_random_access<_Const, _Views...> { return __x.__current_ <=> __y.__current_; } @@ -427,10 +389,9 @@ template requires(view<_Views> && ...) && (sizeof...(_Views) > 0) template class zip_view<_Views...>::__sentinel { - __tuple_or_pair>...> __end_; + tuple>...> __end_; - _LIBCPP_HIDE_FROM_ABI constexpr explicit __sentinel( - __tuple_or_pair>...> __end) + _LIBCPP_HIDE_FROM_ABI constexpr explicit __sentinel(tuple>...> __end) : __end_(__end) {} friend class zip_view<_Views...>; diff --git a/libcxx/include/__split_buffer b/libcxx/include/__split_buffer index dfe552fbb45893bc5b7c481ce37006bea905f3e5..c4817601039f3bf199ad3413cab9b887ec7ced0e 100644 --- a/libcxx/include/__split_buffer +++ b/libcxx/include/__split_buffer @@ -80,9 +80,6 @@ public: pointer __end_; _LIBCPP_COMPRESSED_PAIR(pointer, __end_cap_, allocator_type, __alloc_); - using __alloc_ref = __add_lvalue_reference_t; - using __alloc_const_ref = __add_lvalue_reference_t; - __split_buffer(const __split_buffer&) = delete; __split_buffer& operator=(const __split_buffer&) = delete; diff --git a/libcxx/include/__verbose_abort b/libcxx/include/__verbose_abort index 244278aec652d2f98833b68a735a1168fabf8b82..73295cae42610292c16da4b70bcd972786e8d1b2 100644 --- a/libcxx/include/__verbose_abort +++ b/libcxx/include/__verbose_abort @@ -21,7 +21,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD // This function should never be called directly from the code -- it should only be called through // the _LIBCPP_VERBOSE_ABORT macro. [[__noreturn__]] _LIBCPP_AVAILABILITY_VERBOSE_ABORT _LIBCPP_OVERRIDABLE_FUNC_VIS -_LIBCPP_ATTRIBUTE_FORMAT(__printf__, 1, 2) void __libcpp_verbose_abort(const char* __format, ...); +_LIBCPP_ATTRIBUTE_FORMAT(__printf__, 1, 2) void __libcpp_verbose_abort(const char* __format, ...) 
_NOEXCEPT; // _LIBCPP_VERBOSE_ABORT(format, args...) // diff --git a/libcxx/include/future b/libcxx/include/future index dfa373d6593c7909187e22633ad9af938a4bd428..f16f4234c48966040d74a2a78e2e864ac4266901 100644 --- a/libcxx/include/future +++ b/libcxx/include/future @@ -594,7 +594,7 @@ public: _LIBCPP_HIDE_FROM_ABI void set_value_at_thread_exit(_Arg&& __arg); _LIBCPP_HIDE_FROM_ABI _Rp move(); - _LIBCPP_HIDE_FROM_ABI __add_lvalue_reference_t<_Rp> copy(); + _LIBCPP_HIDE_FROM_ABI _Rp& copy(); }; template @@ -636,7 +636,7 @@ _Rp __assoc_state<_Rp>::move() { } template -__add_lvalue_reference_t<_Rp> __assoc_state<_Rp>::copy() { +_Rp& __assoc_state<_Rp>::copy() { unique_lock __lk(this->__mut_); this->__sub_wait(__lk); if (this->__exception_ != nullptr) diff --git a/libcxx/include/new b/libcxx/include/new index 2947ee179510a5af26ebf28a2d0914b6318be860..75e2b8742df6bdef588597a41661e2546fd790c4 100644 --- a/libcxx/include/new +++ b/libcxx/include/new @@ -281,7 +281,7 @@ _LIBCPP_HIDE_FROM_ABI void* __libcpp_operator_new(_Args... __args) { } template -_LIBCPP_HIDE_FROM_ABI void __libcpp_operator_delete(_Args... __args) { +_LIBCPP_HIDE_FROM_ABI void __libcpp_operator_delete(_Args... __args) _NOEXCEPT { #if __has_builtin(__builtin_operator_new) && __has_builtin(__builtin_operator_delete) __builtin_operator_delete(__args...); #else @@ -302,7 +302,7 @@ inline _LIBCPP_HIDE_FROM_ABI void* __libcpp_allocate(size_t __size, size_t __ali } template -_LIBCPP_HIDE_FROM_ABI void __do_deallocate_handle_size(void* __ptr, size_t __size, _Args... __args) { +_LIBCPP_HIDE_FROM_ABI void __do_deallocate_handle_size(void* __ptr, size_t __size, _Args... __args) _NOEXCEPT { #if !_LIBCPP_HAS_SIZED_DEALLOCATION (void)__size; return std::__libcpp_operator_delete(__ptr, __args...); @@ -311,7 +311,7 @@ _LIBCPP_HIDE_FROM_ABI void __do_deallocate_handle_size(void* __ptr, size_t __siz #endif } -inline _LIBCPP_HIDE_FROM_ABI void __libcpp_deallocate(void* __ptr, size_t __size, size_t __align) { +inline _LIBCPP_HIDE_FROM_ABI void __libcpp_deallocate(void* __ptr, size_t __size, size_t __align) _NOEXCEPT { #if !_LIBCPP_HAS_ALIGNED_ALLOCATION (void)__align; return __do_deallocate_handle_size(__ptr, __size); @@ -325,7 +325,7 @@ inline _LIBCPP_HIDE_FROM_ABI void __libcpp_deallocate(void* __ptr, size_t __size #endif } -inline _LIBCPP_HIDE_FROM_ABI void __libcpp_deallocate_unsized(void* __ptr, size_t __align) { +inline _LIBCPP_HIDE_FROM_ABI void __libcpp_deallocate_unsized(void* __ptr, size_t __align) _NOEXCEPT { #if !_LIBCPP_HAS_ALIGNED_ALLOCATION (void)__align; return __libcpp_operator_delete(__ptr); diff --git a/libcxx/src/verbose_abort.cpp b/libcxx/src/verbose_abort.cpp index 719134e2ae554d6b27de4ec983cd41aa5ac494fa..0019063405a81039a830c4b5454cc78f3816d506 100644 --- a/libcxx/src/verbose_abort.cpp +++ b/libcxx/src/verbose_abort.cpp @@ -28,7 +28,7 @@ extern "C" void android_set_abort_message(const char* msg); _LIBCPP_BEGIN_NAMESPACE_STD -_LIBCPP_WEAK void __libcpp_verbose_abort(char const* format, ...) { +_LIBCPP_WEAK void __libcpp_verbose_abort(char const* format, ...) noexcept { // Write message to stderr. We do this before formatting into a // buffer so that we still get some information out if that fails. 
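The noexcept added to __libcpp_verbose_abort above matters to anyone replacing the function, since a replacement must match the declaration. A minimal sketch of a program-supplied override under the new signature (overriding this function is the libc++ customization point exercised by the tests below; the stderr logging chosen here is just an illustration):

    #include <__verbose_abort>
    #include <cstdarg>
    #include <cstdio>
    #include <cstdlib>

    // Matches the new declaration: printf-style, [[noreturn]], and now noexcept.
    void std::__libcpp_verbose_abort(char const *format, ...) noexcept {
      va_list args;
      va_start(args, format);
      std::vfprintf(stderr, format, args); // forward the formatted message
      va_end(args);
      std::abort(); // must not return
    }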
{ diff --git a/libcxx/test/configs/cmake-bridge.cfg.in b/libcxx/test/configs/cmake-bridge.cfg.in index bc9bb0e03911d4f9526510e8256a835970151cc1..139a6cafa2cfb3b5c69b5d67f21bc9a9e43a61f4 100644 --- a/libcxx/test/configs/cmake-bridge.cfg.in +++ b/libcxx/test/configs/cmake-bridge.cfg.in @@ -20,7 +20,7 @@ config.name = os.path.basename('@LIBCXX_TEST_CONFIG@') config.test_source_root = os.path.join('@LIBCXX_SOURCE_DIR@', 'test') config.test_format = libcxx.test.format.CxxStandardLibraryTest() config.recursiveExpansionLimit = 10 -config.test_exec_root = os.path.join('@CMAKE_BINARY_DIR@', 'test') +config.test_exec_root = os.path.join('@LIBCXX_BINARY_DIR@', 'test') # Add substitutions for bootstrapping the test suite configuration config.substitutions.append(('%{libcxx-dir}', '@LIBCXX_SOURCE_DIR@')) diff --git a/libcxx/test/libcxx/assertions/customize_verbose_abort.link-time.pass.cpp b/libcxx/test/libcxx/assertions/customize_verbose_abort.link-time.pass.cpp index 9298a1e365fca427571cdb06e4c16112f6a51ca3..21e9003c30b70400eea78e1be35d1d485aa22c85 100644 --- a/libcxx/test/libcxx/assertions/customize_verbose_abort.link-time.pass.cpp +++ b/libcxx/test/libcxx/assertions/customize_verbose_abort.link-time.pass.cpp @@ -15,9 +15,7 @@ #include <__verbose_abort> #include -void std::__libcpp_verbose_abort(char const*, ...) { - std::exit(EXIT_SUCCESS); -} +void std::__libcpp_verbose_abort(char const*, ...) _NOEXCEPT { std::exit(EXIT_SUCCESS); } int main(int, char**) { std::__libcpp_verbose_abort("%s", "message"); diff --git a/libcxx/test/std/ranges/range.adaptors/range.zip/cpo.pass.cpp b/libcxx/test/std/ranges/range.adaptors/range.zip/cpo.pass.cpp index ea5953cefa0ff31445712a75eb791b97387ca47c..bdfd58ff8bbe78f1a69ccb62c45c07999c61e91b 100644 --- a/libcxx/test/std/ranges/range.adaptors/range.zip/cpo.pass.cpp +++ b/libcxx/test/std/ranges/range.adaptors/range.zip/cpo.pass.cpp @@ -63,11 +63,7 @@ constexpr bool test() { std::ranges::zip_view>> decltype(auto) v2 = std::views::zip(v); -#ifdef _LIBCPP_VERSION // libc++ doesn't implement P2165R4 yet - static_assert(std::is_same_v, std::tuple>>); -#else static_assert(std::is_same_v, std::tuple>>); -#endif } return true; } diff --git a/libcxx/test/std/ranges/range.adaptors/range.zip/ctor.default.pass.cpp b/libcxx/test/std/ranges/range.adaptors/range.zip/ctor.default.pass.cpp index f53289621eabba78673e1b14c20eff4f752e6a2c..fdfcc02a8fb1c1e53e3c5cc04395549dbf18c2da 100644 --- a/libcxx/test/std/ranges/range.adaptors/range.zip/ctor.default.pass.cpp +++ b/libcxx/test/std/ranges/range.adaptors/range.zip/ctor.default.pass.cpp @@ -49,12 +49,8 @@ constexpr bool test() { using View = std::ranges::zip_view; View v = View(); // the default constructor is not explicit assert(v.size() == 3); - auto it = v.begin(); -#ifdef _LIBCPP_VERSION // libc++ doesn't implement P2165R4 yet - using Value = std::pair; -#else + auto it = v.begin(); using Value = std::tuple; -#endif assert(*it++ == Value(buff[0], buff[0])); assert(*it++ == Value(buff[1], buff[1])); assert(*it == Value(buff[2], buff[2])); diff --git a/libcxx/test/std/ranges/range.adaptors/range.zip/iterator/compare.pass.cpp b/libcxx/test/std/ranges/range.adaptors/range.zip/iterator/compare.pass.cpp index ed1cb0ccebd2b56be2cc4306709bec31ecf411d9..8ab7346800093ee23e4fb69ad6020a18e1ff9c2e 100644 --- a/libcxx/test/std/ranges/range.adaptors/range.zip/iterator/compare.pass.cpp +++ b/libcxx/test/std/ranges/range.adaptors/range.zip/iterator/compare.pass.cpp @@ -10,17 +10,8 @@ // friend constexpr bool operator==(const iterator& x, const iterator& 
y) // requires (equality_comparable>> && ...); -// friend constexpr bool operator<(const iterator& x, const iterator& y) -// requires all-random-access; -// friend constexpr bool operator>(const iterator& x, const iterator& y) -// requires all-random-access; -// friend constexpr bool operator<=(const iterator& x, const iterator& y) -// requires all-random-access; -// friend constexpr bool operator>=(const iterator& x, const iterator& y) -// requires all-random-access; // friend constexpr auto operator<=>(const iterator& x, const iterator& y) -// requires all-random-access && -// (three_way_comparable>> && ...); +// requires all-random-access; #include #include @@ -165,12 +156,7 @@ constexpr bool test() { using Subrange = std::ranges::subrange; static_assert(!std::three_way_comparable); using R = std::ranges::zip_view; -#ifdef _LIBCPP_VERSION - // libc++ hasn't implemented LWG-3692 "zip_view::iterator's operator<=> is overconstrained" - static_assert(!std::three_way_comparable>); -#else static_assert(std::three_way_comparable>); -#endif int a[] = {1, 2, 3, 4}; int b[] = {5, 6, 7, 8, 9}; diff --git a/libcxx/test/std/ranges/range.adaptors/range.zip/iterator/deref.pass.cpp b/libcxx/test/std/ranges/range.adaptors/range.zip/iterator/deref.pass.cpp index 569d0409721941d0b5e27317ecd594efce2fab2e..fb58aa28fbdf8d841e32b211f7ce3d01ef921252 100644 --- a/libcxx/test/std/ranges/range.adaptors/range.zip/iterator/deref.pass.cpp +++ b/libcxx/test/std/ranges/range.adaptors/range.zip/iterator/deref.pass.cpp @@ -42,11 +42,7 @@ constexpr bool test() { auto [x, y] = *it; assert(&x == &(a[0])); assert(&y == &(b[0])); -#ifdef _LIBCPP_VERSION // libc++ doesn't implement P2165R4 yet - static_assert(std::is_same_v>); -#else static_assert(std::is_same_v>); -#endif x = 5; y = 0.1; @@ -70,11 +66,7 @@ constexpr bool test() { auto it = v.begin(); assert(&(std::get<0>(*it)) == &(a[0])); assert(&(std::get<1>(*it)) == &(a[0])); -#ifdef _LIBCPP_VERSION // libc++ doesn't implement P2165R4 yet - static_assert(std::is_same_v>); -#else static_assert(std::is_same_v>); -#endif } return true; } diff --git a/libcxx/test/std/ranges/range.adaptors/range.zip/iterator/member_types.compile.pass.cpp b/libcxx/test/std/ranges/range.adaptors/range.zip/iterator/member_types.compile.pass.cpp index c19f6c2b16524fa6c2871587c6f8a9250102eb99..2f2f0fc4f4e3315d239feb4efc13bc308ec7686c 100644 --- a/libcxx/test/std/ranges/range.adaptors/range.zip/iterator/member_types.compile.pass.cpp +++ b/libcxx/test/std/ranges/range.adaptors/range.zip/iterator/member_types.compile.pass.cpp @@ -65,7 +65,7 @@ struct ConstVeryDifferentRange { void test() { int buffer[] = {1, 2, 3, 4}; { - // 2 views should have pair value_type + // 2 views should have 2-tuple value_type // random_access_iterator_tag std::ranges::zip_view v(buffer, buffer); using Iter = decltype(v.begin()); @@ -73,11 +73,7 @@ void test() { static_assert(std::is_same_v); static_assert(std::is_same_v); static_assert(std::is_same_v); -#ifdef _LIBCPP_VERSION // libc++ doesn't implement P2165R4 yet - static_assert(std::is_same_v>); -#else static_assert(std::is_same_v>); -#endif static_assert(HasIterCategory); } @@ -124,11 +120,7 @@ void test() { static_assert(std::is_same_v); static_assert(std::is_same_v); static_assert(std::is_same_v); -#ifdef _LIBCPP_VERSION // libc++ doesn't implement P2165R4 yet - static_assert(std::is_same_v>>); -#else static_assert(std::is_same_v>>); -#endif static_assert(HasIterCategory); } @@ -169,11 +161,7 @@ void test() { // value_type of multiple views with different value_type 
std::ranges::zip_view v{foos, bars}; using Iter = decltype(v.begin()); -#ifdef _LIBCPP_VERSION // libc++ doesn't implement P2165R4 yet - static_assert(std::is_same_v>); -#else static_assert(std::is_same_v>); -#endif } { diff --git a/libcxx/test/std/ranges/range.adaptors/range.zip/iterator/subscript.pass.cpp b/libcxx/test/std/ranges/range.adaptors/range.zip/iterator/subscript.pass.cpp index 1538d763205db1228e72628fda754aca2df8a873..ba3abfa2a436953305821013c3cc3637605df104 100644 --- a/libcxx/test/std/ranges/range.adaptors/range.zip/iterator/subscript.pass.cpp +++ b/libcxx/test/std/ranges/range.adaptors/range.zip/iterator/subscript.pass.cpp @@ -27,11 +27,7 @@ constexpr bool test() { assert(it[2] == *(it + 2)); assert(it[4] == *(it + 4)); -#ifdef _LIBCPP_VERSION // libc++ doesn't implement P2165R4 yet - static_assert(std::is_same_v>); -#else static_assert(std::is_same_v>); -#endif } { @@ -42,11 +38,7 @@ constexpr bool test() { assert(it[2] == *(it + 2)); assert(it[4] == *(it + 4)); -#ifdef _LIBCPP_VERSION // libc++ doesn't implement P2165R4 yet - static_assert(std::is_same_v>); -#else static_assert(std::is_same_v>); -#endif } { diff --git a/libcxx/test/support/check_assertion.h b/libcxx/test/support/check_assertion.h index 47ebfeeeefc0f4feb065785bbe9441ccef3e99d0..a279400d651b48a5a3c73c8e105eca67fdef0d8c 100644 --- a/libcxx/test/support/check_assertion.h +++ b/libcxx/test/support/check_assertion.h @@ -334,7 +334,7 @@ private: }; #ifdef _LIBCPP_VERSION -void std::__libcpp_verbose_abort(char const* format, ...) { +void std::__libcpp_verbose_abort(char const* format, ...) noexcept { va_list args; va_start(args, format); diff --git a/libcxxabi/CMakeLists.txt b/libcxxabi/CMakeLists.txt index ac1ee69d5f11c93013d28c7d29b30f42ff801ef9..da0e8b286cddc1429023bbad8fac6b39b066acde 100644 --- a/libcxxabi/CMakeLists.txt +++ b/libcxxabi/CMakeLists.txt @@ -222,8 +222,7 @@ set(LIBCXXABI_CXX_FLAGS "") set(LIBCXXABI_COMPILE_FLAGS "") set(LIBCXXABI_LINK_FLAGS "") set(LIBCXXABI_LIBRARIES "") -set(LIBCXXABI_ADDITIONAL_COMPILE_FLAGS "" CACHE STRING - "Additional Compile only flags which can be provided in cache") +set(LIBCXXABI_ADDITIONAL_COMPILE_FLAGS "" CACHE STRING "See documentation LIBCXX_ADDITIONAL_COMPILE_FLAGS") set(LIBCXXABI_ADDITIONAL_LIBRARIES "" CACHE STRING "Additional libraries libc++abi is linked to which can be provided in cache") diff --git a/libcxxabi/test/configs/cmake-bridge.cfg.in b/libcxxabi/test/configs/cmake-bridge.cfg.in index b00eb642750c9b6bf963ba46b7493b51faecbdd6..1dd6b3367e4387739d293623ed11cfed4504bb8f 100644 --- a/libcxxabi/test/configs/cmake-bridge.cfg.in +++ b/libcxxabi/test/configs/cmake-bridge.cfg.in @@ -21,7 +21,7 @@ config.name = os.path.basename('@LIBCXXABI_TEST_CONFIG@') config.test_source_root = os.path.join('@LIBCXXABI_SOURCE_DIR@', 'test') config.test_format = libcxx.test.format.CxxStandardLibraryTest() config.recursiveExpansionLimit = 10 -config.test_exec_root = os.path.join('@CMAKE_BINARY_DIR@', 'test') +config.test_exec_root = os.path.join('@LIBCXXABI_BINARY_DIR@', 'test') # TODO: This is a non-standard Lit attribute and we should have another way of accessing this. 
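Putting the zip_view changes above together (P2165R4's tuple value_type and LWG3692's relaxed operator<=>), here is a small self-contained check of the new behavior; it assumes a C++23 standard library that implements both, such as libc++ after this patch:

    #include <compare>
    #include <ranges>
    #include <tuple>
    #include <type_traits>
    #include <vector>

    int main() {
      std::vector<int> a{1, 2, 3};
      std::vector<double> b{0.5, 1.5, 2.5};
      auto z = std::views::zip(a, b);

      // P2165R4: zipping two ranges now yields std::tuple, not std::pair.
      static_assert(std::is_same_v<std::ranges::range_value_t<decltype(z)>,
                                   std::tuple<int, double>>);

      // LWG3692: the iterator is three-way comparable whenever all the
      // underlying ranges are random-access; no separate
      // three_way_comparable constraint on the underlying iterators.
      static_assert(std::three_way_comparable<decltype(z.begin())>);
      return 0;
    }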
config.host_triple = '@LLVM_HOST_TRIPLE@' diff --git a/libunwind/CMakeLists.txt b/libunwind/CMakeLists.txt index b911f482fc26b23d3b6ca8ab6af09aced39f14fc..ea06dc8a67b949cd9a06ff6eb876c288e0502d56 100644 --- a/libunwind/CMakeLists.txt +++ b/libunwind/CMakeLists.txt @@ -162,8 +162,7 @@ set(LIBUNWIND_C_FLAGS "") set(LIBUNWIND_CXX_FLAGS "") set(LIBUNWIND_COMPILE_FLAGS "") set(LIBUNWIND_LINK_FLAGS "") -set(LIBUNWIND_ADDITIONAL_COMPILE_FLAGS "" CACHE STRING - "Additional Compile only flags which can be provided in cache") +set(LIBUNWIND_ADDITIONAL_COMPILE_FLAGS "" CACHE STRING "See documentation for LIBCXX_ADDITIONAL_COMPILE_FLAGS") set(LIBUNWIND_ADDITIONAL_LIBRARIES "" CACHE STRING "Additional libraries libunwind is linked to which can be provided in cache") diff --git a/libunwind/src/UnwindCursor.hpp b/libunwind/src/UnwindCursor.hpp index ce6dced535e781bb2cdc662479e7549fee366bec..2a3aba28fb6ca50b34ba6211d3b9e5fe718903ed 100644 --- a/libunwind/src/UnwindCursor.hpp +++ b/libunwind/src/UnwindCursor.hpp @@ -2033,7 +2033,6 @@ typedef _Unwind_Reason_Code __xlcxx_personality_v0_t(int, _Unwind_Action, uint64_t, _Unwind_Exception *, struct _Unwind_Context *); -__attribute__((__weak__)) __xlcxx_personality_v0_t __xlcxx_personality_v0; } static __xlcxx_personality_v0_t *xlcPersonalityV0; @@ -2126,42 +2125,35 @@ bool UnwindCursor::getInfoFromTBTable(pint_t pc, R ®isters) { // function __xlcxx_personality_v0(), which is the personality for the state // table and is exported from libc++abi, is directly assigned as the // handler here. When a legacy XLC++ frame is encountered, the symbol - // is resolved dynamically using dlopen() to avoid hard dependency from - // libunwind on libc++abi. + // is resolved dynamically using dlopen() to avoid a hard dependency of + // libunwind on libc++abi in cases such as non-C++ applications. // Resolve the function pointer to the state table personality if it has - // not already. + // not already been done. if (xlcPersonalityV0 == NULL) { xlcPersonalityV0InitLock.lock(); if (xlcPersonalityV0 == NULL) { - // If libc++abi is statically linked in, symbol __xlcxx_personality_v0 - // has been resolved at the link time. - xlcPersonalityV0 = &__xlcxx_personality_v0; + // Resolve __xlcxx_personality_v0 using dlopen(). + const char *libcxxabi = "libc++abi.a(libc++abi.so.1)"; + void *libHandle; + // The AIX dlopen() sets errno to 0 when it is successful, which + // clobbers the value of errno from the user code. This is an AIX + // bug because according to POSIX it should not set errno to 0. To + // workaround before AIX fixes the bug, errno is saved and restored. + int saveErrno = errno; + libHandle = dlopen(libcxxabi, RTLD_MEMBER | RTLD_NOW); + if (libHandle == NULL) { + _LIBUNWIND_TRACE_UNWINDING("dlopen() failed with errno=%d\n", errno); + assert(0 && "dlopen() failed"); + } + xlcPersonalityV0 = reinterpret_cast<__xlcxx_personality_v0_t *>( + dlsym(libHandle, "__xlcxx_personality_v0")); if (xlcPersonalityV0 == NULL) { - // libc++abi is dynamically linked. Resolve __xlcxx_personality_v0 - // using dlopen(). - const char libcxxabi[] = "libc++abi.a(libc++abi.so.1)"; - void *libHandle; - // The AIX dlopen() sets errno to 0 when it is successful, which - // clobbers the value of errno from the user code. This is an AIX - // bug because according to POSIX it should not set errno to 0. To - // workaround before AIX fixes the bug, errno is saved and restored. 
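The errno dance described in the libunwind comment above is easy to get wrong, so here is the pattern in isolation. RTLD_MEMBER is an AIX extension for opening a shared object inside an archive, so this sketch only compiles there, and the wrapper name is mine:

    #include <cerrno>
    #include <dlfcn.h>

    // AIX's dlopen() sets errno to 0 on success, clobbering the caller's
    // value, so save errno before the call and put it back afterwards.
    void *dlopen_preserving_errno(const char *name) {
      int saved_errno = errno;
      void *handle = dlopen(name, RTLD_MEMBER | RTLD_NOW);
      if (handle != nullptr)
        errno = saved_errno; // success: undo the spurious errno = 0
      return handle;         // failure: keep dlopen's errno for diagnostics
    }

The surrounding code is also a classic double-checked initialization: xlcPersonalityV0 is tested once without the lock and once under xlcPersonalityV0InitLock, so the dlopen() cost is paid at most once.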
- int saveErrno = errno; - libHandle = dlopen(libcxxabi, RTLD_MEMBER | RTLD_NOW); - if (libHandle == NULL) { - _LIBUNWIND_TRACE_UNWINDING("dlopen() failed with errno=%d\n", - errno); - assert(0 && "dlopen() failed"); - } - xlcPersonalityV0 = reinterpret_cast<__xlcxx_personality_v0_t *>( - dlsym(libHandle, "__xlcxx_personality_v0")); - if (xlcPersonalityV0 == NULL) { - _LIBUNWIND_TRACE_UNWINDING("dlsym() failed with errno=%d\n", errno); - assert(0 && "dlsym() failed"); - } - dlclose(libHandle); - errno = saveErrno; + _LIBUNWIND_TRACE_UNWINDING("dlsym() failed with errno=%d\n", errno); + assert(0 && "dlsym() failed"); } + dlclose(libHandle); + errno = saveErrno; } xlcPersonalityV0InitLock.unlock(); } diff --git a/libunwind/test/aix_runtime_link.pass.cpp b/libunwind/test/aix_runtime_link.pass.cpp new file mode 100644 index 0000000000000000000000000000000000000000..deb192c07981eb0c76794e452eba0337d20fab35 --- /dev/null +++ b/libunwind/test/aix_runtime_link.pass.cpp @@ -0,0 +1,20 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// Test that libunwind loads successfully independently of libc++abi with +// runtime linking on AIX. + +// REQUIRES: target={{.+}}-aix{{.*}} +// ADDITIONAL_COMPILE_FLAGS: -Wl,-brtl + +#include +extern "C" int printf(const char *, ...); +int main(void) { + void *fp = (void *)&_Unwind_Backtrace; + printf("%p\n", fp); +} diff --git a/libunwind/test/configs/cmake-bridge.cfg.in b/libunwind/test/configs/cmake-bridge.cfg.in index f627f401f9f7b5eca7a399a26f961ca04e3653f4..20b61e788ab180b3a3ad3a4c3f8dd7cb02d1e946 100644 --- a/libunwind/test/configs/cmake-bridge.cfg.in +++ b/libunwind/test/configs/cmake-bridge.cfg.in @@ -20,7 +20,7 @@ config.name = os.path.basename('@LIBUNWIND_TEST_CONFIG@') config.test_source_root = os.path.join('@LIBUNWIND_SOURCE_DIR@', 'test') config.test_format = libcxx.test.format.CxxStandardLibraryTest() config.recursiveExpansionLimit = 10 -config.test_exec_root = os.path.join('@CMAKE_BINARY_DIR@', 'test') +config.test_exec_root = os.path.join('@LIBUNWIND_BINARY_DIR@', 'test') # Add a few features that are common to all the configurations if @LIBUNWIND_USES_ARM_EHABI@: diff --git a/lld/COFF/Driver.cpp b/lld/COFF/Driver.cpp index 85a58a3677181aec45f70e8ebe2e7fc8303112f4..12e1ae6281127099a188bd1a110da33393443c09 100644 --- a/lld/COFF/Driver.cpp +++ b/lld/COFF/Driver.cpp @@ -1340,7 +1340,7 @@ void LinkerDriver::maybeCreateECExportThunk(StringRef name, Symbol *&sym) { if (!sym) return; if (auto undef = dyn_cast(sym)) - def = undef->getWeakAlias(); + def = undef->getDefinedWeakAlias(); else def = dyn_cast(sym); if (!def) @@ -1376,7 +1376,7 @@ void LinkerDriver::createECExportThunks() { continue; Defined *targetSym; if (auto undef = dyn_cast(sym)) - targetSym = undef->getWeakAlias(); + targetSym = undef->getDefinedWeakAlias(); else targetSym = dyn_cast(sym); if (!targetSym) diff --git a/lld/COFF/Symbols.cpp b/lld/COFF/Symbols.cpp index 567c2b93776c94f60755860172ecc55d1a7b934c..89f2da02bdcf42ce52fff4e086337b91a629eb70 100644 --- a/lld/COFF/Symbols.cpp +++ b/lld/COFF/Symbols.cpp @@ -112,12 +112,12 @@ DefinedImportThunk::DefinedImportThunk(COFFLinkerContext &ctx, StringRef name, ImportThunkChunk *chunk) : Defined(DefinedImportThunkKind, 
name), wrappedSym(s), data(chunk) {} -Defined *Undefined::getWeakAlias() { +Symbol *Undefined::getWeakAlias() { // A weak alias may be a weak alias to another symbol, so check recursively. DenseSet weakChain; for (Symbol *a = weakAlias; a; a = cast(a)->weakAlias) { - if (auto *d = dyn_cast(a)) - return d; + if (!isa(a)) + return a; if (!weakChain.insert(a).second) break; // We have a cycle. } @@ -125,7 +125,7 @@ Defined *Undefined::getWeakAlias() { } bool Undefined::resolveWeakAlias() { - Defined *d = getWeakAlias(); + Defined *d = getDefinedWeakAlias(); if (!d) return false; diff --git a/lld/COFF/Symbols.h b/lld/COFF/Symbols.h index 9b21e09bf83a42fe1202eabda281e349d3f81302..a898ebf05fd8093d0a76d9a850c40646f339b64d 100644 --- a/lld/COFF/Symbols.h +++ b/lld/COFF/Symbols.h @@ -340,7 +340,10 @@ public: // If this symbol is external weak, try to resolve it to a defined // symbol by searching the chain of fallback symbols. Returns the symbol if // successful, otherwise returns null. - Defined *getWeakAlias(); + Symbol *getWeakAlias(); + Defined *getDefinedWeakAlias() { + return dyn_cast_or_null(getWeakAlias()); + } // If this symbol is external weak, replace this object with aliased symbol. bool resolveWeakAlias(); diff --git a/lld/ELF/Driver.cpp b/lld/ELF/Driver.cpp index afacbbfe9099cc80ffc4f23f0ebdb18ca80f62cd..fb77e67e9fc5cabef3164f3010e744dd3a99d765 100644 --- a/lld/ELF/Driver.cpp +++ b/lld/ELF/Driver.cpp @@ -2471,7 +2471,7 @@ static void readSymbolPartitionSection(Ctx &ctx, InputSectionBase *s) { StringRef partName = reinterpret_cast(s->content().data()); for (Partition &part : ctx.partitions) { if (part.name == partName) { - sym->partition = part.getNumber(); + sym->partition = part.getNumber(ctx); return; } } @@ -2500,7 +2500,7 @@ static void readSymbolPartitionSection(Ctx &ctx, InputSectionBase *s) { ctx.partitions.emplace_back(ctx); Partition &newPart = ctx.partitions.back(); newPart.name = partName; - sym->partition = newPart.getNumber(); + sym->partition = newPart.getNumber(ctx); } static void markBuffersAsDontNeed(Ctx &ctx, bool skipLinkedOutput) { diff --git a/lld/ELF/InputSection.h b/lld/ELF/InputSection.h index 1a5bc629d8b093adeabf0941cf5f4dd398e017e9..dfcc7c8bc852a97f5121c86ab0a7f6b4c303c74f 100644 --- a/lld/ELF/InputSection.h +++ b/lld/ELF/InputSection.h @@ -89,7 +89,7 @@ public: // The 1-indexed partition that this section is assigned to by the garbage // collector, or 0 if this section is dead. Normally there is only one // partition, so this will either be 0 or 1. - elf::Partition &getPartition() const; + elf::Partition &getPartition(Ctx &) const; // These corresponds to the fields in Elf_Shdr. 
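For readers following the lld/COFF change above: getWeakAlias() now walks the alias chain until it finds any symbol that is not an Undefined (rather than requiring a Defined), still guarding against cycles with a visited set. The shape of the walk, reduced to a standalone sketch (the types here are illustrative stand-ins for lld's Symbol/Undefined):

    #include <unordered_set>

    struct Sym {
      Sym *weakAlias = nullptr; // next link in the weak-alias chain, if any
      bool isUndefined = false;
    };

    Sym *followWeakAlias(Sym *sym) {
      std::unordered_set<Sym *> seen;
      for (Sym *a = sym->weakAlias; a; a = a->weakAlias) {
        if (!a->isUndefined)
          return a;             // first non-undefined symbol ends the walk
        if (!seen.insert(a).second)
          break;                // cycle in the alias chain: give up
      }
      return nullptr;
    }

getDefinedWeakAlias() then simply narrows the result back to Defined for the callers (resolveWeakAlias, the EC export thunks) that still need a definition.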
uint64_t flags; diff --git a/lld/ELF/Relocations.cpp b/lld/ELF/Relocations.cpp index 73c19c8385a824d5fbfbae120a52904236c1b36e..0188d658f9210e773842de0b60422961739b2605 100644 --- a/lld/ELF/Relocations.cpp +++ b/lld/ELF/Relocations.cpp @@ -878,7 +878,7 @@ template static void addRelativeReloc(Ctx &ctx, InputSectionBase &isec, uint64_t offsetInSec, Symbol &sym, int64_t addend, RelExpr expr, RelType type) { - Partition &part = isec.getPartition(); + Partition &part = isec.getPartition(ctx); if (sym.isTagged()) { std::lock_guard lock(relocMutex); @@ -1159,7 +1159,7 @@ void RelocationScanner::processAux(RelExpr expr, RelType type, uint64_t offset, if (ctx.arg.emachine == EM_MIPS && rel == ctx.target->symbolicRel) rel = ctx.target->relativeRel; std::lock_guard lock(relocMutex); - Partition &part = sec->getPartition(); + Partition &part = sec->getPartition(ctx); if (ctx.arg.emachine == EM_AARCH64 && type == R_AARCH64_AUTH_ABS64) { // For a preemptible symbol, we can't use a relative relocation. For an // undefined symbol, we can't compute offset at link-time and use a diff --git a/lld/ELF/SymbolTable.cpp b/lld/ELF/SymbolTable.cpp index 02814e0413909447a196a7a2bbfd15b9b759c6b8..674b1ef983f8430e33299e1799b3c812a9fd3623 100644 --- a/lld/ELF/SymbolTable.cpp +++ b/lld/ELF/SymbolTable.cpp @@ -337,7 +337,7 @@ void SymbolTable::scanVersionScript() { globalAsteriskFound = !isLocal; } } - assignWildcard(pat, isLocal ? VER_NDX_LOCAL : ver->id, ver->name); + assignWildcard(pat, isLocal ? (uint16_t)VER_NDX_LOCAL : ver->id, ver->name); }; for (VersionDefinition &v : llvm::reverse(ctx.arg.versionDefinitions)) { for (SymbolVersion &pat : v.nonLocalPatterns) diff --git a/lld/ELF/SyntheticSections.cpp b/lld/ELF/SyntheticSections.cpp index d99aeac50ca5ce5c258f83106de4b8460b22725b..e18e7a32df86c743dcfa744cd2f46e7ac572d19e 100644 --- a/lld/ELF/SyntheticSections.cpp +++ b/lld/ELF/SyntheticSections.cpp @@ -563,7 +563,7 @@ SmallVector EhFrameSection::getFdeData() const { uint8_t *buf = ctx.bufferStart + getParent()->offset + outSecOff; SmallVector ret; - uint64_t va = getPartition().ehFrameHdr->getVA(); + uint64_t va = getPartition(ctx).ehFrameHdr->getVA(); for (CieRecord *rec : cieRecords) { uint8_t enc = getFdeEncoding(rec->cie); for (EhSectionPiece *fde : rec->fdes) { @@ -650,8 +650,8 @@ void EhFrameSection::writeTo(uint8_t *buf) { for (EhInputSection *s : sections) ctx.target->relocateAlloc(*s, buf); - if (getPartition().ehFrameHdr && getPartition().ehFrameHdr->getParent()) - getPartition().ehFrameHdr->write(); + if (getPartition(ctx).ehFrameHdr && getPartition(ctx).ehFrameHdr->getParent()) + getPartition(ctx).ehFrameHdr->write(); } GotSection::GotSection(Ctx &ctx) @@ -1325,7 +1325,7 @@ static uint64_t addPltRelSz(Ctx &ctx) { return ctx.in.relaPlt->getSize(); } template std::vector> DynamicSection::computeContents() { - elf::Partition &part = getPartition(); + elf::Partition &part = getPartition(ctx); bool isMain = part.name.empty(); std::vector> entries; @@ -1586,7 +1586,7 @@ DynamicSection::computeContents() { } template void DynamicSection::finalizeContents() { - if (OutputSection *sec = getPartition().dynStrTab->getParent()) + if (OutputSection *sec = getPartition(ctx).dynStrTab->getParent()) getParent()->link = sec->sectionIndex; this->size = computeContents().size() * this->entsize; } @@ -1688,7 +1688,7 @@ void RelocationBaseSection::partitionRels() { } void RelocationBaseSection::finalizeContents() { - SymbolTableBaseSection *symTab = getPartition().dynSymTab.get(); + SymbolTableBaseSection *symTab = 
getPartition(ctx).dynSymTab.get(); // When linking glibc statically, .rel{,a}.plt contains R_*_IRELATIVE // relocations due to IFUNC (e.g. strcpy). sh_link will be set to 0 in that @@ -1712,7 +1712,7 @@ void DynamicReloc::computeRaw(Ctx &ctx, SymbolTableBaseSection *symt) { } void RelocationBaseSection::computeRels() { - SymbolTableBaseSection *symTab = getPartition().dynSymTab.get(); + SymbolTableBaseSection *symTab = getPartition(ctx).dynSymTab.get(); parallelForEach(relocs, [&ctx = ctx, symTab](DynamicReloc &rel) { rel.computeRaw(ctx, symTab); }); @@ -1852,7 +1852,7 @@ bool AndroidPackedRelocationSection::updateAllocSize(Ctx &ctx) { for (const DynamicReloc &rel : relocs) { Elf_Rela r; r.r_offset = rel.getOffset(); - r.setSymbolAndType(rel.getSymIndex(getPartition().dynSymTab.get()), + r.setSymbolAndType(rel.getSymIndex(getPartition(ctx).dynSymTab.get()), rel.type, false); r.r_addend = ctx.arg.isRela ? rel.computeAddend(ctx) : 0; @@ -2162,9 +2162,9 @@ void SymbolTableBaseSection::finalizeContents() { // Because the first symbol entry is a null entry, 1 is the first. getParent()->info = 1; - if (getPartition().gnuHashTab) { + if (getPartition(ctx).gnuHashTab) { // NB: It also sorts Symbols to meet the GNU hash table requirements. - getPartition().gnuHashTab->addSymbols(symbols); + getPartition(ctx).gnuHashTab->addSymbols(symbols); } else if (ctx.arg.emachine == EM_MIPS) { sortMipsSymbols(ctx, symbols); } @@ -2416,7 +2416,7 @@ GnuHashTableSection::GnuHashTableSection(Ctx &ctx) ".gnu.hash") {} void GnuHashTableSection::finalizeContents() { - if (OutputSection *sec = getPartition().dynSymTab->getParent()) + if (OutputSection *sec = getPartition(ctx).dynSymTab->getParent()) getParent()->link = sec->sectionIndex; // Computes bloom filter size in word size. We want to allocate 12 @@ -2438,7 +2438,7 @@ void GnuHashTableSection::writeTo(uint8_t *buf) { // Write a header. write32(ctx, buf, nBuckets); write32(ctx, buf + 4, - getPartition().dynSymTab->getNumSymbols() - symbols.size()); + getPartition(ctx).dynSymTab->getNumSymbols() - symbols.size()); write32(ctx, buf + 8, maskWords); write32(ctx, buf + 12, Shift2); buf += 16; @@ -2474,7 +2474,7 @@ void GnuHashTableSection::writeTo(uint8_t *buf) { // Write a hash bucket. Hash buckets contain indices in the following hash // value table. 
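For reference while reading the .gnu.hash writer above, this is the hash function the buckets and bloom filter are built from (the classic DJB-style h = h * 33 + c used by SHT_GNU_HASH):

    #include <cstdint>
    #include <string_view>

    // GNU symbol hash: each dynamic symbol's name is hashed with this;
    // the symbol lands in bucket (h % nBuckets), and two bloom-filter
    // bits are derived from h and (h >> Shift2).
    uint32_t gnuHash(std::string_view name) {
      uint32_t h = 5381;
      for (unsigned char c : name)
        h = h * 33 + c;
      return h;
    }

This is also why addSymbols() must reorder the dynamic symbols, as the comment above notes: the GNU hash format requires the hashed symbols to be grouped by bucket at the end of .dynsym.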
write32(ctx, buckets + i->bucketIdx, - getPartition().dynSymTab->getSymbolIndex(*i->sym)); + getPartition(ctx).dynSymTab->getSymbolIndex(*i->sym)); oldBucket = i->bucketIdx; } } @@ -2527,7 +2527,7 @@ HashTableSection::HashTableSection(Ctx &ctx) } void HashTableSection::finalizeContents() { - SymbolTableBaseSection *symTab = getPartition().dynSymTab.get(); + SymbolTableBaseSection *symTab = getPartition(ctx).dynSymTab.get(); if (OutputSection *sec = symTab->getParent()) getParent()->link = sec->sectionIndex; @@ -2541,7 +2541,7 @@ void HashTableSection::finalizeContents() { } void HashTableSection::writeTo(uint8_t *buf) { - SymbolTableBaseSection *symTab = getPartition().dynSymTab.get(); + SymbolTableBaseSection *symTab = getPartition(ctx).dynSymTab.get(); unsigned numSymbols = symTab->getNumSymbols(); uint32_t *p = reinterpret_cast(buf); @@ -3667,14 +3667,14 @@ void EhFrameHeader::writeTo(uint8_t *buf) { void EhFrameHeader::write() { uint8_t *buf = ctx.bufferStart + getParent()->offset + outSecOff; using FdeData = EhFrameSection::FdeData; - SmallVector fdes = getPartition().ehFrame->getFdeData(); + SmallVector fdes = getPartition(ctx).ehFrame->getFdeData(); buf[0] = 1; buf[1] = DW_EH_PE_pcrel | DW_EH_PE_sdata4; buf[2] = DW_EH_PE_udata4; buf[3] = DW_EH_PE_datarel | DW_EH_PE_sdata4; write32(ctx, buf + 4, - getPartition().ehFrame->getParent()->addr - this->getVA() - 4); + getPartition(ctx).ehFrame->getParent()->addr - this->getVA() - 4); write32(ctx, buf + 8, fdes.size()); buf += 12; @@ -3687,11 +3687,11 @@ void EhFrameHeader::write() { size_t EhFrameHeader::getSize() const { // .eh_frame_hdr has a 12 bytes header followed by an array of FDEs. - return 12 + getPartition().ehFrame->numFdes * 8; + return 12 + getPartition(ctx).ehFrame->numFdes * 8; } bool EhFrameHeader::isNeeded() const { - return isLive() && getPartition().ehFrame->isNeeded(); + return isLive() && getPartition(ctx).ehFrame->isNeeded(); } VersionDefinitionSection::VersionDefinitionSection(Ctx &ctx) @@ -3699,19 +3699,19 @@ VersionDefinitionSection::VersionDefinitionSection(Ctx &ctx) ".gnu.version_d") {} StringRef VersionDefinitionSection::getFileDefName() { - if (!getPartition().name.empty()) - return getPartition().name; + if (!getPartition(ctx).name.empty()) + return getPartition(ctx).name; if (!ctx.arg.soName.empty()) return ctx.arg.soName; return ctx.arg.outputFile; } void VersionDefinitionSection::finalizeContents() { - fileDefNameOff = getPartition().dynStrTab->addString(getFileDefName()); + fileDefNameOff = getPartition(ctx).dynStrTab->addString(getFileDefName()); for (const VersionDefinition &v : namedVersionDefs(ctx)) - verDefNameOffs.push_back(getPartition().dynStrTab->addString(v.name)); + verDefNameOffs.push_back(getPartition(ctx).dynStrTab->addString(v.name)); - if (OutputSection *sec = getPartition().dynStrTab->getParent()) + if (OutputSection *sec = getPartition(ctx).dynStrTab->getParent()) getParent()->link = sec->sectionIndex; // sh_info should be set to the number of definitions. This fact is missed in @@ -3765,16 +3765,16 @@ VersionTableSection::VersionTableSection(Ctx &ctx) void VersionTableSection::finalizeContents() { // At the moment of june 2016 GNU docs does not mention that sh_link field // should be set, but Sun docs do. Also readelf relies on this field. 
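The EhFrameHeader code above emits the GNU .eh_frame_hdr format; its fixed 12-byte header is where the 12 + numFdes * 8 in getSize() comes from. Sketched as a struct for illustration (the real code writes the fields individually with write32):

    #include <cstdint>

    // Layout of .eh_frame_hdr as written above.
    struct EhFrameHdr {
      uint8_t version;       // 1
      uint8_t ehFramePtrEnc; // DW_EH_PE_pcrel | DW_EH_PE_sdata4
      uint8_t fdeCountEnc;   // DW_EH_PE_udata4
      uint8_t tableEnc;      // DW_EH_PE_datarel | DW_EH_PE_sdata4
      int32_t ehFramePtr;    // address of .eh_frame, PC-relative
      uint32_t fdeCount;     // number of table entries that follow
      // Then fdeCount pairs, sorted by PC for binary search:
      // struct { int32_t initialPC; int32_t fdeAddr; } table[];
    };

Both table fields use the datarel|sdata4 encoding, hence 8 bytes per FDE.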
- getParent()->link = getPartition().dynSymTab->getParent()->sectionIndex; + getParent()->link = getPartition(ctx).dynSymTab->getParent()->sectionIndex; } size_t VersionTableSection::getSize() const { - return (getPartition().dynSymTab->getSymbols().size() + 1) * 2; + return (getPartition(ctx).dynSymTab->getSymbols().size() + 1) * 2; } void VersionTableSection::writeTo(uint8_t *buf) { buf += 2; - for (const SymbolTableEntry &s : getPartition().dynSymTab->getSymbols()) { + for (const SymbolTableEntry &s : getPartition(ctx).dynSymTab->getSymbols()) { // For an unextracted lazy symbol (undefined weak), it must have been // converted to Undefined and have VER_NDX_GLOBAL version here. assert(!s.sym->isLazy()); @@ -3785,7 +3785,7 @@ void VersionTableSection::writeTo(uint8_t *buf) { bool VersionTableSection::isNeeded() const { return isLive() && - (getPartition().verDef || getPartition().verNeed->isNeeded()); + (getPartition(ctx).verDef || getPartition(ctx).verNeed->isNeeded()); } void elf::addVerneed(Ctx &ctx, Symbol &ss) { @@ -3817,7 +3817,7 @@ template void VersionNeedSection::finalizeContents() { continue; verneeds.emplace_back(); Verneed &vn = verneeds.back(); - vn.nameStrTab = getPartition().dynStrTab->addString(f->soName); + vn.nameStrTab = getPartition(ctx).dynStrTab->addString(f->soName); bool isLibc = ctx.arg.relrGlibc && f->soName.starts_with("libc.so."); bool isGlibc2 = false; for (unsigned i = 0; i != f->vernauxs.size(); ++i) { @@ -3829,17 +3829,17 @@ template void VersionNeedSection::finalizeContents() { if (isLibc && ver.starts_with("GLIBC_2.")) isGlibc2 = true; vn.vernauxs.push_back({verdef->vd_hash, f->vernauxs[i], - getPartition().dynStrTab->addString(ver)}); + getPartition(ctx).dynStrTab->addString(ver)}); } if (isGlibc2) { const char *ver = "GLIBC_ABI_DT_RELR"; vn.vernauxs.push_back({hashSysV(ver), ++SharedFile::vernauxNum + getVerDefNum(ctx), - getPartition().dynStrTab->addString(ver)}); + getPartition(ctx).dynStrTab->addString(ver)}); } } - if (OutputSection *sec = getPartition().dynStrTab->getParent()) + if (OutputSection *sec = getPartition(ctx).dynStrTab->getParent()) getParent()->link = sec->sectionIndex; getParent()->info = verneeds.size(); } @@ -3995,7 +3995,7 @@ template void elf::splitSections(Ctx &ctx) { void elf::combineEhSections(Ctx &ctx) { llvm::TimeTraceScope timeScope("Combine EH sections"); for (EhInputSection *sec : ctx.ehInputSections) { - EhFrameSection &eh = *sec->getPartition().ehFrame; + EhFrameSection &eh = *sec->getPartition(ctx).ehFrame; sec->parent = &eh; eh.addralign = std::max(eh.addralign, sec->addralign); eh.sections.push_back(sec); @@ -4004,12 +4004,12 @@ void elf::combineEhSections(Ctx &ctx) { if (!ctx.mainPart->armExidx) return; - llvm::erase_if(ctx.inputSections, [](InputSectionBase *s) { + llvm::erase_if(ctx.inputSections, [&](InputSectionBase *s) { // Ignore dead sections and the partition end marker (.part.end), // whose partition number is out of bounds. if (!s->isLive() || s->partition == 255) return false; - Partition &part = s->getPartition(); + Partition &part = s->getPartition(ctx); return s->kind() == SectionBase::Regular && part.armExidx && part.armExidx->addSection(cast(s)); }); @@ -4447,7 +4447,7 @@ size_t PartitionElfHeaderSection::getSize() const { template void PartitionElfHeaderSection::writeTo(uint8_t *buf) { - writeEhdr(ctx, buf, getPartition()); + writeEhdr(ctx, buf, getPartition(ctx)); // Loadable partitions are always ET_DYN. 
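The VersionTableSection code above writes .gnu.version (SHT_GNU_versym): a bare array of 16-bit version indices parallel to .dynsym, which is where the (symbols + 1) * 2 size and the initial buf += 2 (the null symbol's slot) come from. A sketch of the layout (the helper function is illustrative):

    #include <cstdint>
    #include <cstring>
    #include <vector>

    // Entry i holds dynamic symbol i's version index: VER_NDX_LOCAL (0),
    // VER_NDX_GLOBAL (1), or an index into .gnu.version_d / .gnu.version_r.
    void writeVersym(uint8_t *buf, const std::vector<uint16_t> &versionIds) {
      std::memset(buf, 0, 2); // entry 0: the mandatory null dynsym entry
      std::memcpy(buf + 2, versionIds.data(),
                  versionIds.size() * sizeof(uint16_t));
    }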
auto *eHdr = reinterpret_cast(buf); @@ -4460,12 +4460,12 @@ PartitionProgramHeadersSection::PartitionProgramHeadersSection(Ctx &ctx) template size_t PartitionProgramHeadersSection::getSize() const { - return sizeof(typename ELFT::Phdr) * getPartition().phdrs.size(); + return sizeof(typename ELFT::Phdr) * getPartition(ctx).phdrs.size(); } template void PartitionProgramHeadersSection::writeTo(uint8_t *buf) { - writePhdrs(buf, getPartition()); + writePhdrs(buf, getPartition(ctx)); } PartitionIndexSection::PartitionIndexSection(Ctx &ctx) @@ -4747,7 +4747,7 @@ template void elf::createSyntheticSections(Ctx &ctx) { const unsigned threadCount = ctx.arg.threadCount; for (Partition &part : ctx.partitions) { auto add = [&](SyntheticSection &sec) { - sec.partition = part.getNumber(); + sec.partition = part.getNumber(ctx); ctx.inputSections.push_back(&sec); }; diff --git a/lld/ELF/SyntheticSections.h b/lld/ELF/SyntheticSections.h index 7ddbaff5573fdbd0dc38cc2c389c50962f77f5c1..d64c4aad8c552b39fd4e39507911ad577a9a4adb 100644 --- a/lld/ELF/SyntheticSections.h +++ b/lld/ELF/SyntheticSections.h @@ -1475,10 +1475,10 @@ struct Partition { std::unique_ptr verSym; Partition(Ctx &ctx) : ctx(ctx) {} - unsigned getNumber() const { return this - &ctx.partitions[0] + 1; } + unsigned getNumber(Ctx &ctx) const { return this - &ctx.partitions[0] + 1; } }; -inline Partition &SectionBase::getPartition() const { +inline Partition &SectionBase::getPartition(Ctx &ctx) const { assert(isLive()); return ctx.partitions[partition - 1]; } diff --git a/lld/ELF/Writer.cpp b/lld/ELF/Writer.cpp index 49b6ab4a48b55a9e731691b0df49c03f958bab87..2cd4478d00cf5dbccfd53f2f663b067e66acca41 100644 --- a/lld/ELF/Writer.cpp +++ b/lld/ELF/Writer.cpp @@ -2191,7 +2191,7 @@ SmallVector Writer::createPhdrs(Partition &part) { return ret.back(); }; - unsigned partNo = part.getNumber(); + unsigned partNo = part.getNumber(ctx); bool isMain = partNo == 1; // Add the first PT_LOAD segment for regular output sections. @@ -2381,7 +2381,7 @@ SmallVector Writer::createPhdrs(Partition &part) { template void Writer::addPhdrForSection(Partition &part, unsigned shType, unsigned pType, unsigned pFlags) { - unsigned partNo = part.getNumber(); + unsigned partNo = part.getNumber(ctx); auto i = llvm::find_if(ctx.outputSections, [=](OutputSection *cmd) { return cmd->partition == partNo && cmd->type == shType; }); diff --git a/lld/test/COFF/weak-lazy.s b/lld/test/COFF/weak-lazy.s new file mode 100644 index 0000000000000000000000000000000000000000..2812ba7af8b5f85180e937b87a1615124ee1d675 --- /dev/null +++ b/lld/test/COFF/weak-lazy.s @@ -0,0 +1,18 @@ +# REQUIRES: x86 + +# RUN: llvm-mc -filetype=obj -triple=i686-windows %s -o %t.obj +# RUN: llvm-lib -machine:x86 -out:%t-func.lib %t.obj + +# -export:func creates a weak alias to a lazy symbol. Make sure we can handle that when processing -export:func2=func. 
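The Partition::getNumber(Ctx &) / SectionBase::getPartition(Ctx &) pair above (part of this patch's wider move away from a global context) relies on partitions living contiguously in ctx.partitions, making the two functions simple inverses. Reduced to a standalone check:

    #include <cassert>
    #include <vector>

    struct Part { int data; };

    int main() {
      std::vector<Part> partitions(3);
      Part *p = &partitions[1];
      // Partition::getNumber: recover a 1-based index by pointer arithmetic.
      unsigned number = p - &partitions[0] + 1;
      assert(number == 2);
      // SectionBase::getPartition: map the stored 1-based number back.
      assert(&partitions[number - 1] == p);
      return 0;
    }

A section's partition byte of 0 still means dead and 1 the main partition, matching the partition - 1 indexing.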
+# RUN: lld-link -dll -noentry -machine:x86 -out:%t.dll %t-func.lib -export:func -export:func2=func + + .text + .def @feat.00; + .scl 3; + .type 0; + .endef + .globl @feat.00 +.set @feat.00, 1 + .globl _func@0 +_func@0: + retl diff --git a/lldb/cmake/modules/LLDBConfig.cmake b/lldb/cmake/modules/LLDBConfig.cmake index a60921990cf775a07b8976697bebab0ab4d213fd..93ccd9c479c2b8cd10f04250902f26a61fa2c65b 100644 --- a/lldb/cmake/modules/LLDBConfig.cmake +++ b/lldb/cmake/modules/LLDBConfig.cmake @@ -188,7 +188,6 @@ include_directories("${CMAKE_CURRENT_BINARY_DIR}/../clang/include") if (LLVM_COMPILER_IS_GCC_COMPATIBLE) # Disable GCC warnings - append("-Wno-deprecated-declarations" CMAKE_CXX_FLAGS) append("-Wno-unknown-pragmas" CMAKE_CXX_FLAGS) append("-Wno-strict-aliasing" CMAKE_CXX_FLAGS) @@ -198,7 +197,6 @@ endif() # Disable Clang warnings if (CMAKE_CXX_COMPILER_ID MATCHES "Clang") - append("-Wno-deprecated-register" CMAKE_CXX_FLAGS) append("-Wno-vla-extension" CMAKE_CXX_FLAGS) endif() diff --git a/lldb/docs/index.rst b/lldb/docs/index.rst index 2c7d7dbfad9a505ae2dbcb2fce9b219aec1f6357..b91077d660894a15a2efe2774e54d5c519cec9e0 100644 --- a/lldb/docs/index.rst +++ b/lldb/docs/index.rst @@ -81,9 +81,11 @@ are welcome: expected to work, with functionality improving rapidly. ARM and AArch64 support is more experimental, with more known issues than the others. -RISC-V support is in active development, refer to the -`tracking issue `_ -for the current status. +Support for the following architectures is in active development. For their +current state, follow the links to their respective issues: + +* `RISC-V `_ +* `LoongArch `_ Get Involved ------------ diff --git a/lldb/docs/use/aarch64-linux.md b/lldb/docs/use/aarch64-linux.md index 803f56d16f981ec453fef3e8fa8544f0a985589f..70432f57857a59c287ed14121710b6c180b910cc 100644 --- a/lldb/docs/use/aarch64-linux.md +++ b/lldb/docs/use/aarch64-linux.md @@ -160,7 +160,7 @@ Kernel does. ### Visibility of an Inactive ZA Register LLDB does not handle registers that can come and go at runtime (SVE changes -size but it does not dissappear). Therefore when `za` is not enabled, LLDB +size but it does not disappear). Therefore when `za` is not enabled, LLDB will return a block of 0s instead. This block will match the expected size of `za`: ``` @@ -183,9 +183,9 @@ If you want to know whether `za` is active or not, refer to bit 2 of the As for SVE, LLDB does not know how the debugee will use `za`, and therefore does not know how it would be best to display it. At any time any given -instrucion could interpret its contents as many kinds and sizes of data. +instruction could interpret its contents as many kinds and sizes of data. -So LLDB will default to showing `za` as one large vector of individual bytes. +So LLDB will default to showing `za` as one large vector of individual bytes. You can override this with a format option (see the SVE example above). ### Expression Evaluation @@ -228,4 +228,4 @@ bytes. ### Expression Evaluation `zt0`'s value and whether it is active or not will be saved prior to -expression evaluation and restored afterwards. \ No newline at end of file +expression evaluation and restored afterwards. 
diff --git a/lldb/include/lldb/Host/Editline.h b/lldb/include/lldb/Host/Editline.h index 9049b106f02a349605f77d6bd93c9304b964df2e..a02f90891599adbcd28c6dadb2560c733dfd035e 100644 --- a/lldb/include/lldb/Host/Editline.h +++ b/lldb/include/lldb/Host/Editline.h @@ -57,6 +57,23 @@ #include "llvm/ADT/FunctionExtras.h" +#if defined(__clang__) && defined(__has_warning) +#if __has_warning("-Wdeprecated-declarations") +#define LLDB_DEPRECATED_WARNING_DISABLE \ + _Pragma("clang diagnostic push") \ + _Pragma("clang diagnostic ignored \"-Wdeprecated-declarations\"") +#define LLDB_DEPRECATED_WARNING_RESTORE _Pragma("clang diagnostic pop") +#endif +#elif defined(__GNUC__) && __GNUC__ > 6 +#define LLDB_DEPRECATED_WARNING_DISABLE \ + _Pragma("GCC diagnostic push") \ + _Pragma("GCC diagnostic ignored \"-Wdeprecated-declarations\"") +#define LLDB_DEPRECATED_WARNING_RESTORE _Pragma("GCC diagnostic pop") +#else +#define LLDB_DEPRECATED_WARNING_DISABLE +#define LLDB_DEPRECATED_WARNING_RESTORE +#endif + namespace lldb_private { namespace line_editor { @@ -367,7 +384,9 @@ private: void SetGetCharacterFunction(EditlineGetCharCallbackType callbackFn); #if LLDB_EDITLINE_USE_WCHAR + LLDB_DEPRECATED_WARNING_DISABLE std::wstring_convert<std::codecvt_utf8<wchar_t>> m_utf8conv; + LLDB_DEPRECATED_WARNING_RESTORE #endif ::EditLine *m_editline = nullptr; EditlineHistorySP m_history_sp; diff --git a/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py b/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py index 1d5e6e0d75c7cb67bc16b5ec4bffeee12738a81d..63748a71f1122d83dea31a753ae2a4cdee6b7ebf 100644 --- a/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py +++ b/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py @@ -1267,7 +1267,7 @@ def run_vscode(dbg, args, options): def main(): parser = optparse.OptionParser( description=( - "A testing framework for the Visual Studio Code Debug " "Adaptor protocol" + "A testing framework for the Visual Studio Code Debug Adaptor protocol" ) ) diff --git a/lldb/source/Core/Progress.cpp b/lldb/source/Core/Progress.cpp index 27774ce7a5521b240e1c5040d7e4b8f4eba27ac2..c9a556472c06b66ae18a9d3b0022228fa8e4e930 100644 --- a/lldb/source/Core/Progress.cpp +++ b/lldb/source/Core/Progress.cpp @@ -151,10 +151,11 @@ void ProgressManager::Decrement(const Progress::ProgressData &progress_data) { std::lock_guard<std::mutex> lock(m_entries_mutex); llvm::StringRef key = progress_data.title; - if (!m_entries.contains(key)) + auto it = m_entries.find(key); + if (it == m_entries.end()) return; - Entry &entry = m_entries[key]; + Entry &entry = it->second; entry.refcount--; if (entry.refcount == 0) { diff --git a/lldb/source/Expression/DWARFExpression.cpp b/lldb/source/Expression/DWARFExpression.cpp index 97bcd4f7eec26f00d5a29837854d3d5d0fbe2c17..f92f25ed342a9cbc1ba7402ed683b79d5f9e39aa 100644 --- a/lldb/source/Expression/DWARFExpression.cpp +++ b/lldb/source/Expression/DWARFExpression.cpp @@ -860,10 +860,12 @@ llvm::Expected<Value> DWARFExpression::Evaluate( // TODO: Implement a real typed stack, and store the genericness of the value // there. auto to_generic = [&](auto v) { + // TODO: Avoid implicit trunc? + // See https://github.com/llvm/llvm-project/issues/112510. bool is_signed = std::is_signed<decltype(v)>::value; - return Scalar(llvm::APSInt( - llvm::APInt(8 * opcodes.GetAddressByteSize(), v, is_signed), - !is_signed)); + return Scalar(llvm::APSInt(llvm::APInt(8 * opcodes.GetAddressByteSize(), v, + is_signed, /*implicitTrunc=*/true), + !is_signed)); }; // The default kind is a memory location.
This is updated by any diff --git a/lldb/source/Host/common/Editline.cpp b/lldb/source/Host/common/Editline.cpp index 561ec228cdb23f9bab1ec789e32a6fa90917ac7b..60117cb5f0e615a61bba5c4fc92a64cdbf515dac 100644 --- a/lldb/source/Host/common/Editline.cpp +++ b/lldb/source/Host/common/Editline.cpp @@ -1574,7 +1574,9 @@ bool Editline::CompleteCharacter(char ch, EditLineGetCharType &out) { out = (unsigned char)ch; return true; #else + LLDB_DEPRECATED_WARNING_DISABLE std::codecvt_utf8<EditLineGetCharType> cvt; + LLDB_DEPRECATED_WARNING_RESTORE llvm::SmallString<4> input; for (;;) { const char *from_next; diff --git a/lldb/source/Host/common/Host.cpp b/lldb/source/Host/common/Host.cpp index 6857a29b0399028e82c2d0c9f630e7e8d7a3f21c..03ea2f242d3c78d7084698c0592e167e5a545b64 100644 --- a/lldb/source/Host/common/Host.cpp +++ b/lldb/source/Host/common/Host.cpp @@ -90,30 +90,12 @@ using namespace lldb; using namespace lldb_private; #if !defined(__APPLE__) -#if !defined(_WIN32) -#include <syslog.h> -void Host::SystemLog(Severity severity, llvm::StringRef message) { - static llvm::once_flag g_openlog_once; - llvm::call_once(g_openlog_once, - [] { openlog("lldb", LOG_PID | LOG_NDELAY, LOG_USER); }); - int level = LOG_DEBUG; - switch (severity) { - case lldb::eSeverityInfo: - level = LOG_INFO; - break; - case lldb::eSeverityWarning: - level = LOG_WARNING; - break; - case lldb::eSeverityError: - level = LOG_ERR; - break; - } - syslog(level, "%s", message.data()); -} -#else +// The system log is currently only meaningful on Darwin, where this means +// os_log. The meaning of a "system log" isn't as clear on other platforms, and +// therefore we don't provide a default implementation. Vendors are free to +// implement this function if they have a use for it. void Host::SystemLog(Severity severity, llvm::StringRef message) {} #endif -#endif static constexpr Log::Category g_categories[] = { {{"system"}, {"system log"}, SystemLog::System}}; diff --git a/lldb/source/Host/macosx/objcxx/CMakeLists.txt b/lldb/source/Host/macosx/objcxx/CMakeLists.txt index 273999f24380e5e1ab4e46ef9a7fea3cc111324f..1e693bed12ce15252de30f2cf63f92cedff796d6 100644 --- a/lldb/source/Host/macosx/objcxx/CMakeLists.txt +++ b/lldb/source/Host/macosx/objcxx/CMakeLists.txt @@ -16,4 +16,6 @@ add_lldb_library(lldbHostMacOSXObjCXX NO_PLUGIN_DEPENDENCIES TargetParser ) -target_compile_options(lldbHostMacOSXObjCXX PRIVATE -fno-objc-exceptions) +target_compile_options(lldbHostMacOSXObjCXX PRIVATE + -fno-objc-exceptions + -Wno-deprecated-declarations) diff --git a/lldb/source/Host/posix/MainLoopPosix.cpp b/lldb/source/Host/posix/MainLoopPosix.cpp index 816581e70294a7cf625da2a3eba31f39b20851ca..6f8eaa55cfdf090f49fce7b803f423b16e786f3d 100644 --- a/lldb/source/Host/posix/MainLoopPosix.cpp +++ b/lldb/source/Host/posix/MainLoopPosix.cpp @@ -365,10 +365,7 @@ Status MainLoopPosix::Run() { Status error; RunImpl impl(*this); - // run until termination or until we run out of things to listen to - // (m_read_fds will always contain m_trigger_pipe fd, so check for > 1) - while (!m_terminate_request && - (m_read_fds.size() > 1 || !m_signals.empty())) { + while (!m_terminate_request) { error = impl.Poll(); if (error.Fail()) return error; diff --git a/lldb/source/Host/windows/MainLoopWindows.cpp b/lldb/source/Host/windows/MainLoopWindows.cpp index 88d929535ab6c50a458891f7f9a5e50f51500a14..c9aa6d339d8f4856b01e429fc28467ff3d57dc25 100644 --- a/lldb/source/Host/windows/MainLoopWindows.cpp +++ b/lldb/source/Host/windows/MainLoopWindows.cpp @@ -116,9 +116,7 @@ Status MainLoopWindows::Run() {
Status error; - // run until termination or until we run out of things to listen to - while (!m_terminate_request && !m_read_fds.empty()) { - + while (!m_terminate_request) { llvm::Expected<size_t> signaled_event = Poll(); if (!signaled_event) return Status::FromError(signaled_event.takeError()); diff --git a/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/CMakeLists.txt b/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/CMakeLists.txt index 7308374c8bfba6abfb73cf718890056b31145e7a..77a560541fcb1fe17f4fa2f1bc1b12d8b85b0328 100644 --- a/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/CMakeLists.txt +++ b/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/CMakeLists.txt @@ -1,7 +1,16 @@ +lldb_tablegen(DynamicLoaderDarwinProperties.inc -gen-lldb-property-defs + SOURCE DynamicLoaderDarwinProperties.td + TARGET LLDBPluginDynamicLoaderDarwinPropertiesGen) + +lldb_tablegen(DynamicLoaderDarwinPropertiesEnum.inc -gen-lldb-property-enum-defs + SOURCE DynamicLoaderDarwinProperties.td + TARGET LLDBPluginDynamicLoaderDarwinPropertiesEnumGen) + add_lldb_library(lldbPluginDynamicLoaderMacOSXDYLD PLUGIN DynamicLoaderMacOSXDYLD.cpp DynamicLoaderMacOS.cpp DynamicLoaderDarwin.cpp + DynamicLoaderDarwinProperties.cpp LINK_LIBS lldbBreakpoint @@ -16,3 +25,7 @@ add_lldb_library(lldbPluginDynamicLoaderMacOSXDYLD PLUGIN Support TargetParser ) + +add_dependencies(lldbPluginDynamicLoaderMacOSXDYLD + LLDBPluginDynamicLoaderDarwinPropertiesGen + LLDBPluginDynamicLoaderDarwinPropertiesEnumGen) diff --git a/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderDarwin.cpp b/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderDarwin.cpp index 30242038a5f66f20ec1a6fbba6cefc0220548398..3659dfcd3c4ca3ec966e7489a9dd16a4186657d9 100644 --- a/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderDarwin.cpp +++ b/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderDarwin.cpp @@ -8,6 +8,7 @@ #include "DynamicLoaderDarwin.h" +#include "DynamicLoaderDarwinProperties.h" #include "lldb/Breakpoint/StoppointCallbackContext.h" #include "lldb/Core/Debugger.h" #include "lldb/Core/Module.h" @@ -31,6 +32,7 @@ #include "lldb/Utility/LLDBLog.h" #include "lldb/Utility/Log.h" #include "lldb/Utility/State.h" +#include "llvm/Support/ThreadPool.h" #include "Plugins/LanguageRuntime/ObjC/ObjCLanguageRuntime.h" #include "Plugins/TypeSystem/Clang/TypeSystemClang.h" @@ -77,6 +79,17 @@ void DynamicLoaderDarwin::DidLaunch() { SetNotificationBreakpoint(); } +void DynamicLoaderDarwin::CreateSettings(lldb_private::Debugger &debugger) { + if (!PluginManager::GetSettingForDynamicLoaderPlugin( + debugger, DynamicLoaderDarwinProperties::GetSettingName())) { + const bool is_global_setting = true; + PluginManager::CreateSettingForDynamicLoaderPlugin( + debugger, + DynamicLoaderDarwinProperties::GetGlobal().GetValueProperties(), + "Properties for the DynamicLoaderDarwin plug-in.", is_global_setting); + } +} + // Clear out the state of this class.
void DynamicLoaderDarwin::Clear(bool clear_process) { std::lock_guard<std::recursive_mutex> guard(m_mutex); @@ -88,7 +101,7 @@ void DynamicLoaderDarwin::Clear(bool clear_process) { } ModuleSP DynamicLoaderDarwin::FindTargetModuleForImageInfo( - ImageInfo &image_info, bool can_create, bool *did_create_ptr) { + const ImageInfo &image_info, bool can_create, bool *did_create_ptr) { if (did_create_ptr) *did_create_ptr = false; @@ -517,8 +530,8 @@ bool DynamicLoaderDarwin::JSONImageInformationIntoImageInfo( return true; } -void DynamicLoaderDarwin::UpdateSpecialBinariesFromNewImageInfos( - ImageInfo::collection &image_infos) { +void DynamicLoaderDarwin::UpdateSpecialBinariesFromPreloadedModules( + std::vector<std::pair<ImageInfo, ModuleSP>> &images) { uint32_t exe_idx = UINT32_MAX; uint32_t dyld_idx = UINT32_MAX; Target &target = m_process->GetTarget(); @@ -526,35 +539,34 @@ void DynamicLoaderDarwin::UpdateSpecialBinariesFromNewImageInfos( ConstString g_dyld_sim_filename("dyld_sim"); ArchSpec target_arch = target.GetArchitecture(); - const size_t image_infos_size = image_infos.size(); - for (size_t i = 0; i < image_infos_size; i++) { - if (image_infos[i].header.filetype == llvm::MachO::MH_DYLINKER) { + const size_t images_size = images.size(); + for (size_t i = 0; i < images_size; i++) { + const auto &image_info = images[i].first; + if (image_info.header.filetype == llvm::MachO::MH_DYLINKER) { // In a "simulator" process we will have two dyld modules -- // a "dyld" that we want to keep track of, and a "dyld_sim" which // we don't need to keep track of here. dyld_sim will have a non-macosx // OS. if (target_arch.GetTriple().getEnvironment() == llvm::Triple::Simulator && - image_infos[i].os_type != llvm::Triple::OSType::MacOSX) { + image_info.os_type != llvm::Triple::OSType::MacOSX) { continue; } dyld_idx = i; } - if (image_infos[i].header.filetype == llvm::MachO::MH_EXECUTE) { + if (image_info.header.filetype == llvm::MachO::MH_EXECUTE) { exe_idx = i; } } // Set the target executable if we haven't found one so far.
if (exe_idx != UINT32_MAX && !target.GetExecutableModule()) { - const bool can_create = true; - ModuleSP exe_module_sp(FindTargetModuleForImageInfo(image_infos[exe_idx], - can_create, nullptr)); + ModuleSP exe_module_sp = images[exe_idx].second; if (exe_module_sp) { LLDB_LOGF(log, "Found executable module: %s", exe_module_sp->GetFileSpec().GetPath().c_str()); target.GetImages().AppendIfNeeded(exe_module_sp); - UpdateImageLoadAddress(exe_module_sp.get(), image_infos[exe_idx]); + UpdateImageLoadAddress(exe_module_sp.get(), images[exe_idx].first); if (exe_module_sp.get() != target.GetExecutableModulePointer()) target.SetExecutableModule(exe_module_sp, eLoadDependentsNo); @@ -581,14 +593,12 @@ void DynamicLoaderDarwin::UpdateSpecialBinariesFromNewImageInfos( } if (dyld_idx != UINT32_MAX) { - const bool can_create = true; - ModuleSP dyld_sp = FindTargetModuleForImageInfo(image_infos[dyld_idx], - can_create, nullptr); + ModuleSP dyld_sp = images[dyld_idx].second; if (dyld_sp.get()) { LLDB_LOGF(log, "Found dyld module: %s", dyld_sp->GetFileSpec().GetPath().c_str()); target.GetImages().AppendIfNeeded(dyld_sp); - UpdateImageLoadAddress(dyld_sp.get(), image_infos[dyld_idx]); + UpdateImageLoadAddress(dyld_sp.get(), images[dyld_idx].first); SetDYLDModule(dyld_sp); } } @@ -642,26 +652,58 @@ ModuleSP DynamicLoaderDarwin::GetDYLDModule() { void DynamicLoaderDarwin::ClearDYLDModule() { m_dyld_module_wp.reset(); } +std::vector<std::pair<DynamicLoaderDarwin::ImageInfo, ModuleSP>> +DynamicLoaderDarwin::PreloadModulesFromImageInfos( + const ImageInfo::collection &image_infos) { + const auto size = image_infos.size(); + std::vector<std::pair<ImageInfo, ModuleSP>> images(size); + auto LoadImage = [&](size_t i, ImageInfo::collection::const_iterator it) { + const auto &image_info = *it; + images[i] = std::make_pair( + image_info, FindTargetModuleForImageInfo(image_info, true, nullptr)); + }; + auto it = image_infos.begin(); + bool is_parallel_load = + DynamicLoaderDarwinProperties::GetGlobal().GetEnableParallelImageLoad(); + if (is_parallel_load) { + llvm::ThreadPoolTaskGroup taskGroup(Debugger::GetThreadPool()); + for (size_t i = 0; i < size; ++i, ++it) { + taskGroup.async(LoadImage, i, it); + } + taskGroup.wait(); + } else { + for (size_t i = 0; i < size; ++i, ++it) { + LoadImage(i, it); + } + } + return images; +} + bool DynamicLoaderDarwin::AddModulesUsingImageInfos( ImageInfo::collection &image_infos) { std::lock_guard<std::recursive_mutex> guard(m_mutex); + auto images = PreloadModulesFromImageInfos(image_infos); + return AddModulesUsingPreloadedModules(images); +} + +bool DynamicLoaderDarwin::AddModulesUsingPreloadedModules( + std::vector<std::pair<ImageInfo, ModuleSP>> &images) { + std::lock_guard<std::recursive_mutex> guard(m_mutex); // Now add these images to the main list.
ModuleList loaded_module_list; Log *log = GetLog(LLDBLog::DynamicLoader); Target &target = m_process->GetTarget(); ModuleList &target_images = target.GetImages(); - for (uint32_t idx = 0; idx < image_infos.size(); ++idx) { + for (uint32_t idx = 0; idx < images.size(); ++idx) { + auto &image_info = images[idx].first; + const auto &image_module_sp = images[idx].second; if (log) { LLDB_LOGF(log, "Adding new image at address=0x%16.16" PRIx64 ".", - image_infos[idx].address); - image_infos[idx].PutToLog(log); + image_info.address); + image_info.PutToLog(log); } - - m_dyld_image_infos.push_back(image_infos[idx]); - - ModuleSP image_module_sp( - FindTargetModuleForImageInfo(image_infos[idx], true, nullptr)); + m_dyld_image_infos.push_back(image_info); if (image_module_sp) { ObjectFile *objfile = image_module_sp->GetObjectFile(); @@ -673,7 +715,7 @@ bool DynamicLoaderDarwin::AddModulesUsingImageInfos( sections->FindSectionByName(commpage_dbstr).get(); if (commpage_section) { ModuleSpec module_spec(objfile->GetFileSpec(), - image_infos[idx].GetArchitecture()); + image_info.GetArchitecture()); module_spec.GetObjectName() = commpage_dbstr; ModuleSP commpage_image_module_sp( target_images.FindFirstModule(module_spec)); @@ -686,17 +728,17 @@ bool DynamicLoaderDarwin::AddModulesUsingImageInfos( if (!commpage_image_module_sp || commpage_image_module_sp->GetObjectFile() == nullptr) { commpage_image_module_sp = m_process->ReadModuleFromMemory( - image_infos[idx].file_spec, image_infos[idx].address); + image_info.file_spec, image_info.address); // Always load a memory image right away in the target in case // we end up trying to read the symbol table from memory... The // __LINKEDIT will need to be mapped so we can figure out where // the symbol table bits are... bool changed = false; UpdateImageLoadAddress(commpage_image_module_sp.get(), - image_infos[idx]); + image_info); target.GetImages().Append(commpage_image_module_sp); if (changed) { - image_infos[idx].load_stop_id = m_process->GetStopID(); + image_info.load_stop_id = m_process->GetStopID(); loaded_module_list.AppendIfNeeded(commpage_image_module_sp); } } @@ -709,14 +751,14 @@ bool DynamicLoaderDarwin::AddModulesUsingImageInfos( // address. We need to check this so we don't mention that all loaded // shared libraries are newly loaded each time we hit our dyld breakpoint // since dyld will list all shared libraries each time. - if (UpdateImageLoadAddress(image_module_sp.get(), image_infos[idx])) { + if (UpdateImageLoadAddress(image_module_sp.get(), image_info)) { target_images.AppendIfNeeded(image_module_sp); loaded_module_list.AppendIfNeeded(image_module_sp); } // To support macCatalyst and legacy iOS simulator, // update the module's platform with the DYLD info.
- ArchSpec dyld_spec = image_infos[idx].GetArchitecture(); + ArchSpec dyld_spec = image_info.GetArchitecture(); auto &dyld_triple = dyld_spec.GetTriple(); if ((dyld_triple.getEnvironment() == llvm::Triple::MacABI && dyld_triple.getOS() == llvm::Triple::IOS) || diff --git a/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderDarwin.h b/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderDarwin.h index 45c693163f8105b0e9986514682dcb908551af01..bc5464d76b9503a9855f6def0d7d51fb406f983a 100644 --- a/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderDarwin.h +++ b/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderDarwin.h @@ -58,6 +58,8 @@ public: std::optional<lldb::addr_t> GetStartAddress() override; + static void CreateSettings(lldb_private::Debugger &debugger); + protected: void PrivateInitialize(lldb_private::Process *process); @@ -174,7 +176,7 @@ protected: bool UnloadModuleSections(lldb_private::Module *module, ImageInfo &info); - lldb::ModuleSP FindTargetModuleForImageInfo(ImageInfo &image_info, + lldb::ModuleSP FindTargetModuleForImageInfo(const ImageInfo &image_info, bool can_create, bool *did_create_ptr); @@ -201,11 +203,18 @@ protected: lldb_private::StructuredData::ObjectSP image_details, ImageInfo::collection &image_infos); - // If image_infos contains / may contain dyld or executable image, call this - // method - // to keep our internal record keeping of the special binaries up-to-date. - void - UpdateSpecialBinariesFromNewImageInfos(ImageInfo::collection &image_infos); + // Finds/loads modules for a given `image_infos` and returns pairs + // (ImageInfo, ModuleSP). + // Prefer using this method rather than calling `FindTargetModuleForImageInfo` + // directly as this method may load the modules in parallel. + std::vector<std::pair<ImageInfo, lldb::ModuleSP>> + PreloadModulesFromImageInfos(const ImageInfo::collection &image_infos); + + // If `images` contains / may contain dyld or executable image, call this + // method to keep our internal record keeping of the special binaries + // up-to-date. + void UpdateSpecialBinariesFromPreloadedModules( + std::vector<std::pair<ImageInfo, lldb::ModuleSP>> &images); // if image_info is a dyld binary, call this method bool UpdateDYLDImageInfoFromNewImageInfo(ImageInfo &image_info); @@ -215,6 +224,8 @@ protected: void AddExecutableModuleIfInImageInfos(ImageInfo::collection &image_infos); bool AddModulesUsingImageInfos(ImageInfo::collection &image_infos); + bool AddModulesUsingPreloadedModules( + std::vector<std::pair<ImageInfo, lldb::ModuleSP>> &images); // Whether we should use the new dyld SPI to get shared library information, // or read diff --git a/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderDarwinProperties.cpp b/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderDarwinProperties.cpp new file mode 100644 index 0000000000000000000000000000000000000000..f4d8a071e6d5de0562d0130b8a413667f102cced --- /dev/null +++ b/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderDarwinProperties.cpp @@ -0,0 +1,53 @@ +//===-- DynamicLoaderDarwinProperties.cpp ---------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "DynamicLoaderDarwinProperties.h" + +using namespace lldb_private; + +#define LLDB_PROPERTIES_dynamicloaderdarwin_experimental +#include "DynamicLoaderDarwinProperties.inc" + +enum { +#define LLDB_PROPERTIES_dynamicloaderdarwin_experimental +#include "DynamicLoaderDarwinPropertiesEnum.inc" +}; + +llvm::StringRef DynamicLoaderDarwinProperties::GetSettingName() { + static constexpr llvm::StringLiteral g_setting_name("darwin"); + return g_setting_name; +} + +DynamicLoaderDarwinProperties::ExperimentalProperties::ExperimentalProperties() + : Properties(std::make_shared<OptionValueProperties>( + GetExperimentalSettingsName())) { + m_collection_sp->Initialize(g_dynamicloaderdarwin_experimental_properties); +} + +DynamicLoaderDarwinProperties::DynamicLoaderDarwinProperties() + : Properties(std::make_shared<OptionValueProperties>(GetSettingName())), + m_experimental_properties(std::make_unique<ExperimentalProperties>()) { + m_collection_sp->AppendProperty( + Properties::GetExperimentalSettingsName(), + "Experimental settings - setting these won't produce errors if the " + "setting is not present.", + true, m_experimental_properties->GetValueProperties()); +} + +bool DynamicLoaderDarwinProperties::GetEnableParallelImageLoad() const { + return m_experimental_properties->GetPropertyAtIndexAs<bool>( + ePropertyEnableParallelImageLoad, + g_dynamicloaderdarwin_experimental_properties + [ePropertyEnableParallelImageLoad] + .default_uint_value != 0); +} + +DynamicLoaderDarwinProperties &DynamicLoaderDarwinProperties::GetGlobal() { + static DynamicLoaderDarwinProperties g_settings; + return g_settings; +} diff --git a/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderDarwinProperties.h b/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderDarwinProperties.h new file mode 100644 index 0000000000000000000000000000000000000000..4c5e800c4f3e4c81c13f325ca20d2c051f2cd1bd --- /dev/null +++ b/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderDarwinProperties.h @@ -0,0 +1,34 @@ +//===-- DynamicLoaderDarwinProperties.h -------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLDB_SOURCE_PLUGINS_DYNAMICLOADER_MACOSX_DYLD_DYNAMICLOADERDARWINPROPERTIES_H +#define LLDB_SOURCE_PLUGINS_DYNAMICLOADER_MACOSX_DYLD_DYNAMICLOADERDARWINPROPERTIES_H + +#include "lldb/Core/UserSettingsController.h" + +namespace lldb_private { + +class DynamicLoaderDarwinProperties : public Properties { +public: + class ExperimentalProperties : public Properties { + public: + ExperimentalProperties(); + }; + static llvm::StringRef GetSettingName(); + static DynamicLoaderDarwinProperties &GetGlobal(); + DynamicLoaderDarwinProperties(); + ~DynamicLoaderDarwinProperties() override = default; + bool GetEnableParallelImageLoad() const; + +private: + std::unique_ptr<ExperimentalProperties> m_experimental_properties; +}; + +} // namespace lldb_private + +#endif // LLDB_SOURCE_PLUGINS_DYNAMICLOADER_MACOSX_DYLD_DYNAMICLOADERDARWINPROPERTIES_H diff --git a/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderDarwinProperties.td b/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderDarwinProperties.td new file mode 100644 index 0000000000000000000000000000000000000000..c54580ce347296fce6d5c037f2503e6447c88ffa --- /dev/null +++ b/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderDarwinProperties.td @@ -0,0 +1,8 @@ +include "../../../../include/lldb/Core/PropertiesBase.td" + +let Definition = "dynamicloaderdarwin_experimental" in { + def EnableParallelImageLoad: Property<"enable-parallel-image-load", "Boolean">, + Global, + DefaultTrue, + Desc<"Load images in parallel.">; +} diff --git a/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderMacOS.cpp b/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderMacOS.cpp index a038b65d4728349cf8bedf6a7bd0f3145a5cffe9..82555d1e028b4cd39e77b65245fff2020c3cb683 100644 --- a/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderMacOS.cpp +++ b/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderMacOS.cpp @@ -215,8 +215,9 @@ void DynamicLoaderMacOS::DoInitialImageFetch() { LLDB_LOGF(log, "Initial module fetch: Adding %" PRId64 " modules.\n", (uint64_t)image_infos.size()); - UpdateSpecialBinariesFromNewImageInfos(image_infos); - AddModulesUsingImageInfos(image_infos); + auto images = PreloadModulesFromImageInfos(image_infos); + UpdateSpecialBinariesFromPreloadedModules(images); + AddModulesUsingPreloadedModules(images); } } @@ -425,8 +426,9 @@ void DynamicLoaderMacOS::AddBinaries( ->GetAsArray() ->GetSize() == load_addresses.size()) { if (JSONImageInformationIntoImageInfo(binaries_info_sp, image_infos)) { - UpdateSpecialBinariesFromNewImageInfos(image_infos); - AddModulesUsingImageInfos(image_infos); + auto images = PreloadModulesFromImageInfos(image_infos); + UpdateSpecialBinariesFromPreloadedModules(images); + AddModulesUsingPreloadedModules(images); } m_dyld_image_infos_stop_id = m_process->GetStopID(); } diff --git a/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderMacOSXDYLD.cpp b/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderMacOSXDYLD.cpp index debd0f6ee83f4049c1d6a2acf8dd7135fe912fbc..8fc77cbe11701297010c76eea5a39b98f34705e9 100644 --- a/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderMacOSXDYLD.cpp +++ b/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderMacOSXDYLD.cpp @@ -572,8 +572,9 @@ bool DynamicLoaderMacOSXDYLD::AddModulesUsingImageInfosAddress( ->GetSize() == image_infos_count) { bool return_value = false; if
(JSONImageInformationIntoImageInfo(image_infos_json_sp, image_infos)) { - UpdateSpecialBinariesFromNewImageInfos(image_infos); - return_value = AddModulesUsingImageInfos(image_infos); + auto images = PreloadModulesFromImageInfos(image_infos); + UpdateSpecialBinariesFromPreloadedModules(images); + return_value = AddModulesUsingPreloadedModules(images); } m_dyld_image_infos_stop_id = m_process->GetStopID(); return return_value; @@ -1147,7 +1148,8 @@ bool DynamicLoaderMacOSXDYLD::IsFullyInitialized() { void DynamicLoaderMacOSXDYLD::Initialize() { PluginManager::RegisterPlugin(GetPluginNameStatic(), - GetPluginDescriptionStatic(), CreateInstance); + GetPluginDescriptionStatic(), CreateInstance, + DebuggerInitialize); DynamicLoaderMacOS::Initialize(); } @@ -1156,6 +1158,11 @@ void DynamicLoaderMacOSXDYLD::Terminate() { PluginManager::UnregisterPlugin(CreateInstance); } +void DynamicLoaderMacOSXDYLD::DebuggerInitialize( + lldb_private::Debugger &debugger) { + CreateSettings(debugger); +} + llvm::StringRef DynamicLoaderMacOSXDYLD::GetPluginDescriptionStatic() { return "Dynamic loader plug-in that watches for shared library loads/unloads " "in MacOSX user processes."; diff --git a/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderMacOSXDYLD.h b/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderMacOSXDYLD.h index ae7451722a8d754ae985626c62d14e54b4e2a194..924e2fc107743c98e12d9d12945736b577e2f39c 100644 --- a/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderMacOSXDYLD.h +++ b/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderMacOSXDYLD.h @@ -50,6 +50,8 @@ public: static lldb_private::DynamicLoader * CreateInstance(lldb_private::Process *process, bool force); + static void DebuggerInitialize(lldb_private::Debugger &debugger); + /// Called after attaching a process. /// /// Allow DynamicLoader plug-ins to execute some code after diff --git a/lldb/source/Plugins/ObjectFile/Minidump/MinidumpFileBuilder.cpp b/lldb/source/Plugins/ObjectFile/Minidump/MinidumpFileBuilder.cpp index f6c16b6e3d96aec9c73adfaa12827e2fef386815..bcac5edbc1a79343742c53b6cf9c3c4f5ce217b7 100644 --- a/lldb/source/Plugins/ObjectFile/Minidump/MinidumpFileBuilder.cpp +++ b/lldb/source/Plugins/ObjectFile/Minidump/MinidumpFileBuilder.cpp @@ -75,8 +75,7 @@ Status MinidumpFileBuilder::AddHeaderAndCalculateDirectories() { StopInfoSP stop_info_sp = thread_sp->GetStopInfo(); if (stop_info_sp) { const StopReason &stop_reason = stop_info_sp->GetStopReason(); - if (stop_reason == StopReason::eStopReasonException || - stop_reason == StopReason::eStopReasonSignal) + if (stop_reason != lldb::eStopReasonInvalid) m_expected_directories++; } } @@ -685,50 +684,45 @@ Status MinidumpFileBuilder::AddExceptions() { Status error; for (const ThreadSP &thread_sp : thread_list) { StopInfoSP stop_info_sp = thread_sp->GetStopInfo(); - bool add_exception = false; - if (stop_info_sp) { - switch (stop_info_sp->GetStopReason()) { - case eStopReasonSignal: - case eStopReasonException: - add_exception = true; - break; - default: - break; - } - } - if (add_exception) { - constexpr size_t minidump_exception_size = - sizeof(llvm::minidump::ExceptionStream); - error = AddDirectory(StreamType::Exception, minidump_exception_size); - if (error.Fail()) - return error; + // If we don't have a stop info, or if it's invalid, skip. 
+ if (!stop_info_sp || + stop_info_sp->GetStopReason() == lldb::eStopReasonInvalid) + continue; - StopInfoSP stop_info_sp = thread_sp->GetStopInfo(); - RegisterContextSP reg_ctx_sp(thread_sp->GetRegisterContext()); - Exception exp_record = {}; - exp_record.ExceptionCode = - static_cast<llvm::support::ulittle32_t>(stop_info_sp->GetValue()); - exp_record.ExceptionFlags = static_cast<llvm::support::ulittle32_t>(0); - exp_record.ExceptionRecord = static_cast<llvm::support::ulittle64_t>(0); - exp_record.ExceptionAddress = reg_ctx_sp->GetPC(); - exp_record.NumberParameters = static_cast<llvm::support::ulittle32_t>(0); - exp_record.UnusedAlignment = static_cast<llvm::support::ulittle32_t>(0); - // exp_record.ExceptionInformation; - - ExceptionStream exp_stream; - exp_stream.ThreadId = - static_cast<llvm::support::ulittle32_t>(thread_sp->GetID()); - exp_stream.UnusedAlignment = static_cast<llvm::support::ulittle32_t>(0); - exp_stream.ExceptionRecord = exp_record; - auto Iter = m_tid_to_reg_ctx.find(thread_sp->GetID()); - if (Iter != m_tid_to_reg_ctx.end()) { - exp_stream.ThreadContext = Iter->second; - } else { - exp_stream.ThreadContext.DataSize = 0; - exp_stream.ThreadContext.RVA = 0; - } - m_data.AppendData(&exp_stream, minidump_exception_size); + constexpr size_t minidump_exception_size = + sizeof(llvm::minidump::ExceptionStream); + error = AddDirectory(StreamType::Exception, minidump_exception_size); + if (error.Fail()) + return error; + + RegisterContextSP reg_ctx_sp(thread_sp->GetRegisterContext()); + Exception exp_record = {}; + exp_record.ExceptionCode = + static_cast<llvm::support::ulittle32_t>(stop_info_sp->GetValue()); + exp_record.ExceptionFlags = + static_cast<llvm::support::ulittle32_t>(Exception::LLDB_FLAG); + exp_record.ExceptionRecord = static_cast<llvm::support::ulittle64_t>(0); + exp_record.ExceptionAddress = reg_ctx_sp->GetPC(); + exp_record.NumberParameters = static_cast<llvm::support::ulittle32_t>(1); + std::string description = stop_info_sp->GetDescription(); + // We have 120 bytes to work with, and the description is unlikely to + // overflow it, but clamp the copy to be safe. + memcpy(&exp_record.ExceptionInformation, description.c_str(), + std::min(description.size(), Exception::MaxParameterBytes)); + exp_record.UnusedAlignment = static_cast<llvm::support::ulittle32_t>(0); + ExceptionStream exp_stream; + exp_stream.ThreadId = + static_cast<llvm::support::ulittle32_t>(thread_sp->GetID()); + exp_stream.UnusedAlignment = static_cast<llvm::support::ulittle32_t>(0); + exp_stream.ExceptionRecord = exp_record; + auto Iter = m_tid_to_reg_ctx.find(thread_sp->GetID()); + if (Iter != m_tid_to_reg_ctx.end()) { + exp_stream.ThreadContext = Iter->second; + } else { + exp_stream.ThreadContext.DataSize = 0; + exp_stream.ThreadContext.RVA = 0; } + m_data.AppendData(&exp_stream, minidump_exception_size); } return error; diff --git a/lldb/source/Plugins/ObjectFile/Minidump/MinidumpFileBuilder.h b/lldb/source/Plugins/ObjectFile/Minidump/MinidumpFileBuilder.h index a4240f871c8a2f5efdb22d9a8a122958a6472f4b..58b284608bd535285bd83eb1d4fffd9dfb3b5619 100644 --- a/lldb/source/Plugins/ObjectFile/Minidump/MinidumpFileBuilder.h +++ b/lldb/source/Plugins/ObjectFile/Minidump/MinidumpFileBuilder.h @@ -175,5 +175,4 @@ private: lldb::FileUP m_core_file; lldb_private::SaveCoreOptions m_save_core_options; }; - #endif // LLDB_SOURCE_PLUGINS_OBJECTFILE_MINIDUMP_MINIDUMPFILEBUILDER_H diff --git a/lldb/source/Plugins/Process/FreeBSD/NativeRegisterContextFreeBSD_arm64.cpp b/lldb/source/Plugins/Process/FreeBSD/NativeRegisterContextFreeBSD_arm64.cpp index 1a6defbff3543c6cec4da2863a208c0167239337..7adc00622ec2d06c3077ca93f3b08f007caa118c 100644 --- a/lldb/source/Plugins/Process/FreeBSD/NativeRegisterContextFreeBSD_arm64.cpp +++ b/lldb/source/Plugins/Process/FreeBSD/NativeRegisterContextFreeBSD_arm64.cpp @@ -119,17 +119,15 @@ NativeRegisterContextFreeBSD_arm64::ReadRegister(const RegisterInfo *reg_info, RegisterValue &reg_value) { Status error; - if (!reg_info) { - error = Status::FromErrorString("reg_info NULL"); - return error; - } + if (!reg_info) + return Status::FromErrorString("reg_info NULL"); const uint32_t reg = reg_info->kinds[lldb::eRegisterKindLLDB]; if (reg == LLDB_INVALID_REGNUM) - return Status("no lldb regnum for %s", reg_info && reg_info->name - ? reg_info->name - : "<unknown register>"); + return Status::FromErrorStringWithFormat( + "no lldb regnum for %s", + reg_info && reg_info->name ? reg_info->name : "<unknown register>"); uint32_t set = GetRegisterInfo().GetRegisterSetFromRegisterIndex(reg); error = ReadRegisterSet(set); @@ -147,14 +145,14 @@ Status NativeRegisterContextFreeBSD_arm64::WriteRegister( Status error; if (!reg_info) - return Status("reg_info NULL"); + return Status::FromErrorString("reg_info NULL"); const uint32_t reg = reg_info->kinds[lldb::eRegisterKindLLDB]; if (reg == LLDB_INVALID_REGNUM) - return Status("no lldb regnum for %s", reg_info && reg_info->name - ? reg_info->name - : "<unknown register>"); + return Status::FromErrorStringWithFormat( + "no lldb regnum for %s", + reg_info && reg_info->name ?
reg_info->name : "<unknown register>"); uint32_t set = GetRegisterInfo().GetRegisterSetFromRegisterIndex(reg); error = ReadRegisterSet(set); diff --git a/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp b/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp index 3e09c316d74f444433987436812f9c7ee06d4546..538c8680140091dbc25ef5567fea6a9280bfc5aa 100644 --- a/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp +++ b/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp @@ -5323,9 +5323,8 @@ std::string ProcessGDBRemote::HarmonizeThreadIdsForProfileData( uint32_t prev_used_usec = 0; std::map<uint64_t, uint32_t>::iterator iterator = m_thread_id_to_used_usec_map.find(thread_id); - if (iterator != m_thread_id_to_used_usec_map.end()) { - prev_used_usec = m_thread_id_to_used_usec_map[thread_id]; - } + if (iterator != m_thread_id_to_used_usec_map.end()) + prev_used_usec = iterator->second; uint32_t real_used_usec = curr_used_usec - prev_used_usec; // A good first time record is one that runs for at least 0.25 sec diff --git a/lldb/source/Plugins/Process/minidump/ProcessMinidump.cpp b/lldb/source/Plugins/Process/minidump/ProcessMinidump.cpp index 5ea3db23f114c4ef0b831b439d9182f0184af7e4..5b0df72130c16193a19342fd17d9dedc253e8667 100644 --- a/lldb/source/Plugins/Process/minidump/ProcessMinidump.cpp +++ b/lldb/source/Plugins/Process/minidump/ProcessMinidump.cpp @@ -276,8 +276,16 @@ void ProcessMinidump::RefreshStateAfterStop() { // No stop. return; } - - stop_info = StopInfo::CreateStopReasonWithSignal(*stop_thread, signo); + const char *description = nullptr; + if (exception_stream.ExceptionRecord.ExceptionFlags == + llvm::minidump::Exception::LLDB_FLAG) + description = reinterpret_cast<const char *>( + exception_stream.ExceptionRecord.ExceptionInformation); + + llvm::StringRef description_str(description, + Exception::MaxParameterBytes); + stop_info = StopInfo::CreateStopReasonWithSignal( + *stop_thread, signo, description_str.str().c_str()); } else if (arch.GetTriple().getVendor() == llvm::Triple::Apple) { stop_info = StopInfoMachException::CreateStopReasonWithMachException( *stop_thread, exception_stream.ExceptionRecord.ExceptionCode, 2, diff --git a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp index fe0c53a7e9a3ea5b817c91dce6b361b178e3d979..50115a638b9589fd6f263747061d08c5f9d951bf 100644 --- a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp +++ b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp @@ -5065,6 +5065,7 @@ lldb::Encoding TypeSystemClang::GetEncoding(lldb::opaque_compiler_type_t type, case clang::BuiltinType::SveUint64x2: case clang::BuiltinType::SveUint64x3: case clang::BuiltinType::SveUint64x4: + case clang::BuiltinType::SveMFloat8: case clang::BuiltinType::SveFloat16: case clang::BuiltinType::SveBFloat16: case clang::BuiltinType::SveBFloat16x2: diff --git a/lldb/source/Utility/DiagnosticsRendering.cpp b/lldb/source/Utility/DiagnosticsRendering.cpp index 96caf934cc23155a50f7af2ef4b94383934ab485..d28a9ab8958ba6fb66b93c8bed6dae1fa919af28 100644 --- a/lldb/source/Utility/DiagnosticsRendering.cpp +++ b/lldb/source/Utility/DiagnosticsRendering.cpp @@ -77,11 +77,7 @@ void RenderDiagnosticDetails(Stream &stream, spacer = ""; } - // Print a line with caret indicator(s) below the lldb prompt + command. - const size_t padding = *offset_in_command; - stream << std::string(padding, ' '); - - size_t offset = 1; + // Partition the diagnostics.
std::vector<DiagnosticDetail> remaining_details, other_details, hidden_details; for (const DiagnosticDetail &detail : details) { @@ -98,14 +94,39 @@ void RenderDiagnosticDetails(Stream &stream, continue; } - auto &loc = *detail.source_location; remaining_details.push_back(detail); - if (offset > loc.column) - continue; - stream << std::string(loc.column - offset, ' ') << cursor; - for (unsigned i = 0; i + 1 < loc.length; ++i) - stream << underline; - offset = loc.column + 1; + } + + // Sort the diagnostics. + auto sort = [](auto &ds) { + llvm::sort(ds.begin(), ds.end(), [](auto &d1, auto &d2) { + auto l1 = d1.source_location.value_or(DiagnosticDetail::SourceLocation{}); + auto l2 = d2.source_location.value_or(DiagnosticDetail::SourceLocation{}); + return std::pair(l1.line, l1.column) < std::pair(l2.line, l2.column); + }); + }; + sort(remaining_details); + sort(other_details); + sort(hidden_details); + + // Print a line with caret indicator(s) below the lldb prompt + command. + const size_t padding = *offset_in_command; + stream << std::string(padding, ' '); + { + size_t x_pos = 1; + for (const DiagnosticDetail &detail : remaining_details) { + auto &loc = *detail.source_location; + + if (x_pos > loc.column) + continue; + + stream << std::string(loc.column - x_pos, ' ') << cursor; + x_pos = loc.column + 1; + for (unsigned i = 0; i + 1 < loc.length; ++i) { + stream << underline; + ++x_pos; + } + } } stream << '\n'; @@ -117,18 +138,19 @@ void RenderDiagnosticDetails(Stream &stream, // Get the information to print this detail and remove it from the stack. // Print all the lines for all the other messages first. stream << std::string(padding, ' '); - size_t offset = 1; + size_t x_pos = 1; for (auto &remaining_detail : llvm::ArrayRef(remaining_details).drop_back(1)) { uint16_t column = remaining_detail.source_location->column; - stream << std::string(column - offset, ' ') << vbar; - offset = column + 1; + if (x_pos <= column) + stream << std::string(column - x_pos, ' ') << vbar; + x_pos = column + 1; } // Print the line connecting the ^ with the error message. uint16_t column = detail->source_location->column; - if (offset <= column) - stream << std::string(column - offset, ' ') << joint << hbar << spacer; + if (x_pos <= column) + stream << std::string(column - x_pos, ' ') << joint << hbar << spacer; // Print a colorized string based on the message's severity type.
PrintSeverity(stream, detail->severity); diff --git a/lldb/test/API/commands/expression/import-std-module/array/TestArrayFromStdModule.py b/lldb/test/API/commands/expression/import-std-module/array/TestArrayFromStdModule.py index 13ab6b0c9ac1fb99948face8d0182e805738880f..bafc762829621724cf6cb04ca4ef56b27f0d0d52 100644 --- a/lldb/test/API/commands/expression/import-std-module/array/TestArrayFromStdModule.py +++ b/lldb/test/API/commands/expression/import-std-module/array/TestArrayFromStdModule.py @@ -10,6 +10,7 @@ from lldbsuite.test import lldbutil class TestCase(TestBase): @add_test_categories(["libc++"]) @skipIf(compiler=no_match("clang")) + @skipIfLinux # https://discourse.llvm.org/t/lldb-test-failures-on-linux/80095 def test(self): self.build() diff --git a/lldb/test/API/commands/expression/import-std-module/vector-dbg-info-content/TestDbgInfoContentVectorFromStdModule.py b/lldb/test/API/commands/expression/import-std-module/vector-dbg-info-content/TestDbgInfoContentVectorFromStdModule.py index 1c32222e64f14cce32c23f0049c4e0b94bb8aff5..71eaeef20e792d470f71f0afb26736f2cda10597 100644 --- a/lldb/test/API/commands/expression/import-std-module/vector-dbg-info-content/TestDbgInfoContentVectorFromStdModule.py +++ b/lldb/test/API/commands/expression/import-std-module/vector-dbg-info-content/TestDbgInfoContentVectorFromStdModule.py @@ -14,6 +14,7 @@ class TestDbgInfoContentVector(TestBase): @skipIf(compiler="clang", compiler_version=["<", "12.0"]) @skipIf(macos_version=["<", "14.0"]) @skipIfDarwin # https://github.com/llvm/llvm-project/issues/106475 + @skipIfLinux # https://discourse.llvm.org/t/lldb-test-failures-on-linux/80095 def test(self): self.build() diff --git a/lldb/test/API/commands/expression/import-std-module/vector-of-vectors/TestVectorOfVectorsFromStdModule.py b/lldb/test/API/commands/expression/import-std-module/vector-of-vectors/TestVectorOfVectorsFromStdModule.py index a1f33271f39d2fe2d2a99f78f5ed16235d718e6e..e9415fd53651f7b9c055522867e41951f79a9888 100644 --- a/lldb/test/API/commands/expression/import-std-module/vector-of-vectors/TestVectorOfVectorsFromStdModule.py +++ b/lldb/test/API/commands/expression/import-std-module/vector-of-vectors/TestVectorOfVectorsFromStdModule.py @@ -10,6 +10,7 @@ from lldbsuite.test import lldbutil class TestVectorOfVectors(TestBase): @add_test_categories(["libc++"]) @skipIf(compiler=no_match("clang")) + @skipIfLinux # https://discourse.llvm.org/t/lldb-test-failures-on-linux/80095 def test(self): self.build() diff --git a/lldb/test/API/commands/expression/ir-interpreter-phi-nodes/Makefile b/lldb/test/API/commands/expression/ir-interpreter-phi-nodes/Makefile index a1f689e07c77ff906882d7da4bf0f66ef98a6cfc..d420a34c03e7857fdceb05ebde2f825d599f9309 100644 --- a/lldb/test/API/commands/expression/ir-interpreter-phi-nodes/Makefile +++ b/lldb/test/API/commands/expression/ir-interpreter-phi-nodes/Makefile @@ -1,4 +1,4 @@ - -CXX_SOURCES := main.cpp - -include Makefile.rules + +CXX_SOURCES := main.cpp + +include Makefile.rules diff --git a/lldb/test/API/commands/expression/top-level/Makefile b/lldb/test/API/commands/expression/top-level/Makefile index e5e9e78d4ead2aeea4f6bf4afdf7463df31b0bb9..51b27ddbb3c2f94d42b8ee4729695fc3ece5ae2f 100644 --- a/lldb/test/API/commands/expression/top-level/Makefile +++ b/lldb/test/API/commands/expression/top-level/Makefile @@ -5,6 +5,6 @@ all: dummy include Makefile.rules dummy: dummy.cpp - $(MAKE) -f $(MAKEFILE_RULES) \ + "$(MAKE)" -f $(MAKEFILE_RULES) \ CXX_SOURCES=dummy.cpp EXE=dummy diff --git 
a/lldb/test/API/commands/expression/weak_symbols/Makefile b/lldb/test/API/commands/expression/weak_symbols/Makefile index 6fd8133312ade674a4b993f8b019ee5ea4cc9773..1636e9b3032632ce18b85bdefc79a6864f7dea9e 100644 --- a/lldb/test/API/commands/expression/weak_symbols/Makefile +++ b/lldb/test/API/commands/expression/weak_symbols/Makefile @@ -9,12 +9,12 @@ a.out: libdylib.dylib include Makefile.rules libdylib.dylib: dylib.c - $(MAKE) -C $(BUILDDIR) -f $(MAKEFILE_RULES) \ + "$(MAKE)" -C $(BUILDDIR) -f $(MAKEFILE_RULES) \ C_SOURCES= DYLIB_C_SOURCES=dylib.c DYLIB_NAME=dylib \ CFLAGS_EXTRAS=-DHAS_THEM LD_EXTRAS=-dynamiclib hidden/libdylib.dylib: mkdir hidden - $(MAKE) -C $(BUILDDIR)/hidden -f $(MAKEFILE_RULES) \ + "$(MAKE)" -C $(BUILDDIR)/hidden -f $(MAKEFILE_RULES) \ C_SOURCES= DYLIB_C_SOURCES=dylib.c DYLIB_NAME=dylib \ LD_EXTRAS=-dynamiclib diff --git a/lldb/test/API/commands/target/create-deps/Makefile b/lldb/test/API/commands/target/create-deps/Makefile index 3e5b1049b5a19f2f3119ad5813748778e94c9542..866d550ee7d0aeb504d65d1abca248a6354d5126 100644 --- a/lldb/test/API/commands/target/create-deps/Makefile +++ b/lldb/test/API/commands/target/create-deps/Makefile @@ -6,5 +6,5 @@ a.out: libload_a include Makefile.rules libload_a: - $(MAKE) -f $(MAKEFILE_RULES) \ + "$(MAKE)" -f $(MAKEFILE_RULES) \ DYLIB_ONLY=YES DYLIB_NAME=load_a DYLIB_CXX_SOURCES=a.cpp diff --git a/lldb/test/API/functionalities/breakpoint/break_in_loaded_dylib/Makefile b/lldb/test/API/functionalities/breakpoint/break_in_loaded_dylib/Makefile index 0f3fb37bdadf3cf0a787c1fc95e59e59ab68f1cb..112210e7e2c60357a5dcdf84436fdacfaeec5ab2 100644 --- a/lldb/test/API/functionalities/breakpoint/break_in_loaded_dylib/Makefile +++ b/lldb/test/API/functionalities/breakpoint/break_in_loaded_dylib/Makefile @@ -2,7 +2,7 @@ CXX_SOURCES := main.cpp USE_LIBDL := 1 lib_b: - $(MAKE) -f $(MAKEFILE_RULES) \ + "$(MAKE)" -f $(MAKEFILE_RULES) \ DYLIB_ONLY=YES DYLIB_CXX_SOURCES=b.cpp DYLIB_NAME=lib_b all: lib_b diff --git a/lldb/test/API/functionalities/dlopen_other_executable/Makefile b/lldb/test/API/functionalities/dlopen_other_executable/Makefile index 113b9fd7d3f18ec85b0232ba4e57fc01f366106a..51fc01bdde75a4d71e18aa04b3c6e9a1d5279289 100644 --- a/lldb/test/API/functionalities/dlopen_other_executable/Makefile +++ b/lldb/test/API/functionalities/dlopen_other_executable/Makefile @@ -2,7 +2,7 @@ C_SOURCES := main.c USE_LIBDL := 1 other: - $(MAKE) -f $(MAKEFILE_RULES) C_SOURCES=other.c EXE=other + "$(MAKE)" -f $(MAKEFILE_RULES) C_SOURCES=other.c EXE=other all: other include Makefile.rules diff --git a/lldb/test/API/functionalities/exec/Makefile b/lldb/test/API/functionalities/exec/Makefile index 8b9148ea8a3551cd952ed7498ed50ef3c01029ec..65d4680077d26d8320baf54069b55c91c1de6b91 100644 --- a/lldb/test/API/functionalities/exec/Makefile +++ b/lldb/test/API/functionalities/exec/Makefile @@ -5,5 +5,5 @@ all: secondprog include Makefile.rules secondprog: secondprog.cpp - $(MAKE) -f $(MAKEFILE_RULES) \ + "$(MAKE)" -f $(MAKEFILE_RULES) \ CXX_SOURCES=secondprog.cpp EXE=secondprog diff --git a/lldb/test/API/functionalities/jitloader_gdb/Makefile b/lldb/test/API/functionalities/jitloader_gdb/Makefile index 357b1f83684fe2f45fb5e8d4753ed31de5cf399e..9998cc9cf833c7400811283b286c562caae17782 100644 --- a/lldb/test/API/functionalities/jitloader_gdb/Makefile +++ b/lldb/test/API/functionalities/jitloader_gdb/Makefile @@ -5,5 +5,5 @@ all: a.out simple include Makefile.rules simple: - $(MAKE) -f $(MAKEFILE_RULES) \ + "$(MAKE)" -f $(MAKEFILE_RULES) \ C_SOURCES=simple.c EXE=simple diff 
--git a/lldb/test/API/functionalities/limit-debug-info/Makefile b/lldb/test/API/functionalities/limit-debug-info/Makefile index 874b3a15e0fee9bf6e9446d15610948169cebee9..fa867a7aeb7c6cc46b63dab5db4d03c06ac6b11a 100644 --- a/lldb/test/API/functionalities/limit-debug-info/Makefile +++ b/lldb/test/API/functionalities/limit-debug-info/Makefile @@ -17,11 +17,11 @@ include Makefile.rules a.out: libone libtwo libone: - $(MAKE) -f $(MAKEFILE_RULES) \ + "$(MAKE)" -f $(MAKEFILE_RULES) \ DYLIB_ONLY=YES DYLIB_CXX_SOURCES=one.cpp DYLIB_NAME=one \ CFLAGS_EXTRAS="$(ONE_CXXFLAGS)" libtwo: libone - $(MAKE) -f $(MAKEFILE_RULES) \ + "$(MAKE)" -f $(MAKEFILE_RULES) \ DYLIB_ONLY=YES DYLIB_CXX_SOURCES=two.cpp DYLIB_NAME=two \ CFLAGS_EXTRAS="$(TWO_CXXFLAGS)" LD_EXTRAS="-L. -lone" diff --git a/lldb/test/API/functionalities/load_after_attach/Makefile b/lldb/test/API/functionalities/load_after_attach/Makefile index 0f3fb37bdadf3cf0a787c1fc95e59e59ab68f1cb..112210e7e2c60357a5dcdf84436fdacfaeec5ab2 100644 --- a/lldb/test/API/functionalities/load_after_attach/Makefile +++ b/lldb/test/API/functionalities/load_after_attach/Makefile @@ -2,7 +2,7 @@ CXX_SOURCES := main.cpp USE_LIBDL := 1 lib_b: - $(MAKE) -f $(MAKEFILE_RULES) \ + "$(MAKE)" -f $(MAKEFILE_RULES) \ DYLIB_ONLY=YES DYLIB_CXX_SOURCES=b.cpp DYLIB_NAME=lib_b all: lib_b diff --git a/lldb/test/API/functionalities/load_lazy/Makefile b/lldb/test/API/functionalities/load_lazy/Makefile index 81bc7dcb4d05f08a521979e830335e3a4db16f89..8e1d06b1e39c3f10edf915bdee623ec8f50325d6 100644 --- a/lldb/test/API/functionalities/load_lazy/Makefile +++ b/lldb/test/API/functionalities/load_lazy/Makefile @@ -17,13 +17,13 @@ else endif t1: t2_0 - $(MAKE) VPATH=$(SRCDIR) -f $(MAKEFILE_RULES) \ + "$(MAKE)" VPATH=$(SRCDIR) -f $(MAKEFILE_RULES) \ DYLIB_ONLY=YES DYLIB_C_SOURCES=t1.c DYLIB_NAME=t1 LD_EXTRAS="-L. $(LINKFLAGS)" t2_0: - $(MAKE) VPATH=$(SRCDIR) -f $(MAKEFILE_RULES) \ + "$(MAKE)" VPATH=$(SRCDIR) -f $(MAKEFILE_RULES) \ DYLIB_ONLY=YES DYLIB_C_SOURCES=t2_0.c DYLIB_NAME=t2_0 t2_1: - $(MAKE) VPATH=$(SRCDIR) -f $(MAKEFILE_RULES) \ + "$(MAKE)" VPATH=$(SRCDIR) -f $(MAKEFILE_RULES) \ DYLIB_ONLY=YES DYLIB_C_SOURCES=t2_1.c DYLIB_NAME=t2_1 diff --git a/lldb/test/API/functionalities/load_unload/Makefile b/lldb/test/API/functionalities/load_unload/Makefile index e73ec73108764111b96fa5b6f94b7da6b065487d..dd7d16029427a130282ac536ed918bb822651fd6 100644 --- a/lldb/test/API/functionalities/load_unload/Makefile +++ b/lldb/test/API/functionalities/load_unload/Makefile @@ -7,25 +7,25 @@ a.out: lib_b lib_a lib_c lib_d hidden_lib_d include Makefile.rules lib_a: lib_b - $(MAKE) -f $(MAKEFILE_RULES) \ + "$(MAKE)" -f $(MAKEFILE_RULES) \ DYLIB_ONLY=YES DYLIB_CXX_SOURCES=a.cpp DYLIB_NAME=loadunload_a \ LD_EXTRAS="-L. 
-lloadunload_b" lib_b: - $(MAKE) -f $(MAKEFILE_RULES) \ + "$(MAKE)" -f $(MAKEFILE_RULES) \ DYLIB_ONLY=YES DYLIB_CXX_SOURCES=b.cpp DYLIB_NAME=loadunload_b lib_c: - $(MAKE) -f $(MAKEFILE_RULES) \ + "$(MAKE)" -f $(MAKEFILE_RULES) \ DYLIB_ONLY=YES DYLIB_CXX_SOURCES=c.cpp DYLIB_NAME=loadunload_c lib_d: - $(MAKE) -f $(MAKEFILE_RULES) \ + "$(MAKE)" -f $(MAKEFILE_RULES) \ DYLIB_ONLY=YES DYLIB_CXX_SOURCES=d.cpp DYLIB_NAME=loadunload_d ifeq ($(OS),Darwin) install_name_tool -id @executable_path/libloadunload_d.dylib libloadunload_d.dylib endif hidden_lib_d: hidden - $(MAKE) VPATH=$(SRCDIR)/hidden -C hidden -f $(MAKEFILE_RULES) \ + "$(MAKE)" VPATH=$(SRCDIR)/hidden -C hidden -f $(MAKEFILE_RULES) \ DYLIB_ONLY=YES DYLIB_CXX_SOURCES=d.cpp DYLIB_NAME=loadunload_d diff --git a/lldb/test/API/functionalities/load_using_paths/Makefile b/lldb/test/API/functionalities/load_using_paths/Makefile index 814a96013756a9bcedb7aed62bf541680a2d75a7..f973a389d585f39510f31304dd0355402b819e43 100644 --- a/lldb/test/API/functionalities/load_using_paths/Makefile +++ b/lldb/test/API/functionalities/load_using_paths/Makefile @@ -6,6 +6,6 @@ all: hidden_lib a.out include Makefile.rules hidden_lib: - $(MAKE) VPATH=$(SRCDIR)/hidden -C hidden -f $(MAKEFILE_RULES) \ + "$(MAKE)" VPATH=$(SRCDIR)/hidden -C hidden -f $(MAKEFILE_RULES) \ DYLIB_ONLY=YES DYLIB_CXX_SOURCES=d.cpp DYLIB_NAME=loadunload diff --git a/lldb/test/API/functionalities/postmortem/minidump-new/TestMiniDumpNew.py b/lldb/test/API/functionalities/postmortem/minidump-new/TestMiniDumpNew.py index 5a0b6e790a424c14f75e4eac90c6403b7e466463..8776d72ecbc02703e528101d6fd3edc360186fca 100644 --- a/lldb/test/API/functionalities/postmortem/minidump-new/TestMiniDumpNew.py +++ b/lldb/test/API/functionalities/postmortem/minidump-new/TestMiniDumpNew.py @@ -524,3 +524,23 @@ class MiniDumpNewTestCase(TestBase): self.assertStopReason(thread.GetStopReason(), lldb.eStopReasonSignal) stop_description = thread.GetStopDescription(256) self.assertIn("SIGSEGV", stop_description) + + def test_breakpoint_on_minidump(self): + """ + Test that LLDB breakpoints are recorded in Minidumps + """ + yaml = "linux-x86_64-exceptiondescription.yaml" + core = self.getBuildArtifact("breakpoint.core.dmp") + self.yaml2obj(yaml, core) + try: + # Create a target with the object file we just created from YAML + target = self.dbg.CreateTarget(None) + self.assertTrue(target, VALID_TARGET) + process = target.LoadCore(core) + self.assertTrue(process, VALID_PROCESS) + thread = process.GetThreadAtIndex(0) + stop_reason = thread.GetStopDescription(256) + self.assertIn("breakpoint 1.1", stop_reason) + finally: + if os.path.isfile(core): + os.unlink(core) diff --git a/lldb/test/API/functionalities/postmortem/minidump-new/linux-x86_64-exceptiondescription.yaml b/lldb/test/API/functionalities/postmortem/minidump-new/linux-x86_64-exceptiondescription.yaml new file mode 100644 index 0000000000000000000000000000000000000000..bf26e05cd775aefa279bbaa4577e13f9a03ba489 --- /dev/null +++ b/lldb/test/API/functionalities/postmortem/minidump-new/linux-x86_64-exceptiondescription.yaml @@ -0,0 +1,37 @@ +--- !minidump +Streams: + - Type: SystemInfo + Processor Arch: AMD64 + Processor Level: 6 + Processor Revision: 15876 + Number of Processors: 40 + Platform ID: Linux + CSD Version: 'Linux 3.13.0-91-generic' + CPU: + Vendor ID: GenuineIntel + Version Info: 0x00000000 + Feature Info: 0x00000000 + - Type: ThreadList + Threads: + - Thread Id: 0x31F222 + Context: 00000000000000 + Stack: + Start of Memory Range: 0x7FFFFFFFD660 + Content: '' + - 
Type: Exception + Thread ID: 0x31F222 + Exception Record: + Exception Code: 0x2 + Exception Flags: 0x4C4C4442 + Exception Address: 0x555555556671 + Number of Parameters: 1 + Parameter 0: 0x696F706B61657262 + Parameter 1: 0x312E3120746E + Parameter 2: 0x1 + Parameter 3: 0x8000000000000000 + Parameter 4: 0x200000002 + Parameter 5: 0x8000000000000002 + Parameter 7: 0x555555556671 + Parameter 8: 0x1 + Thread Context: '' +... diff --git a/lldb/test/API/functionalities/postmortem/minidump/fizzbuzz.syms b/lldb/test/API/functionalities/postmortem/minidump/fizzbuzz.syms index cab06c1c9d50b1f41895de31c65163f998ddca82..e817a491af5750433ada17a846153d6ef460778a 100644 --- a/lldb/test/API/functionalities/postmortem/minidump/fizzbuzz.syms +++ b/lldb/test/API/functionalities/postmortem/minidump/fizzbuzz.syms @@ -1,2 +1,2 @@ -MODULE windows x86 0F45B7919A9646F9BF8F2D6076EA421A11 fizzbuzz.pdb -PUBLIC 1000 0 main +MODULE windows x86 0F45B7919A9646F9BF8F2D6076EA421A11 fizzbuzz.pdb +PUBLIC 1000 0 main diff --git a/lldb/test/API/functionalities/scripted_process/Makefile b/lldb/test/API/functionalities/scripted_process/Makefile index ba739451fc7ef3e9f979bbc242a2731441b186e0..d4f12fbb3c4ef42a00c1163b285e2f5acfaf03ee 100644 --- a/lldb/test/API/functionalities/scripted_process/Makefile +++ b/lldb/test/API/functionalities/scripted_process/Makefile @@ -9,7 +9,7 @@ CXXFLAGS_EXTRAS := -target $(TRIPLE) all: libbaz.dylib a.out libbaz.dylib: baz.cpp - $(MAKE) -f $(MAKEFILE_RULES) \ + "$(MAKE)" -f $(MAKEFILE_RULES) \ DYLIB_ONLY=YES DYLIB_NAME=baz DYLIB_CXX_SOURCES=baz.cpp include Makefile.rules diff --git a/lldb/test/API/functionalities/stop-on-sharedlibrary-load/Makefile b/lldb/test/API/functionalities/stop-on-sharedlibrary-load/Makefile index 4abcab84eac2922c2dbe19e15c05b3f8b84333c8..e4b0e86c0c36c8ab55c74f4d98565c2217db9eb2 100644 --- a/lldb/test/API/functionalities/stop-on-sharedlibrary-load/Makefile +++ b/lldb/test/API/functionalities/stop-on-sharedlibrary-load/Makefile @@ -6,11 +6,11 @@ a.out: lib_a lib_b include Makefile.rules lib_a: - $(MAKE) -f $(MAKEFILE_RULES) \ + "$(MAKE)" -f $(MAKEFILE_RULES) \ DYLIB_ONLY=YES DYLIB_CXX_SOURCES=a.cpp DYLIB_NAME=load_a lib_b: - $(MAKE) -f $(MAKEFILE_RULES) \ + "$(MAKE)" -f $(MAKEFILE_RULES) \ DYLIB_ONLY=YES DYLIB_CXX_SOURCES=b.cpp DYLIB_NAME=load_b diff --git a/lldb/test/API/functionalities/tail_call_frames/cross_dso/Makefile b/lldb/test/API/functionalities/tail_call_frames/cross_dso/Makefile index 42c010be9a03a57bedb97861fc2409406e73ef14..963ce2ac94d92aceacaa554dc17c504048ab5e9b 100644 --- a/lldb/test/API/functionalities/tail_call_frames/cross_dso/Makefile +++ b/lldb/test/API/functionalities/tail_call_frames/cross_dso/Makefile @@ -10,4 +10,4 @@ a.out: lib_One lib_Two lib_One: lib_Two lib_%: - $(MAKE) VPATH=$(SRCDIR)/$* -I $(SRCDIR) -f $(SRCDIR)/$*.mk DSYMUTIL=$(DSYMUTIL) + "$(MAKE)" VPATH=$(SRCDIR)/$* -I $(SRCDIR) -f $(SRCDIR)/$*.mk DSYMUTIL=$(DSYMUTIL) diff --git a/lldb/test/API/functionalities/target-new-solib-notifications/Makefile b/lldb/test/API/functionalities/target-new-solib-notifications/Makefile index 6c61d210eeb2f3f55fbfa16e831e9e537182f023..745f6cc9d65ae3905f75cedc5c39270e1fa5bfbc 100644 --- a/lldb/test/API/functionalities/target-new-solib-notifications/Makefile +++ b/lldb/test/API/functionalities/target-new-solib-notifications/Makefile @@ -1,23 +1,23 @@ -CXX_SOURCES := main.cpp +CXX_SOURCES := main.cpp LD_EXTRAS := -L. 
-l_d -l_c -l_a -l_b - -a.out: lib_b lib_a lib_c lib_d - -include Makefile.rules - -lib_a: lib_b - $(MAKE) -f $(MAKEFILE_RULES) \ + +a.out: lib_b lib_a lib_c lib_d + +include Makefile.rules + +lib_a: lib_b + "$(MAKE)" -f $(MAKEFILE_RULES) \ DYLIB_ONLY=YES DYLIB_CXX_SOURCES=a.cpp DYLIB_NAME=_a \ LD_EXTRAS="-L. -l_b" - -lib_b: - $(MAKE) -f $(MAKEFILE_RULES) \ + +lib_b: + "$(MAKE)" -f $(MAKEFILE_RULES) \ DYLIB_ONLY=YES DYLIB_CXX_SOURCES=b.cpp DYLIB_NAME=_b - -lib_c: - $(MAKE) -f $(MAKEFILE_RULES) \ + +lib_c: + "$(MAKE)" -f $(MAKEFILE_RULES) \ DYLIB_ONLY=YES DYLIB_CXX_SOURCES=c.cpp DYLIB_NAME=_c - -lib_d: - $(MAKE) -f $(MAKEFILE_RULES) \ + +lib_d: + "$(MAKE)" -f $(MAKEFILE_RULES) \ DYLIB_ONLY=YES DYLIB_CXX_SOURCES=d.cpp DYLIB_NAME=_d diff --git a/lldb/test/API/functionalities/target-new-solib-notifications/a.cpp b/lldb/test/API/functionalities/target-new-solib-notifications/a.cpp index 778b46ed5cef1a0fa1751f2cd9e137ddaf1918fa..66633b70ee1e50793333cc3f08b33208f3cbef08 100644 --- a/lldb/test/API/functionalities/target-new-solib-notifications/a.cpp +++ b/lldb/test/API/functionalities/target-new-solib-notifications/a.cpp @@ -1,3 +1,3 @@ -extern "C" int b_function(); - -extern "C" int a_function() { return b_function(); } +extern "C" int b_function(); + +extern "C" int a_function() { return b_function(); } diff --git a/lldb/test/API/functionalities/target-new-solib-notifications/b.cpp b/lldb/test/API/functionalities/target-new-solib-notifications/b.cpp index 4f1a4032ee0eedd9298bac3823d046c52a4ecf9e..8b16fbdb5728cde6c931111791513e01ea20798b 100644 --- a/lldb/test/API/functionalities/target-new-solib-notifications/b.cpp +++ b/lldb/test/API/functionalities/target-new-solib-notifications/b.cpp @@ -1 +1 @@ -extern "C" int b_function() { return 500; } +extern "C" int b_function() { return 500; } diff --git a/lldb/test/API/functionalities/target-new-solib-notifications/c.cpp b/lldb/test/API/functionalities/target-new-solib-notifications/c.cpp index 8abd1b155a75904856db20904cd7551921142bfa..120c88f2bb609a8498bd05b8411c2be211140870 100644 --- a/lldb/test/API/functionalities/target-new-solib-notifications/c.cpp +++ b/lldb/test/API/functionalities/target-new-solib-notifications/c.cpp @@ -1 +1 @@ -extern "C" int c_function() { return 600; } +extern "C" int c_function() { return 600; } diff --git a/lldb/test/API/functionalities/target-new-solib-notifications/d.cpp b/lldb/test/API/functionalities/target-new-solib-notifications/d.cpp index 58888a29ba323aad0908b6bf03cb90821528e8bd..d37ad2621ae4e99790c6d6e33cd357e9fef42d6f 100644 --- a/lldb/test/API/functionalities/target-new-solib-notifications/d.cpp +++ b/lldb/test/API/functionalities/target-new-solib-notifications/d.cpp @@ -1 +1 @@ -extern "C" int d_function() { return 700; } +extern "C" int d_function() { return 700; } diff --git a/lldb/test/API/functionalities/target-new-solib-notifications/main.cpp b/lldb/test/API/functionalities/target-new-solib-notifications/main.cpp index 77b38c5ccdc6985612733b1d72cee1bea9427314..bd2c79cdab9daa6cc1d9de5103651e3aac20063d 100644 --- a/lldb/test/API/functionalities/target-new-solib-notifications/main.cpp +++ b/lldb/test/API/functionalities/target-new-solib-notifications/main.cpp @@ -1,16 +1,16 @@ -#include - -extern "C" int a_function(); -extern "C" int c_function(); -extern "C" int b_function(); -extern "C" int d_function(); - -int main() { - a_function(); - b_function(); - c_function(); - d_function(); - - puts("running"); // breakpoint here - return 0; -} +#include + +extern "C" int a_function(); +extern "C" int 
c_function(); +extern "C" int b_function(); +extern "C" int d_function(); + +int main() { + a_function(); + b_function(); + c_function(); + d_function(); + + puts("running"); // breakpoint here + return 0; +} diff --git a/lldb/test/API/functionalities/unwind/zeroth_frame/Makefile b/lldb/test/API/functionalities/unwind/zeroth_frame/Makefile index 15a931850e17e589eaadfe2551e3fc1f0e151de9..10495940055b63d2b69fd0ee465e69dad1889d2f 100644 --- a/lldb/test/API/functionalities/unwind/zeroth_frame/Makefile +++ b/lldb/test/API/functionalities/unwind/zeroth_frame/Makefile @@ -1,3 +1,3 @@ -C_SOURCES := main.c - -include Makefile.rules +C_SOURCES := main.c + +include Makefile.rules diff --git a/lldb/test/API/functionalities/unwind/zeroth_frame/TestZerothFrame.py b/lldb/test/API/functionalities/unwind/zeroth_frame/TestZerothFrame.py index d660844405e1374d97c233271d1a4f52f5e60a47..70f72c72c8340e2fff5d1290483b5b62722bc0f7 100644 --- a/lldb/test/API/functionalities/unwind/zeroth_frame/TestZerothFrame.py +++ b/lldb/test/API/functionalities/unwind/zeroth_frame/TestZerothFrame.py @@ -1,88 +1,88 @@ -""" -Test that line information is recalculated properly for a frame when it moves -from the middle of the backtrace to a zero index. - -This is a regression test for a StackFrame bug, where whether frame is zero or -not depends on an internal field. When LLDB was updating its frame list value -of the field wasn't copied into existing StackFrame instances, so those -StackFrame instances, would use an incorrect line entry evaluation logic in -situations if it was in the middle of the stack frame list (not zeroth), and -then moved to the top position. The difference in logic is that for zeroth -frames line entry is returned for program counter, while for other frame -(except for those that "behave like zeroth") it is for the instruction -preceding PC, as PC points to the next instruction after function call. When -the bug is present, when execution stops at the second breakpoint -SBFrame.GetLineEntry() returns line entry for the previous line, rather than -the one with a breakpoint. Note that this is specific to -SBFrame.GetLineEntry(), SBFrame.GetPCAddress().GetLineEntry() would return -correct entry. - -This bug doesn't reproduce through an LLDB interpretator, however it happens -when using API directly, for example in LLDB-MI. -""" - -import lldb -from lldbsuite.test.decorators import * -from lldbsuite.test.lldbtest import * -from lldbsuite.test import lldbutil - - -class ZerothFrame(TestBase): - def test(self): - """ - Test that line information is recalculated properly for a frame when it moves - from the middle of the backtrace to a zero index. - """ - self.build() - self.setTearDownCleanup() - - exe = self.getBuildArtifact("a.out") - target = self.dbg.CreateTarget(exe) - self.assertTrue(target, VALID_TARGET) - - main_dot_c = lldb.SBFileSpec("main.c") - bp1 = target.BreakpointCreateBySourceRegex( - "// Set breakpoint 1 here", main_dot_c - ) - bp2 = target.BreakpointCreateBySourceRegex( - "// Set breakpoint 2 here", main_dot_c - ) - - process = target.LaunchSimple(None, None, self.get_process_working_directory()) - self.assertTrue(process, VALID_PROCESS) - - thread = self.thread() - - if self.TraceOn(): - print("Backtrace at the first breakpoint:") - for f in thread.frames: - print(f) - - # Check that we have stopped at correct breakpoint. 
- self.assertEqual( - thread.frame[0].GetLineEntry().GetLine(), - bp1.GetLocationAtIndex(0).GetAddress().GetLineEntry().GetLine(), - "LLDB reported incorrect line number.", - ) - - # Important to use SBProcess::Continue() instead of - # self.runCmd('continue'), because the problem doesn't reproduce with - # 'continue' command. - process.Continue() - - if self.TraceOn(): - print("Backtrace at the second breakpoint:") - for f in thread.frames: - print(f) - # Check that we have stopped at the breakpoint - self.assertEqual( - thread.frame[0].GetLineEntry().GetLine(), - bp2.GetLocationAtIndex(0).GetAddress().GetLineEntry().GetLine(), - "LLDB reported incorrect line number.", - ) - # Double-check with GetPCAddress() - self.assertEqual( - thread.frame[0].GetLineEntry().GetLine(), - thread.frame[0].GetPCAddress().GetLineEntry().GetLine(), - "LLDB reported incorrect line number.", - ) +""" +Test that line information is recalculated properly for a frame when it moves +from the middle of the backtrace to a zero index. + +This is a regression test for a StackFrame bug, where whether frame is zero or +not depends on an internal field. When LLDB was updating its frame list value +of the field wasn't copied into existing StackFrame instances, so those +StackFrame instances, would use an incorrect line entry evaluation logic in +situations if it was in the middle of the stack frame list (not zeroth), and +then moved to the top position. The difference in logic is that for zeroth +frames line entry is returned for program counter, while for other frame +(except for those that "behave like zeroth") it is for the instruction +preceding PC, as PC points to the next instruction after function call. When +the bug is present, when execution stops at the second breakpoint +SBFrame.GetLineEntry() returns line entry for the previous line, rather than +the one with a breakpoint. Note that this is specific to +SBFrame.GetLineEntry(), SBFrame.GetPCAddress().GetLineEntry() would return +correct entry. + +This bug doesn't reproduce through an LLDB interpretator, however it happens +when using API directly, for example in LLDB-MI. +""" + +import lldb +from lldbsuite.test.decorators import * +from lldbsuite.test.lldbtest import * +from lldbsuite.test import lldbutil + + +class ZerothFrame(TestBase): + def test(self): + """ + Test that line information is recalculated properly for a frame when it moves + from the middle of the backtrace to a zero index. + """ + self.build() + self.setTearDownCleanup() + + exe = self.getBuildArtifact("a.out") + target = self.dbg.CreateTarget(exe) + self.assertTrue(target, VALID_TARGET) + + main_dot_c = lldb.SBFileSpec("main.c") + bp1 = target.BreakpointCreateBySourceRegex( + "// Set breakpoint 1 here", main_dot_c + ) + bp2 = target.BreakpointCreateBySourceRegex( + "// Set breakpoint 2 here", main_dot_c + ) + + process = target.LaunchSimple(None, None, self.get_process_working_directory()) + self.assertTrue(process, VALID_PROCESS) + + thread = self.thread() + + if self.TraceOn(): + print("Backtrace at the first breakpoint:") + for f in thread.frames: + print(f) + + # Check that we have stopped at correct breakpoint. + self.assertEqual( + thread.frame[0].GetLineEntry().GetLine(), + bp1.GetLocationAtIndex(0).GetAddress().GetLineEntry().GetLine(), + "LLDB reported incorrect line number.", + ) + + # Important to use SBProcess::Continue() instead of + # self.runCmd('continue'), because the problem doesn't reproduce with + # 'continue' command. 
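+        # After this continue, a frame that was previously in the middle of
+        # the backtrace becomes frame zero, whose line entry is resolved at
+        # the PC itself rather than at the instruction preceding it (see the
+        # docstring above).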
+ process.Continue() + + if self.TraceOn(): + print("Backtrace at the second breakpoint:") + for f in thread.frames: + print(f) + # Check that we have stopped at the breakpoint + self.assertEqual( + thread.frame[0].GetLineEntry().GetLine(), + bp2.GetLocationAtIndex(0).GetAddress().GetLineEntry().GetLine(), + "LLDB reported incorrect line number.", + ) + # Double-check with GetPCAddress() + self.assertEqual( + thread.frame[0].GetLineEntry().GetLine(), + thread.frame[0].GetPCAddress().GetLineEntry().GetLine(), + "LLDB reported incorrect line number.", + ) diff --git a/lldb/test/API/lang/c/conflicting-symbol/Makefile b/lldb/test/API/lang/c/conflicting-symbol/Makefile index 81594a1265da244a6a0b21778d3be64e82e0b61f..1331c4e1ebfaf24e8ede2e72bc6cb506f3d87fa4 100644 --- a/lldb/test/API/lang/c/conflicting-symbol/Makefile +++ b/lldb/test/API/lang/c/conflicting-symbol/Makefile @@ -7,4 +7,4 @@ include Makefile.rules a.out: lib_One lib_Two lib_%: - $(MAKE) VPATH=$(SRCDIR)/$* -I $(SRCDIR) -f $(SRCDIR)/$*.mk + "$(MAKE)" VPATH=$(SRCDIR)/$* -I $(SRCDIR) -f $(SRCDIR)/$*.mk diff --git a/lldb/test/API/lang/cpp/incomplete-types/Makefile b/lldb/test/API/lang/cpp/incomplete-types/Makefile index f42ac2e81cc76e70dc36e455cfd60e19fa4c2c90..0cf3f6a31caa2fd04400f01930bd657b7a1a734b 100644 --- a/lldb/test/API/lang/cpp/incomplete-types/Makefile +++ b/lldb/test/API/lang/cpp/incomplete-types/Makefile @@ -16,7 +16,7 @@ main.o: CFLAGS_EXTRAS = -flimit-debug-info limit: a.o main.o mkdir -p build_limit - $(MAKE) -C $(BUILDDIR)/build_limit -f $(MAKEFILE_RULES) \ + "$(MAKE)" -C $(BUILDDIR)/build_limit -f $(MAKEFILE_RULES) \ EXE=../limit CXX_SOURCES="length.cpp ../a.o ../main.o" \ CFLAGS_EXTRAS=-flimit-debug-info NO_LIMIT_DEBUG_INFO_FLAGS="" diff --git a/lldb/test/API/lang/cpp/namespace_definitions/Makefile b/lldb/test/API/lang/cpp/namespace_definitions/Makefile index fc9165f67f428ec9a4bd96abc796631482798a4c..b17d70fc92871017a829d3aa6269784f78a6e289 100644 --- a/lldb/test/API/lang/cpp/namespace_definitions/Makefile +++ b/lldb/test/API/lang/cpp/namespace_definitions/Makefile @@ -6,10 +6,10 @@ a.out: liba libb include Makefile.rules liba: - $(MAKE) -f $(MAKEFILE_RULES) \ + "$(MAKE)" -f $(MAKEFILE_RULES) \ DYLIB_ONLY=YES DYLIB_NAME=a DYLIB_CXX_SOURCES=a.cpp libb: - $(MAKE) -f $(MAKEFILE_RULES) \ + "$(MAKE)" -f $(MAKEFILE_RULES) \ DYLIB_ONLY=YES DYLIB_NAME=b DYLIB_CXX_SOURCES=b.cpp diff --git a/lldb/test/API/lang/cpp/stl/Makefile b/lldb/test/API/lang/cpp/stl/Makefile index 8534fa9b00209e7571120ae8df618e9a8ddb3d28..99998b20bcb0502181cbf488a2034823e5f66d22 100644 --- a/lldb/test/API/lang/cpp/stl/Makefile +++ b/lldb/test/API/lang/cpp/stl/Makefile @@ -1,5 +1,3 @@ CXX_SOURCES := main.cpp -USE_SYSTEM_STDLIB := 1 - include Makefile.rules diff --git a/lldb/test/API/lang/cpp/stl/TestStdCXXDisassembly.py b/lldb/test/API/lang/cpp/stl/TestStdCXXDisassembly.py index 8676ee16d83c04355c3806ea7e862bcb1e453f59..06f338b3ed1ded46eb82c51dad89ba31f7319e0a 100644 --- a/lldb/test/API/lang/cpp/stl/TestStdCXXDisassembly.py +++ b/lldb/test/API/lang/cpp/stl/TestStdCXXDisassembly.py @@ -43,7 +43,10 @@ class StdCXXDisassembleTestCase(TestBase): # At this point, lib_stdcxx is the full path to the stdc++ library and # module is the corresponding SBModule. - self.expect(lib_stdcxx, "Library StdC++ is located", exe=False, substrs=["lib"]) + if "lib" not in lib_stdcxx: + self.skipTest( + "This test requires libstdc++.so or libc++.dylib in the target's module list." 
+ ) self.runCmd("image dump symtab '%s'" % lib_stdcxx) raw_output = self.res.GetOutput() diff --git a/lldb/test/API/lang/objc/conflicting-definition/Makefile b/lldb/test/API/lang/objc/conflicting-definition/Makefile index 00a0769a086f086c0f98fb32b07e7de83b1bd28c..cba79c94d46baacb85326434c41692643a12f2f5 100644 --- a/lldb/test/API/lang/objc/conflicting-definition/Makefile +++ b/lldb/test/API/lang/objc/conflicting-definition/Makefile @@ -9,14 +9,14 @@ include Makefile.rules libTest.dylib: Test/Test.m mkdir -p Test - $(MAKE) MAKE_DSYM=YES -f $(MAKEFILE_RULES) \ + "$(MAKE)" MAKE_DSYM=YES -f $(MAKEFILE_RULES) \ DYLIB_ONLY=YES DYLIB_NAME=Test DYLIB_OBJC_SOURCES=Test/Test.m \ LD_EXTRAS="-lobjc -framework Foundation" \ CFLAGS_EXTRAS=-I$(SRCDIR) libTestExt.dylib: TestExt/TestExt.m mkdir -p TestExt - $(MAKE) MAKE_DSYM=YES -f $(MAKEFILE_RULES) \ + "$(MAKE)" MAKE_DSYM=YES -f $(MAKEFILE_RULES) \ DYLIB_ONLY=YES DYLIB_NAME=TestExt DYLIB_OBJC_SOURCES=TestExt/TestExt.m \ LD_EXTRAS="-lobjc -framework Foundation -lTest -L." \ CFLAGS_EXTRAS=-I$(SRCDIR) diff --git a/lldb/test/API/lang/objc/modules-hash-mismatch/Makefile b/lldb/test/API/lang/objc/modules-hash-mismatch/Makefile index 59bf009f68677bdfe3072222e20879f728d09b8e..57da670b69ab33d34709112b0b8abe19ea037254 100644 --- a/lldb/test/API/lang/objc/modules-hash-mismatch/Makefile +++ b/lldb/test/API/lang/objc/modules-hash-mismatch/Makefile @@ -5,7 +5,7 @@ USE_PRIVATE_MODULE_CACHE = YES .PHONY: update-module all: $(EXE) - $(MAKE) -f $(SRCDIR)/Makefile update-module + "$(MAKE)" -f $(SRCDIR)/Makefile update-module include Makefile.rules diff --git a/lldb/test/API/macosx/delay-init-dependency/Makefile b/lldb/test/API/macosx/delay-init-dependency/Makefile index 246ea0f34e1a1c6f34b7fcf6b4359342c7f65530..7421c68b79baa4d53a7e3076b78133b4c00bfaa8 100644 --- a/lldb/test/API/macosx/delay-init-dependency/Makefile +++ b/lldb/test/API/macosx/delay-init-dependency/Makefile @@ -7,5 +7,5 @@ all: build-libfoo a.out include Makefile.rules build-libfoo: foo.c - $(MAKE) -f $(MAKEFILE_RULES) \ + "$(MAKE)" -f $(MAKEFILE_RULES) \ DYLIB_C_SOURCES=foo.c DYLIB_NAME=foo DYLIB_ONLY=YES diff --git a/lldb/test/API/macosx/expedited-thread-pcs/Makefile b/lldb/test/API/macosx/expedited-thread-pcs/Makefile index 7799f06e770970b4b9d3c0d6096a7a8c080db9fc..73a969831e673267666bb7a87a72ecf7a3f94575 100644 --- a/lldb/test/API/macosx/expedited-thread-pcs/Makefile +++ b/lldb/test/API/macosx/expedited-thread-pcs/Makefile @@ -6,6 +6,6 @@ all: build-libfoo a.out include Makefile.rules build-libfoo: foo.c - $(MAKE) -f $(MAKEFILE_RULES) \ + "$(MAKE)" -f $(MAKEFILE_RULES) \ DYLIB_C_SOURCES=foo.c DYLIB_NAME=foo DYLIB_ONLY=YES diff --git a/lldb/test/API/macosx/indirect_symbol/Makefile b/lldb/test/API/macosx/indirect_symbol/Makefile index 9069302b39c4f860af56bdbc570c507caad98cb9..dee3e184fe19bf4ee2620a325fcad42cd586e4b7 100644 --- a/lldb/test/API/macosx/indirect_symbol/Makefile +++ b/lldb/test/API/macosx/indirect_symbol/Makefile @@ -7,11 +7,11 @@ all: build-libindirect build-libreepxoprt a.out include Makefile.rules build-libindirect: indirect.c - $(MAKE) -f $(MAKEFILE_RULES) \ + "$(MAKE)" -f $(MAKEFILE_RULES) \ DYLIB_C_SOURCES=indirect.c DYLIB_NAME=indirect DYLIB_ONLY=YES \ LD_EXTRAS="-Wl,-image_base,0x200000000" build-libreepxoprt: reexport.c - $(MAKE) -f $(MAKEFILE_RULES) \ + "$(MAKE)" -f $(MAKEFILE_RULES) \ DYLIB_C_SOURCES=reexport.c DYLIB_NAME=reexport DYLIB_ONLY=YES \ LD_EXTRAS="-L. 
-lindirect -Wl,-alias_list,$(SRCDIR)/alias.list" diff --git a/lldb/test/API/macosx/lc-note/kern-ver-str/Makefile b/lldb/test/API/macosx/lc-note/kern-ver-str/Makefile index 05d9552a802091249b81488e43f5b2df46804cb3..01b4acfdcfd2a1b82d849e6ddeca12c8a9357579 100644 --- a/lldb/test/API/macosx/lc-note/kern-ver-str/Makefile +++ b/lldb/test/API/macosx/lc-note/kern-ver-str/Makefile @@ -5,7 +5,7 @@ C_SOURCES := main.c all: a.out create-empty-corefile create-empty-corefile: - $(MAKE) -f $(MAKEFILE_RULES) EXE=create-empty-corefile \ + "$(MAKE)" -f $(MAKEFILE_RULES) EXE=create-empty-corefile \ CXX=$(CC) CXX_SOURCES=create-empty-corefile.cpp include Makefile.rules diff --git a/lldb/test/API/macosx/lc-note/multiple-binary-corefile/Makefile b/lldb/test/API/macosx/lc-note/multiple-binary-corefile/Makefile index 8e561f17383fe1b6a65e558c906e0d4c24e8d75e..229235cda999f9ae5ddf1e3205ba863ac3848f1e 100644 --- a/lldb/test/API/macosx/lc-note/multiple-binary-corefile/Makefile +++ b/lldb/test/API/macosx/lc-note/multiple-binary-corefile/Makefile @@ -10,11 +10,11 @@ create-empty-corefile: CXX_SOURCES=create-multibin-corefile.cpp libone.dylib: one.c - $(MAKE) -f $(MAKEFILE_RULES) \ + "$(MAKE)" -f $(MAKEFILE_RULES) \ DYLIB_ONLY=YES DYLIB_NAME=one DYLIB_C_SOURCES=one.c libtwo.dylib: two.c - $(MAKE) -f $(MAKEFILE_RULES) \ + "$(MAKE)" -f $(MAKEFILE_RULES) \ DYLIB_ONLY=YES DYLIB_NAME=two DYLIB_C_SOURCES=two.c include Makefile.rules diff --git a/lldb/test/API/macosx/macCatalystAppMacOSFramework/Makefile b/lldb/test/API/macosx/macCatalystAppMacOSFramework/Makefile index c77a186724fda1ae239ecd39a1d2ef2b8ee0b5de..0dc9e71c3276a3223b13872b1e95130cbe25fd30 100644 --- a/lldb/test/API/macosx/macCatalystAppMacOSFramework/Makefile +++ b/lldb/test/API/macosx/macCatalystAppMacOSFramework/Makefile @@ -11,7 +11,7 @@ override CC=xcrun clang all: libfoo.dylib a.out libfoo.dylib: foo.c - $(MAKE) -f $(MAKEFILE_RULES) \ + "$(MAKE)" -f $(MAKEFILE_RULES) \ DYLIB_ONLY=YES DYLIB_NAME=foo DYLIB_C_SOURCES=foo.c include Makefile.rules diff --git a/lldb/test/API/macosx/skinny-corefile/Makefile b/lldb/test/API/macosx/skinny-corefile/Makefile index efe37f3d2b8b2f0ea1cdd035e6f2056a5d111fba..fce43a36c33ac1ef2bfd7fda388c4159eda49fae 100644 --- a/lldb/test/API/macosx/skinny-corefile/Makefile +++ b/lldb/test/API/macosx/skinny-corefile/Makefile @@ -6,10 +6,10 @@ include Makefile.rules a.out: libto-be-removed libpresent libto-be-removed: libpresent - $(MAKE) -f $(MAKEFILE_RULES) \ + "$(MAKE)" -f $(MAKEFILE_RULES) \ DYLIB_ONLY=YES DYLIB_C_SOURCES=to-be-removed.c DYLIB_NAME=to-be-removed \ LD_EXTRAS="-L. 
-lpresent" libpresent: - $(MAKE) -f $(MAKEFILE_RULES) \ + "$(MAKE)" -f $(MAKEFILE_RULES) \ DYLIB_ONLY=YES DYLIB_C_SOURCES=present.c DYLIB_NAME=present diff --git a/lldb/test/API/python_api/debugger/Makefile b/lldb/test/API/python_api/debugger/Makefile index bfad5f33e86753978f4cfc8240afb55e8431f552..99998b20bcb0502181cbf488a2034823e5f66d22 100644 --- a/lldb/test/API/python_api/debugger/Makefile +++ b/lldb/test/API/python_api/debugger/Makefile @@ -1,3 +1,3 @@ -CXX_SOURCES := main.cpp - -include Makefile.rules +CXX_SOURCES := main.cpp + +include Makefile.rules diff --git a/lldb/test/API/tools/lldb-dap/breakpoint/Makefile b/lldb/test/API/tools/lldb-dap/breakpoint/Makefile index 30a6400184933dad7c2a1cdda35aa6a744ddcba3..7634f513e8523380b732ea9921e5fc4009fa2f36 100644 --- a/lldb/test/API/tools/lldb-dap/breakpoint/Makefile +++ b/lldb/test/API/tools/lldb-dap/breakpoint/Makefile @@ -15,5 +15,5 @@ main-copy.cpp: main.cpp # The following shared library will be used to test breakpoints under dynamic loading libother: other-copy.c - $(MAKE) -f $(MAKEFILE_RULES) \ + "$(MAKE)" -f $(MAKEFILE_RULES) \ DYLIB_ONLY=YES DYLIB_C_SOURCES=other-copy.c DYLIB_NAME=other diff --git a/lldb/test/API/tools/lldb-dap/send-event/Makefile b/lldb/test/API/tools/lldb-dap/send-event/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..10495940055b63d2b69fd0ee465e69dad1889d2f --- /dev/null +++ b/lldb/test/API/tools/lldb-dap/send-event/Makefile @@ -0,0 +1,3 @@ +C_SOURCES := main.c + +include Makefile.rules diff --git a/lldb/test/API/tools/lldb-dap/send-event/TestDAP_sendEvent.py b/lldb/test/API/tools/lldb-dap/send-event/TestDAP_sendEvent.py new file mode 100644 index 0000000000000000000000000000000000000000..de47651bb2fa1ba00883c9cc1f05d3ebc8a685fb --- /dev/null +++ b/lldb/test/API/tools/lldb-dap/send-event/TestDAP_sendEvent.py @@ -0,0 +1,67 @@ +""" +Test lldb-dap send-event integration. +""" + +import json + +from lldbsuite.test.decorators import * +from lldbsuite.test.lldbtest import * +import lldbdap_testcase + + +class TestDAP_sendEvent(lldbdap_testcase.DAPTestCaseBase): + def test_send_event(self): + """ + Test sending a custom event. + """ + program = self.getBuildArtifact("a.out") + source = "main.c" + custom_event_body = { + "key": 321, + "arr": [True], + } + self.build_and_launch( + program, + stopCommands=[ + "lldb-dap send-event my-custom-event-no-body", + "lldb-dap send-event my-custom-event '{}'".format( + json.dumps(custom_event_body) + ), + ], + ) + + breakpoint_line = line_number(source, "// breakpoint") + + self.set_source_breakpoints(source, [breakpoint_line]) + self.continue_to_next_stop() + + custom_event = self.dap_server.wait_for_event( + filter=["my-custom-event-no-body"] + ) + self.assertEquals(custom_event["event"], "my-custom-event-no-body") + self.assertIsNone(custom_event.get("body", None)) + + custom_event = self.dap_server.wait_for_event(filter=["my-custom-event"]) + self.assertEquals(custom_event["event"], "my-custom-event") + self.assertEquals(custom_event["body"], custom_event_body) + + def test_send_internal_event(self): + """ + Test sending an internal event produces an error. 
+ """ + program = self.getBuildArtifact("a.out") + source = "main.c" + self.build_and_launch(program) + + breakpoint_line = line_number(source, "// breakpoint") + + self.set_source_breakpoints(source, [breakpoint_line]) + self.continue_to_next_stop() + + resp = self.dap_server.request_evaluate( + "`lldb-dap send-event stopped", context="repl" + ) + self.assertRegex( + resp["body"]["result"], + r"Invalid use of lldb-dap send-event, event \"stopped\" should be handled by lldb-dap internally.", + ) diff --git a/lldb/test/API/tools/lldb-dap/send-event/main.c b/lldb/test/API/tools/lldb-dap/send-event/main.c new file mode 100644 index 0000000000000000000000000000000000000000..27bc22b94794b606972a166fbc9b14012005735c --- /dev/null +++ b/lldb/test/API/tools/lldb-dap/send-event/main.c @@ -0,0 +1,6 @@ +#include + +int main(int argc, char const *argv[]) { + printf("example\n"); // breakpoint 1 + return 0; +} diff --git a/lldb/test/API/tools/lldb-dap/startDebugging/TestDAP_startDebugging.py b/lldb/test/API/tools/lldb-dap/startDebugging/TestDAP_startDebugging.py index fd48e69cae5e252dec30de20bf64af17c45cc1c7..fd452d91e472bd68dc02cff1f4ca9fa363a59f2d 100644 --- a/lldb/test/API/tools/lldb-dap/startDebugging/TestDAP_startDebugging.py +++ b/lldb/test/API/tools/lldb-dap/startDebugging/TestDAP_startDebugging.py @@ -1,12 +1,10 @@ """ -Test lldb-dap startDebugging reverse request +Test lldb-dap start-debugging reverse requests. """ -import dap_server from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * -from lldbsuite.test import lldbutil import lldbdap_testcase @@ -25,7 +23,7 @@ class TestDAP_startDebugging(lldbdap_testcase.DAPTestCaseBase): self.set_source_breakpoints(source, [breakpoint_line]) self.continue_to_next_stop() self.dap_server.request_evaluate( - "`lldb-dap startDebugging attach '{\"pid\":321}'", context="repl" + "`lldb-dap start-debugging attach '{\"pid\":321}'", context="repl" ) self.continue_to_exit() diff --git a/lldb/test/API/tools/lldb-server/libraries-svr4/Makefile b/lldb/test/API/tools/lldb-server/libraries-svr4/Makefile index 5b5c1dcef783a5b69a104e24779408de7e53a90e..f13b1ac15928a3b09082c15b19fffd56e7aa0afb 100644 --- a/lldb/test/API/tools/lldb-server/libraries-svr4/Makefile +++ b/lldb/test/API/tools/lldb-server/libraries-svr4/Makefile @@ -9,11 +9,11 @@ a.out: svr4lib_a svr4lib_b_quote include Makefile.rules svr4lib_a: - $(MAKE) -f $(MAKEFILE_RULES) \ + "$(MAKE)" -f $(MAKEFILE_RULES) \ DYLIB_NAME=svr4lib_a DYLIB_CXX_SOURCES=svr4lib_a.cpp \ DYLIB_ONLY=YES svr4lib_b_quote: - $(MAKE) -f $(MAKEFILE_RULES) \ + "$(MAKE)" -f $(MAKEFILE_RULES) \ DYLIB_NAME=svr4lib_b\\\" DYLIB_CXX_SOURCES=svr4lib_b_quote.cpp \ DYLIB_ONLY=YES diff --git a/lldb/test/Shell/BuildScript/modes.test b/lldb/test/Shell/BuildScript/modes.test index 02311f712d770fb8c64e0d4ba3afd904d9649684..1ce50104855f464dcf9d1069c3a39e9ddf4efe3b 100644 --- a/lldb/test/Shell/BuildScript/modes.test +++ b/lldb/test/Shell/BuildScript/modes.test @@ -1,35 +1,35 @@ -RUN: %build -n --verbose --arch=32 --mode=compile --compiler=any -o %t/foo.out foobar.c \ -RUN: | FileCheck --check-prefix=COMPILE %s - -RUN: %build -n --verbose --arch=32 --mode=compile --compiler=any --outdir %t foo.c bar.c \ -RUN: | FileCheck --check-prefix=COMPILE-MULTI %s - -RUN: %build -n --verbose --arch=32 --mode=link --compiler=any -o %t/foo.exe foobar.obj \ -RUN: | FileCheck --check-prefix=LINK %s - -RUN: %build -n --verbose --arch=32 --mode=link --compiler=any -o %t/foobar.exe foo.obj bar.obj \ -RUN: | FileCheck --check-prefix=LINK-MULTI %s - -RUN: 
%build -n --verbose --arch=32 --mode=compile-and-link --compiler=any -o %t/foobar.exe foobar.c \ -RUN: | FileCheck --check-prefix=BOTH %s - -RUN: %build -n --verbose --arch=32 --mode=compile-and-link --compiler=any -o %t/foobar.exe foo.c bar.c \ -RUN: | FileCheck --check-prefix=BOTH-MULTI %s - - -COMPILE: compiling foobar.c -> foo.out - -COMPILE-MULTI: compiling foo.c -> foo.o{{(bj)?}} -COMPILE-MULTI: compiling bar.c -> bar.o{{(bj)?}} - - -LINK: linking foobar.obj -> foo.exe - -LINK-MULTI: linking foo.obj+bar.obj -> foobar.exe - -BOTH: compiling foobar.c -> [[OBJFOO:foobar.exe-foobar.o(bj)?]] -BOTH: linking [[OBJFOO]] -> foobar.exe - -BOTH-MULTI: compiling foo.c -> [[OBJFOO:foobar.exe-foo.o(bj)?]] -BOTH-MULTI: compiling bar.c -> [[OBJBAR:foobar.exe-bar.o(bj)?]] -BOTH-MULTI: linking [[OBJFOO]]+[[OBJBAR]] -> foobar.exe +RUN: %build -n --verbose --arch=32 --mode=compile --compiler=any -o %t/foo.out foobar.c \ +RUN: | FileCheck --check-prefix=COMPILE %s + +RUN: %build -n --verbose --arch=32 --mode=compile --compiler=any --outdir %t foo.c bar.c \ +RUN: | FileCheck --check-prefix=COMPILE-MULTI %s + +RUN: %build -n --verbose --arch=32 --mode=link --compiler=any -o %t/foo.exe foobar.obj \ +RUN: | FileCheck --check-prefix=LINK %s + +RUN: %build -n --verbose --arch=32 --mode=link --compiler=any -o %t/foobar.exe foo.obj bar.obj \ +RUN: | FileCheck --check-prefix=LINK-MULTI %s + +RUN: %build -n --verbose --arch=32 --mode=compile-and-link --compiler=any -o %t/foobar.exe foobar.c \ +RUN: | FileCheck --check-prefix=BOTH %s + +RUN: %build -n --verbose --arch=32 --mode=compile-and-link --compiler=any -o %t/foobar.exe foo.c bar.c \ +RUN: | FileCheck --check-prefix=BOTH-MULTI %s + + +COMPILE: compiling foobar.c -> foo.out + +COMPILE-MULTI: compiling foo.c -> foo.o{{(bj)?}} +COMPILE-MULTI: compiling bar.c -> bar.o{{(bj)?}} + + +LINK: linking foobar.obj -> foo.exe + +LINK-MULTI: linking foo.obj+bar.obj -> foobar.exe + +BOTH: compiling foobar.c -> [[OBJFOO:foobar.exe-foobar.o(bj)?]] +BOTH: linking [[OBJFOO]] -> foobar.exe + +BOTH-MULTI: compiling foo.c -> [[OBJFOO:foobar.exe-foo.o(bj)?]] +BOTH-MULTI: compiling bar.c -> [[OBJBAR:foobar.exe-bar.o(bj)?]] +BOTH-MULTI: linking [[OBJFOO]]+[[OBJBAR]] -> foobar.exe diff --git a/lldb/test/Shell/BuildScript/script-args.test b/lldb/test/Shell/BuildScript/script-args.test index 13e8a51609426735bee258eac2124f144c1cecbe..647a48e4442b1298eca2d11f9250e8ee4a44986a 100644 --- a/lldb/test/Shell/BuildScript/script-args.test +++ b/lldb/test/Shell/BuildScript/script-args.test @@ -1,32 +1,32 @@ -RUN: %build -n --verbose --arch=32 --mode=compile --compiler=any -o %t/foo.out foobar.c \ -RUN: | FileCheck %s -RUN: %build -n --verbose --arch=32 --mode=compile --compiler=any --outdir %t foo.c bar.c \ -RUN: | FileCheck --check-prefix=MULTI-INPUT %s - - -CHECK: Script Arguments: -CHECK-NEXT: Arch: 32 -CHECK: Compiler: any -CHECK: Outdir: {{.*}}script-args.test.tmp -CHECK: Output: {{.*}}script-args.test.tmp{{.}}foo.out -CHECK: Nodefaultlib: False -CHECK: Opt: none -CHECK: Mode: compile -CHECK: Clean: True -CHECK: Verbose: True -CHECK: Dryrun: True -CHECK: Inputs: foobar.c - -MULTI-INPUT: Script Arguments: -MULTI-INPUT-NEXT: Arch: 32 -MULTI-INPUT-NEXT: Compiler: any -MULTI-INPUT-NEXT: Outdir: {{.*}}script-args.test.tmp -MULTI-INPUT-NEXT: Output: -MULTI-INPUT-NEXT: Nodefaultlib: False -MULTI-INPUT-NEXT: Opt: none -MULTI-INPUT-NEXT: Mode: compile -MULTI-INPUT-NEXT: Clean: True -MULTI-INPUT-NEXT: Verbose: True -MULTI-INPUT-NEXT: Dryrun: True -MULTI-INPUT-NEXT: Inputs: foo.c -MULTI-INPUT-NEXT: 
bar.c +RUN: %build -n --verbose --arch=32 --mode=compile --compiler=any -o %t/foo.out foobar.c \ +RUN: | FileCheck %s +RUN: %build -n --verbose --arch=32 --mode=compile --compiler=any --outdir %t foo.c bar.c \ +RUN: | FileCheck --check-prefix=MULTI-INPUT %s + + +CHECK: Script Arguments: +CHECK-NEXT: Arch: 32 +CHECK: Compiler: any +CHECK: Outdir: {{.*}}script-args.test.tmp +CHECK: Output: {{.*}}script-args.test.tmp{{.}}foo.out +CHECK: Nodefaultlib: False +CHECK: Opt: none +CHECK: Mode: compile +CHECK: Clean: True +CHECK: Verbose: True +CHECK: Dryrun: True +CHECK: Inputs: foobar.c + +MULTI-INPUT: Script Arguments: +MULTI-INPUT-NEXT: Arch: 32 +MULTI-INPUT-NEXT: Compiler: any +MULTI-INPUT-NEXT: Outdir: {{.*}}script-args.test.tmp +MULTI-INPUT-NEXT: Output: +MULTI-INPUT-NEXT: Nodefaultlib: False +MULTI-INPUT-NEXT: Opt: none +MULTI-INPUT-NEXT: Mode: compile +MULTI-INPUT-NEXT: Clean: True +MULTI-INPUT-NEXT: Verbose: True +MULTI-INPUT-NEXT: Dryrun: True +MULTI-INPUT-NEXT: Inputs: foo.c +MULTI-INPUT-NEXT: bar.c diff --git a/lldb/test/Shell/BuildScript/toolchain-clang-cl.test b/lldb/test/Shell/BuildScript/toolchain-clang-cl.test index 8c9ea9fddb8a50a3922ca6bf274a7b8c147e8f2a..4f64859a02b60705b0339677b4bd0f1c1141fb2e 100644 --- a/lldb/test/Shell/BuildScript/toolchain-clang-cl.test +++ b/lldb/test/Shell/BuildScript/toolchain-clang-cl.test @@ -1,49 +1,49 @@ -REQUIRES: lld, system-windows - -RUN: %build -n --verbose --arch=32 --compiler=clang-cl --mode=compile-and-link -o %t/foo.exe foobar.c \ -RUN: | FileCheck --check-prefix=CHECK-32 %s - -RUN: %build -n --verbose --arch=64 --compiler=clang-cl --mode=compile-and-link -o %t/foo.exe foobar.c \ -RUN: | FileCheck --check-prefix=CHECK-64 %s - -CHECK-32: Script Arguments: -CHECK-32: Arch: 32 -CHECK-32: Compiler: clang-cl -CHECK-32: Outdir: {{.*}} -CHECK-32: Output: {{.*}}toolchain-clang-cl.test.tmp\foo.exe -CHECK-32: Nodefaultlib: False -CHECK-32: Opt: none -CHECK-32: Mode: compile -CHECK-32: Clean: True -CHECK-32: Verbose: True -CHECK-32: Dryrun: True -CHECK-32: Inputs: foobar.c -CHECK-32: Cleaning {{.*}}toolchain-clang-cl.test.tmp{{.}}foobar.ilk -CHECK-32: Cleaning {{.*}}toolchain-clang-cl.test.tmp{{.}}foo.exe-foobar.obj -CHECK-32: Cleaning {{.*}}toolchain-clang-cl.test.tmp{{.}}foo.pdb -CHECK-32: Cleaning {{.*}}toolchain-clang-cl.test.tmp{{.}}foo.exe -CHECK-32: compiling foobar.c -> foo.exe-foobar.obj -CHECK-32: {{.*}}clang-cl{{(\.EXE)?}} -m32 -CHECK-32: linking foo.exe-foobar.obj -> foo.exe -CHECK-32: {{.*}}lld-link{{(\.EXE)?}} - -CHECK-64: Script Arguments: -CHECK-64: Arch: 64 -CHECK-64: Compiler: clang-cl -CHECK-64: Outdir: {{.*}} -CHECK-64: Output: {{.*}}toolchain-clang-cl.test.tmp\foo.exe -CHECK-64: Nodefaultlib: False -CHECK-64: Opt: none -CHECK-64: Mode: compile -CHECK-64: Clean: True -CHECK-64: Verbose: True -CHECK-64: Dryrun: True -CHECK-64: Inputs: foobar.c -CHECK-64: Cleaning {{.*}}toolchain-clang-cl.test.tmp{{.}}foobar.ilk -CHECK-64: Cleaning {{.*}}toolchain-clang-cl.test.tmp{{.}}foo.exe-foobar.obj -CHECK-64: Cleaning {{.*}}toolchain-clang-cl.test.tmp{{.}}foo.pdb -CHECK-64: Cleaning {{.*}}toolchain-clang-cl.test.tmp{{.}}foo.exe -CHECK-64: compiling foobar.c -> foo.exe-foobar.obj -CHECK-64: {{.*}}clang-cl{{(\.EXE)?}} -m64 -CHECK-64: linking foo.exe-foobar.obj -> foo.exe -CHECK-64: {{.*}}lld-link{{(\.EXE)?}} +REQUIRES: lld, system-windows + +RUN: %build -n --verbose --arch=32 --compiler=clang-cl --mode=compile-and-link -o %t/foo.exe foobar.c \ +RUN: | FileCheck --check-prefix=CHECK-32 %s + +RUN: %build -n --verbose --arch=64 --compiler=clang-cl 
--mode=compile-and-link -o %t/foo.exe foobar.c \ +RUN: | FileCheck --check-prefix=CHECK-64 %s + +CHECK-32: Script Arguments: +CHECK-32: Arch: 32 +CHECK-32: Compiler: clang-cl +CHECK-32: Outdir: {{.*}} +CHECK-32: Output: {{.*}}toolchain-clang-cl.test.tmp\foo.exe +CHECK-32: Nodefaultlib: False +CHECK-32: Opt: none +CHECK-32: Mode: compile +CHECK-32: Clean: True +CHECK-32: Verbose: True +CHECK-32: Dryrun: True +CHECK-32: Inputs: foobar.c +CHECK-32: Cleaning {{.*}}toolchain-clang-cl.test.tmp{{.}}foobar.ilk +CHECK-32: Cleaning {{.*}}toolchain-clang-cl.test.tmp{{.}}foo.exe-foobar.obj +CHECK-32: Cleaning {{.*}}toolchain-clang-cl.test.tmp{{.}}foo.pdb +CHECK-32: Cleaning {{.*}}toolchain-clang-cl.test.tmp{{.}}foo.exe +CHECK-32: compiling foobar.c -> foo.exe-foobar.obj +CHECK-32: {{.*}}clang-cl{{(\.EXE)?}} -m32 +CHECK-32: linking foo.exe-foobar.obj -> foo.exe +CHECK-32: {{.*}}lld-link{{(\.EXE)?}} + +CHECK-64: Script Arguments: +CHECK-64: Arch: 64 +CHECK-64: Compiler: clang-cl +CHECK-64: Outdir: {{.*}} +CHECK-64: Output: {{.*}}toolchain-clang-cl.test.tmp\foo.exe +CHECK-64: Nodefaultlib: False +CHECK-64: Opt: none +CHECK-64: Mode: compile +CHECK-64: Clean: True +CHECK-64: Verbose: True +CHECK-64: Dryrun: True +CHECK-64: Inputs: foobar.c +CHECK-64: Cleaning {{.*}}toolchain-clang-cl.test.tmp{{.}}foobar.ilk +CHECK-64: Cleaning {{.*}}toolchain-clang-cl.test.tmp{{.}}foo.exe-foobar.obj +CHECK-64: Cleaning {{.*}}toolchain-clang-cl.test.tmp{{.}}foo.pdb +CHECK-64: Cleaning {{.*}}toolchain-clang-cl.test.tmp{{.}}foo.exe +CHECK-64: compiling foobar.c -> foo.exe-foobar.obj +CHECK-64: {{.*}}clang-cl{{(\.EXE)?}} -m64 +CHECK-64: linking foo.exe-foobar.obj -> foo.exe +CHECK-64: {{.*}}lld-link{{(\.EXE)?}} diff --git a/lldb/test/Shell/Commands/command-expr-diagnostics.test b/lldb/test/Shell/Commands/command-expr-diagnostics.test index b242dba1980f0ff003aee0a6806e2820e3f62a23..72df47bbbdc1edb050f29a7fc82e4cc387759d4d 100644 --- a/lldb/test/Shell/Commands/command-expr-diagnostics.test +++ b/lldb/test/Shell/Commands/command-expr-diagnostics.test @@ -1,3 +1,4 @@ +# XFAIL: system-windows # RUN: echo quit | %lldb -o "expression a+b" \ # RUN: | FileCheck %s --strict-whitespace --check-prefix=CHECK1 # (lldb) expression a+b diff --git a/lldb/test/Shell/Minidump/Windows/Sigsegv/Inputs/sigsegv.cpp b/lldb/test/Shell/Minidump/Windows/Sigsegv/Inputs/sigsegv.cpp index 6bf78b5dc43b29d6ec1b2ea7c109bb645e989e6c..d5b96472eb117f68d58f2b317a9394c84192e598 100644 --- a/lldb/test/Shell/Minidump/Windows/Sigsegv/Inputs/sigsegv.cpp +++ b/lldb/test/Shell/Minidump/Windows/Sigsegv/Inputs/sigsegv.cpp @@ -1,40 +1,40 @@ - -// nodefaultlib build: cl -Zi sigsegv.cpp /link /nodefaultlib - -#ifdef USE_CRT -#include -#else -int main(); -extern "C" -{ - int _fltused; - void mainCRTStartup() { main(); } - void printf(const char*, ...) {} -} -#endif - -void crash(bool crash_self) -{ - printf("Before...\n"); - if(crash_self) - { - printf("Crashing in 3, 2, 1 ...\n"); - *(volatile int*)nullptr = 0; - } - printf("After...\n"); -} - -int foo(int x, float y, const char* msg) -{ - bool flag = x > y; - if(flag) - printf("x = %d, y = %f, msg = %s\n", x, y, msg); - crash(flag); - return x << 1; -} - -int main() -{ - foo(10, 3.14, "testing"); -} - + +// nodefaultlib build: cl -Zi sigsegv.cpp /link /nodefaultlib + +#ifdef USE_CRT +#include +#else +int main(); +extern "C" +{ + int _fltused; + void mainCRTStartup() { main(); } + void printf(const char*, ...) 
{} +} +#endif + +void crash(bool crash_self) +{ + printf("Before...\n"); + if(crash_self) + { + printf("Crashing in 3, 2, 1 ...\n"); + *(volatile int*)nullptr = 0; + } + printf("After...\n"); +} + +int foo(int x, float y, const char* msg) +{ + bool flag = x > y; + if(flag) + printf("x = %d, y = %f, msg = %s\n", x, y, msg); + crash(flag); + return x << 1; +} + +int main() +{ + foo(10, 3.14, "testing"); +} + diff --git a/lldb/test/Shell/SymbolFile/DWARF/no_unique_address-alignment.cpp b/lldb/test/Shell/SymbolFile/DWARF/x86/no_unique_address-alignment.cpp similarity index 89% rename from lldb/test/Shell/SymbolFile/DWARF/no_unique_address-alignment.cpp rename to lldb/test/Shell/SymbolFile/DWARF/x86/no_unique_address-alignment.cpp index 1488199a3ad2d3197c2a3ffd846bc94ca38fe070..e198bf0cafeaacfc98810bee5fa1ed433a95bd18 100644 --- a/lldb/test/Shell/SymbolFile/DWARF/no_unique_address-alignment.cpp +++ b/lldb/test/Shell/SymbolFile/DWARF/x86/no_unique_address-alignment.cpp @@ -1,6 +1,6 @@ // XFAIL: * -// RUN: %clangxx_host -gdwarf -o %t %s +// RUN: %clang --target=x86_64-apple-macosx -c -gdwarf -o %t %s // RUN: %lldb %t \ // RUN: -o "expr alignof(OverlappingFields)" \ // RUN: -o "expr sizeof(OverlappingFields)" \ diff --git a/lldb/test/Shell/SymbolFile/DWARF/no_unique_address-base-alignment.cpp b/lldb/test/Shell/SymbolFile/DWARF/x86/no_unique_address-base-alignment.cpp similarity index 91% rename from lldb/test/Shell/SymbolFile/DWARF/no_unique_address-base-alignment.cpp rename to lldb/test/Shell/SymbolFile/DWARF/x86/no_unique_address-base-alignment.cpp index 15d8de0e3ee988b67f3c05737ed1d4a9357d0c5c..c4bcfc473277f6938bc4149fecd576fb204bdc27 100644 --- a/lldb/test/Shell/SymbolFile/DWARF/no_unique_address-base-alignment.cpp +++ b/lldb/test/Shell/SymbolFile/DWARF/x86/no_unique_address-base-alignment.cpp @@ -1,6 +1,6 @@ // XFAIL: * -// RUN: %clangxx_host -gdwarf -o %t %s +// RUN: %clang --target=x86_64-apple-macosx -c -gdwarf -o %t %s // RUN: %lldb %t \ // RUN: -o "expr alignof(OverlappingDerived)" \ // RUN: -o "expr sizeof(OverlappingDerived)" \ diff --git a/lldb/test/Shell/SymbolFile/NativePDB/Inputs/inline_sites.s b/lldb/test/Shell/SymbolFile/NativePDB/Inputs/inline_sites.s index aac8f4c169803842b652991b150de64fe970120e..a9d248758bfcec390ae8b92ba43d0d20b0d26a16 100644 --- a/lldb/test/Shell/SymbolFile/NativePDB/Inputs/inline_sites.s +++ b/lldb/test/Shell/SymbolFile/NativePDB/Inputs/inline_sites.s @@ -1,622 +1,622 @@ -# Compiled from the following files, but replaced the call to abort with nop. 
-# clang-cl -fuse-ld=lld-link /Z7 /O1 /Faa.asm /winsysroot~/win_toolchain a.cpp -# a.cpp: -# #include "a.h" -# int main(int argc, char** argv) { -# volatile int main_local = Namespace1::foo(2); -# return 0; -# } -# a.h: -# #include -# #include "b.h" -# namespace Namespace1 { -# inline int foo(int x) { -# volatile int foo_local = x + 1; -# ++foo_local; -# if (!foo_local) -# abort(); -# return Class1::bar(foo_local); -# } -# } // namespace Namespace1 -# b.h: -# #include "c.h" -# class Class1 { -# public: -# inline static int bar(int x) { -# volatile int bar_local = x + 1; -# ++bar_local; -# return Namespace2::Class2::func(bar_local); -# } -# }; -# c.h: -# namespace Namespace2 { -# class Class2 { -# public: -# inline static int func(int x) { -# volatile int func_local = x + 1; -# func_local += x; -# return func_local; -# } -# }; -# } // namespace Namespace2 - - .text - .def @feat.00; - .scl 3; - .type 0; - .endef - .globl @feat.00 -.set @feat.00, 0 - .intel_syntax noprefix - .file "a.cpp" - .def main; - .scl 2; - .type 32; - .endef - .section .text,"xr",one_only,main - .globl main # -- Begin function main -main: # @main -.Lfunc_begin0: - .cv_func_id 0 - .cv_file 1 "/tmp/a.cpp" "4FFB96E5DF1A95CE7DB9732CFFE001D7" 1 - .cv_loc 0 1 2 0 # a.cpp:2:0 -.seh_proc main -# %bb.0: - #DEBUG_VALUE: main:argv <- $rdx - #DEBUG_VALUE: main:argc <- $ecx - #DEBUG_VALUE: foo:x <- 2 - sub rsp, 56 - .seh_stackalloc 56 - .seh_endprologue -.Ltmp0: - .cv_file 2 "/tmp/./a.h" "BBFED90EF093E9C1D032CC9B05B5D167" 1 - .cv_inline_site_id 1 within 0 inlined_at 1 3 0 - .cv_loc 1 2 5 0 # ./a.h:5:0 - mov dword ptr [rsp + 44], 3 - .cv_loc 1 2 6 0 # ./a.h:6:0 - inc dword ptr [rsp + 44] - .cv_loc 1 2 7 0 # ./a.h:7:0 - mov eax, dword ptr [rsp + 44] - test eax, eax - je .LBB0_2 -.Ltmp1: -# %bb.1: - #DEBUG_VALUE: main:argv <- $rdx - #DEBUG_VALUE: main:argc <- $ecx - #DEBUG_VALUE: foo:x <- 2 - .cv_loc 1 2 9 0 # ./a.h:9:0 - mov eax, dword ptr [rsp + 44] -.Ltmp2: - #DEBUG_VALUE: bar:x <- $eax - .cv_file 3 "/tmp/./b.h" "A26CC743A260115F33AF91AB11F95877" 1 - .cv_inline_site_id 2 within 1 inlined_at 2 9 0 - .cv_loc 2 3 5 0 # ./b.h:5:0 - inc eax -.Ltmp3: - mov dword ptr [rsp + 52], eax - .cv_loc 2 3 6 0 # ./b.h:6:0 - inc dword ptr [rsp + 52] - .cv_loc 2 3 7 0 # ./b.h:7:0 - mov eax, dword ptr [rsp + 52] -.Ltmp4: - #DEBUG_VALUE: func:x <- $eax - .cv_file 4 "/tmp/./c.h" "8AF4613F78624BBE96D1C408ABA39B2D" 1 - .cv_inline_site_id 3 within 2 inlined_at 3 7 0 - .cv_loc 3 4 5 0 # ./c.h:5:0 - lea ecx, [rax + 1] -.Ltmp5: - #DEBUG_VALUE: main:argc <- [DW_OP_LLVM_entry_value 1] $ecx - mov dword ptr [rsp + 48], ecx - .cv_loc 3 4 6 0 # ./c.h:6:0 - add dword ptr [rsp + 48], eax - .cv_loc 3 4 7 0 # ./c.h:7:0 - mov eax, dword ptr [rsp + 48] -.Ltmp6: - .cv_loc 0 1 3 0 # a.cpp:3:0 - mov dword ptr [rsp + 48], eax - .cv_loc 0 1 4 0 # a.cpp:4:0 - xor eax, eax - # Use fake debug info to tests inline info. 
- .cv_loc 1 2 20 0 - add rsp, 56 - ret -.Ltmp7: -.LBB0_2: - #DEBUG_VALUE: main:argv <- $rdx - #DEBUG_VALUE: main:argc <- $ecx - #DEBUG_VALUE: foo:x <- 2 - .cv_loc 1 2 8 0 # ./a.h:8:0 - nop -.Ltmp8: - int3 -.Ltmp9: - #DEBUG_VALUE: main:argc <- [DW_OP_LLVM_entry_value 1] $ecx - #DEBUG_VALUE: main:argv <- [DW_OP_LLVM_entry_value 1] $rdx -.Lfunc_end0: - .seh_endproc - # -- End function - .section .drectve,"yn" - .ascii " /DEFAULTLIB:libcmt.lib" - .ascii " /DEFAULTLIB:oldnames.lib" - .section .debug$S,"dr" - .p2align 2 - .long 4 # Debug section magic - .long 241 - .long .Ltmp11-.Ltmp10 # Subsection size -.Ltmp10: - .short .Ltmp13-.Ltmp12 # Record length -.Ltmp12: - .short 4353 # Record kind: S_OBJNAME - .long 0 # Signature - .asciz "/tmp/a-2b2ba0.obj" # Object name - .p2align 2 -.Ltmp13: - .short .Ltmp15-.Ltmp14 # Record length -.Ltmp14: - .short 4412 # Record kind: S_COMPILE3 - .long 1 # Flags and language - .short 208 # CPUType - .short 15 # Frontend version - .short 0 - .short 0 - .short 0 - .short 15000 # Backend version - .short 0 - .short 0 - .short 0 - .asciz "clang version 15.0.0" # Null-terminated compiler version string - .p2align 2 -.Ltmp15: -.Ltmp11: - .p2align 2 - .long 246 # Inlinee lines subsection - .long .Ltmp17-.Ltmp16 # Subsection size -.Ltmp16: - .long 0 # Inlinee lines signature - - # Inlined function foo starts at ./a.h:4 - .long 4099 # Type index of inlined function - .cv_filechecksumoffset 2 # Offset into filechecksum table - .long 4 # Starting line number - - # Inlined function bar starts at ./b.h:4 - .long 4106 # Type index of inlined function - .cv_filechecksumoffset 3 # Offset into filechecksum table - .long 4 # Starting line number - - # Inlined function func starts at ./c.h:4 - .long 4113 # Type index of inlined function - .cv_filechecksumoffset 4 # Offset into filechecksum table - .long 4 # Starting line number -.Ltmp17: - .p2align 2 - .section .debug$S,"dr",associative,main - .p2align 2 - .long 4 # Debug section magic - .long 241 # Symbol subsection for main - .long .Ltmp19-.Ltmp18 # Subsection size -.Ltmp18: - .short .Ltmp21-.Ltmp20 # Record length -.Ltmp20: - .short 4423 # Record kind: S_GPROC32_ID - .long 0 # PtrParent - .long 0 # PtrEnd - .long 0 # PtrNext - .long .Lfunc_end0-main # Code size - .long 0 # Offset after prologue - .long 0 # Offset before epilogue - .long 4117 # Function type index - .secrel32 main # Function section relative address - .secidx main # Function section index - .byte 0 # Flags - .asciz "main" # Function name - .p2align 2 -.Ltmp21: - .short .Ltmp23-.Ltmp22 # Record length -.Ltmp22: - .short 4114 # Record kind: S_FRAMEPROC - .long 56 # FrameSize - .long 0 # Padding - .long 0 # Offset of padding - .long 0 # Bytes of callee saved registers - .long 0 # Exception handler offset - .short 0 # Exception handler section - .long 81920 # Flags (defines frame register) - .p2align 2 -.Ltmp23: - .short .Ltmp25-.Ltmp24 # Record length -.Ltmp24: - .short 4414 # Record kind: S_LOCAL - .long 116 # TypeIndex - .short 1 # Flags - .asciz "argc" - .p2align 2 -.Ltmp25: - .cv_def_range .Lfunc_begin0 .Ltmp5 .Ltmp7 .Ltmp8, reg, 18 - .short .Ltmp27-.Ltmp26 # Record length -.Ltmp26: - .short 4414 # Record kind: S_LOCAL - .long 4114 # TypeIndex - .short 1 # Flags - .asciz "argv" - .p2align 2 -.Ltmp27: - .cv_def_range .Lfunc_begin0 .Ltmp8, reg, 331 - .short .Ltmp29-.Ltmp28 # Record length -.Ltmp28: - .short 4414 # Record kind: S_LOCAL - .long 4118 # TypeIndex - .short 0 # Flags - .asciz "main_local" - .p2align 2 -.Ltmp29: - .cv_def_range .Ltmp0 .Ltmp9, 
frame_ptr_rel, 48 - .short .Ltmp31-.Ltmp30 # Record length -.Ltmp30: - .short 4429 # Record kind: S_INLINESITE - .long 0 # PtrParent - .long 0 # PtrEnd - .long 4099 # Inlinee type index - .cv_inline_linetable 1 2 4 .Lfunc_begin0 .Lfunc_end0 - .p2align 2 -.Ltmp31: - .short .Ltmp33-.Ltmp32 # Record length -.Ltmp32: - .short 4414 # Record kind: S_LOCAL - .long 116 # TypeIndex - .short 257 # Flags - .asciz "x" - .p2align 2 -.Ltmp33: - .short .Ltmp35-.Ltmp34 # Record length -.Ltmp34: - .short 4414 # Record kind: S_LOCAL - .long 4118 # TypeIndex - .short 0 # Flags - .asciz "foo_local" - .p2align 2 -.Ltmp35: - .cv_def_range .Ltmp0 .Ltmp6 .Ltmp7 .Ltmp9, frame_ptr_rel, 44 - .short .Ltmp37-.Ltmp36 # Record length -.Ltmp36: - .short 4429 # Record kind: S_INLINESITE - .long 0 # PtrParent - .long 0 # PtrEnd - .long 4106 # Inlinee type index - .cv_inline_linetable 2 3 4 .Lfunc_begin0 .Lfunc_end0 - .p2align 2 -.Ltmp37: - .short .Ltmp39-.Ltmp38 # Record length -.Ltmp38: - .short 4414 # Record kind: S_LOCAL - .long 116 # TypeIndex - .short 1 # Flags - .asciz "x" - .p2align 2 -.Ltmp39: - .cv_def_range .Ltmp2 .Ltmp3, reg, 17 - .short .Ltmp41-.Ltmp40 # Record length -.Ltmp40: - .short 4414 # Record kind: S_LOCAL - .long 4118 # TypeIndex - .short 0 # Flags - .asciz "bar_local" - .p2align 2 -.Ltmp41: - .cv_def_range .Ltmp2 .Ltmp6, frame_ptr_rel, 52 - .short .Ltmp43-.Ltmp42 # Record length -.Ltmp42: - .short 4429 # Record kind: S_INLINESITE - .long 0 # PtrParent - .long 0 # PtrEnd - .long 4113 # Inlinee type index - .cv_inline_linetable 3 4 4 .Lfunc_begin0 .Lfunc_end0 - .p2align 2 -.Ltmp43: - .short .Ltmp45-.Ltmp44 # Record length -.Ltmp44: - .short 4414 # Record kind: S_LOCAL - .long 116 # TypeIndex - .short 1 # Flags - .asciz "x" - .p2align 2 -.Ltmp45: - .cv_def_range .Ltmp4 .Ltmp6, reg, 17 - .short .Ltmp47-.Ltmp46 # Record length -.Ltmp46: - .short 4414 # Record kind: S_LOCAL - .long 4118 # TypeIndex - .short 0 # Flags - .asciz "func_local" - .p2align 2 -.Ltmp47: - .cv_def_range .Ltmp4 .Ltmp6, frame_ptr_rel, 48 - .short 2 # Record length - .short 4430 # Record kind: S_INLINESITE_END - .short 2 # Record length - .short 4430 # Record kind: S_INLINESITE_END - .short 2 # Record length - .short 4430 # Record kind: S_INLINESITE_END - .short 2 # Record length - .short 4431 # Record kind: S_PROC_ID_END -.Ltmp19: - .p2align 2 - .cv_linetable 0, main, .Lfunc_end0 - .section .debug$S,"dr" - .long 241 - .long .Ltmp49-.Ltmp48 # Subsection size -.Ltmp48: - .short .Ltmp51-.Ltmp50 # Record length -.Ltmp50: - .short 4360 # Record kind: S_UDT - .long 4103 # Type - .asciz "Class1" - .p2align 2 -.Ltmp51: - .short .Ltmp53-.Ltmp52 # Record length -.Ltmp52: - .short 4360 # Record kind: S_UDT - .long 4110 # Type - .asciz "Namespace2::Class2" - .p2align 2 -.Ltmp53: -.Ltmp49: - .p2align 2 - .cv_filechecksums # File index to string table offset subsection - .cv_stringtable # String table - .long 241 - .long .Ltmp55-.Ltmp54 # Subsection size -.Ltmp54: - .short .Ltmp57-.Ltmp56 # Record length -.Ltmp56: - .short 4428 # Record kind: S_BUILDINFO - .long 4124 # LF_BUILDINFO index - .p2align 2 -.Ltmp57: -.Ltmp55: - .p2align 2 - .section .debug$T,"dr" - .p2align 2 - .long 4 # Debug section magic - # StringId (0x1000) - .short 0x12 # Record length - .short 0x1605 # Record kind: LF_STRING_ID - .long 0x0 # Id - .asciz "Namespace1" # StringData - .byte 241 - # ArgList (0x1001) - .short 0xa # Record length - .short 0x1201 # Record kind: LF_ARGLIST - .long 0x1 # NumArgs - .long 0x74 # Argument: int - # Procedure (0x1002) - .short 0xe # Record length 
- .short 0x1008 # Record kind: LF_PROCEDURE - .long 0x74 # ReturnType: int - .byte 0x0 # CallingConvention: NearC - .byte 0x0 # FunctionOptions - .short 0x1 # NumParameters - .long 0x1001 # ArgListType: (int) - # FuncId (0x1003) - .short 0xe # Record length - .short 0x1601 # Record kind: LF_FUNC_ID - .long 0x1000 # ParentScope: Namespace1 - .long 0x1002 # FunctionType: int (int) - .asciz "foo" # Name - # Class (0x1004) - .short 0x2a # Record length - .short 0x1504 # Record kind: LF_CLASS - .short 0x0 # MemberCount - .short 0x280 # Properties ( ForwardReference (0x80) | HasUniqueName (0x200) ) - .long 0x0 # FieldList - .long 0x0 # DerivedFrom - .long 0x0 # VShape - .short 0x0 # SizeOf - .asciz "Class1" # Name - .asciz ".?AVClass1@@" # LinkageName - .byte 242 - .byte 241 - # MemberFunction (0x1005) - .short 0x1a # Record length - .short 0x1009 # Record kind: LF_MFUNCTION - .long 0x74 # ReturnType: int - .long 0x1004 # ClassType: Class1 - .long 0x0 # ThisType - .byte 0x0 # CallingConvention: NearC - .byte 0x0 # FunctionOptions - .short 0x1 # NumParameters - .long 0x1001 # ArgListType: (int) - .long 0x0 # ThisAdjustment - # FieldList (0x1006) - .short 0xe # Record length - .short 0x1203 # Record kind: LF_FIELDLIST - .short 0x1511 # Member kind: OneMethod ( LF_ONEMETHOD ) - .short 0xb # Attrs: Public, Static - .long 0x1005 # Type: int Class1::(int) - .asciz "bar" # Name - # Class (0x1007) - .short 0x2a # Record length - .short 0x1504 # Record kind: LF_CLASS - .short 0x1 # MemberCount - .short 0x200 # Properties ( HasUniqueName (0x200) ) - .long 0x1006 # FieldList: - .long 0x0 # DerivedFrom - .long 0x0 # VShape - .short 0x1 # SizeOf - .asciz "Class1" # Name - .asciz ".?AVClass1@@" # LinkageName - .byte 242 - .byte 241 - # StringId (0x1008) - .short 0x12 # Record length - .short 0x1605 # Record kind: LF_STRING_ID - .long 0x0 # Id - .asciz "/tmp/./b.h" # StringData - .byte 241 - # UdtSourceLine (0x1009) - .short 0xe # Record length - .short 0x1606 # Record kind: LF_UDT_SRC_LINE - .long 0x1007 # UDT: Class1 - .long 0x1008 # SourceFile: /tmp/./b.h - .long 0x2 # LineNumber - # MemberFuncId (0x100A) - .short 0xe # Record length - .short 0x1602 # Record kind: LF_MFUNC_ID - .long 0x1004 # ClassType: Class1 - .long 0x1005 # FunctionType: int Class1::(int) - .asciz "bar" # Name - # Class (0x100B) - .short 0x42 # Record length - .short 0x1504 # Record kind: LF_CLASS - .short 0x0 # MemberCount - .short 0x280 # Properties ( ForwardReference (0x80) | HasUniqueName (0x200) ) - .long 0x0 # FieldList - .long 0x0 # DerivedFrom - .long 0x0 # VShape - .short 0x0 # SizeOf - .asciz "Namespace2::Class2" # Name - .asciz ".?AVClass2@Namespace2@@" # LinkageName - .byte 243 - .byte 242 - .byte 241 - # MemberFunction (0x100C) - .short 0x1a # Record length - .short 0x1009 # Record kind: LF_MFUNCTION - .long 0x74 # ReturnType: int - .long 0x100b # ClassType: Namespace2::Class2 - .long 0x0 # ThisType - .byte 0x0 # CallingConvention: NearC - .byte 0x0 # FunctionOptions - .short 0x1 # NumParameters - .long 0x1001 # ArgListType: (int) - .long 0x0 # ThisAdjustment - # FieldList (0x100D) - .short 0x12 # Record length - .short 0x1203 # Record kind: LF_FIELDLIST - .short 0x1511 # Member kind: OneMethod ( LF_ONEMETHOD ) - .short 0xb # Attrs: Public, Static - .long 0x100c # Type: int Namespace2::Class2::(int) - .asciz "func" # Name - .byte 243 - .byte 242 - .byte 241 - # Class (0x100E) - .short 0x42 # Record length - .short 0x1504 # Record kind: LF_CLASS - .short 0x1 # MemberCount - .short 0x200 # Properties ( HasUniqueName (0x200) ) 
- .long 0x100d # FieldList: - .long 0x0 # DerivedFrom - .long 0x0 # VShape - .short 0x1 # SizeOf - .asciz "Namespace2::Class2" # Name - .asciz ".?AVClass2@Namespace2@@" # LinkageName - .byte 243 - .byte 242 - .byte 241 - # StringId (0x100F) - .short 0x12 # Record length - .short 0x1605 # Record kind: LF_STRING_ID - .long 0x0 # Id - .asciz "/tmp/./c.h" # StringData - .byte 241 - # UdtSourceLine (0x1010) - .short 0xe # Record length - .short 0x1606 # Record kind: LF_UDT_SRC_LINE - .long 0x100e # UDT: Namespace2::Class2 - .long 0x100f # SourceFile: /tmp/./c.h - .long 0x2 # LineNumber - # MemberFuncId (0x1011) - .short 0x12 # Record length - .short 0x1602 # Record kind: LF_MFUNC_ID - .long 0x100b # ClassType: Namespace2::Class2 - .long 0x100c # FunctionType: int Namespace2::Class2::(int) - .asciz "func" # Name - .byte 243 - .byte 242 - .byte 241 - # Pointer (0x1012) - .short 0xa # Record length - .short 0x1002 # Record kind: LF_POINTER - .long 0x670 # PointeeType: char* - .long 0x1000c # Attrs: [ Type: Near64, Mode: Pointer, SizeOf: 8 ] - # ArgList (0x1013) - .short 0xe # Record length - .short 0x1201 # Record kind: LF_ARGLIST - .long 0x2 # NumArgs - .long 0x74 # Argument: int - .long 0x1012 # Argument: char** - # Procedure (0x1014) - .short 0xe # Record length - .short 0x1008 # Record kind: LF_PROCEDURE - .long 0x74 # ReturnType: int - .byte 0x0 # CallingConvention: NearC - .byte 0x0 # FunctionOptions - .short 0x2 # NumParameters - .long 0x1013 # ArgListType: (int, char**) - # FuncId (0x1015) - .short 0x12 # Record length - .short 0x1601 # Record kind: LF_FUNC_ID - .long 0x0 # ParentScope - .long 0x1014 # FunctionType: int (int, char**) - .asciz "main" # Name - .byte 243 - .byte 242 - .byte 241 - # Modifier (0x1016) - .short 0xa # Record length - .short 0x1001 # Record kind: LF_MODIFIER - .long 0x74 # ModifiedType: int - .short 0x2 # Modifiers ( Volatile (0x2) ) - .byte 242 - .byte 241 - # StringId (0x1017) - .short 0xe # Record length - .short 0x1605 # Record kind: LF_STRING_ID - .long 0x0 # Id - .asciz "/tmp" # StringData - .byte 243 - .byte 242 - .byte 241 - # StringId (0x1018) - .short 0xe # Record length - .short 0x1605 # Record kind: LF_STRING_ID - .long 0x0 # Id - .asciz "a.cpp" # StringData - .byte 242 - .byte 241 - # StringId (0x1019) - .short 0xa # Record length - .short 0x1605 # Record kind: LF_STRING_ID - .long 0x0 # Id - .byte 0 # StringData - .byte 243 - .byte 242 - .byte 241 - # StringId (0x101A) - .short 0x4e # Record length - .short 0x1605 # Record kind: LF_STRING_ID - .long 0x0 # Id - .asciz "/usr/local/google/home/zequanwu/llvm-project/build/release/bin/clang" # StringData - .byte 243 - .byte 242 - .byte 241 - # StringId (0x101B) - .short 0x9f6 # Record length - .short 0x1605 # Record kind: LF_STRING_ID - .long 0x0 # Id - .asciz "\"-cc1\" \"-triple\" \"x86_64-pc-windows-msvc19.20.0\" \"-S\" \"-disable-free\" \"-clear-ast-before-backend\" \"-disable-llvm-verifier\" \"-discard-value-names\" \"-mrelocation-model\" \"pic\" \"-pic-level\" \"2\" \"-mframe-pointer=none\" \"-relaxed-aliasing\" \"-fmath-errno\" \"-ffp-contract=on\" \"-fno-rounding-math\" \"-mconstructor-aliases\" \"-funwind-tables=2\" \"-target-cpu\" \"x86-64\" \"-mllvm\" \"-x86-asm-syntax=intel\" \"-tune-cpu\" \"generic\" \"-mllvm\" \"-treat-scalable-fixed-error-as-warning\" \"-D_MT\" \"-flto-visibility-public-std\" \"--dependent-lib=libcmt\" \"--dependent-lib=oldnames\" \"-stack-protector\" \"2\" \"-fms-volatile\" \"-fdiagnostics-format\" \"msvc\" \"-gno-column-info\" \"-gcodeview\" 
\"-debug-info-kind=constructor\" \"-ffunction-sections\" \"-fcoverage-compilation-dir=/tmp\" \"-resource-dir\" \"/usr/local/google/home/zequanwu/llvm-project/build/release/lib/clang/15.0.0\" \"-internal-isystem\" \"/usr/local/google/home/zequanwu/llvm-project/build/release/lib/clang/15.0.0/include\" \"-internal-isystem\" \"/usr/local/google/home/zequanwu/chromium/src/third_party/depot_tools/win_toolchain/vs_files/3bda71a11e/DIA SDK/include\" \"-internal-isystem\" \"/usr/local/google/home/zequanwu/chromium/src/third_party/depot_tools/win_toolchain/vs_files/3bda71a11e/VC/Tools/MSVC/14.26.28801/include\" \"-internal-isystem\" \"/usr/local/google/home/zequanwu/chromium/src/third_party/depot_tools/win_toolchain/vs_files/3bda71a11e/VC/Tools/MSVC/14.26.28801/atlmfc/include\" \"-internal-isystem\" \"/usr/local/google/home/zequanwu/chromium/src/third_party/depot_tools/win_toolchain/vs_files/3bda71a11e/Windows Kits/10/Include/10.0.19041.0/ucrt\" \"-internal-isystem\" \"/usr/local/google/home/zequanwu/chromium/src/third_party/depot_tools/win_toolchain/vs_files/3bda71a11e/Windows Kits/10/Include/10.0.19041.0/shared\" \"-internal-isystem\" \"/usr/local/google/home/zequanwu/chromium/src/third_party/depot_tools/win_toolchain/vs_files/3bda71a11e/Windows Kits/10/Include/10.0.19041.0/um\" \"-internal-isystem\" \"/usr/local/google/home/zequanwu/chromium/src/third_party/depot_tools/win_toolchain/vs_files/3bda71a11e/Windows Kits/10/Include/10.0.19041.0/winrt\" \"-internal-isystem\" \"/usr/local/google/home/zequanwu/chromium/src/third_party/depot_tools/win_toolchain/vs_files/3bda71a11e/Windows Kits/10/Include/10.0.19041.0/cppwinrt\" \"-Os\" \"-fdeprecated-macro\" \"-fdebug-compilation-dir=/tmp\" \"-ferror-limit\" \"19\" \"-fno-use-cxa-atexit\" \"-fms-extensions\" \"-fms-compatibility\" \"-fms-compatibility-version=19.20\" \"-std=c++14\" \"-fdelayed-template-parsing\" \"-fcolor-diagnostics\" \"-vectorize-loops\" \"-vectorize-slp\" \"-faddrsig\" \"-x\" \"c++\"" # StringData - .byte 242 - .byte 241 - # BuildInfo (0x101C) - .short 0x1a # Record length - .short 0x1603 # Record kind: LF_BUILDINFO - .short 0x5 # NumArgs - .long 0x1017 # Argument: /tmp - .long 0x101a # Argument: /usr/local/google/home/zequanwu/llvm-project/build/release/bin/clang - .long 0x1018 # Argument: a.cpp - .long 0x1019 # Argument - .long 0x101b # Argument: "-cc1" "-triple" "x86_64-pc-windows-msvc19.20.0" "-S" "-disable-free" "-clear-ast-before-backend" "-disable-llvm-verifier" "-discard-value-names" "-mrelocation-model" "pic" "-pic-level" "2" "-mframe-pointer=none" "-relaxed-aliasing" "-fmath-errno" "-ffp-contract=on" "-fno-rounding-math" "-mconstructor-aliases" "-funwind-tables=2" "-target-cpu" "x86-64" "-mllvm" "-x86-asm-syntax=intel" "-tune-cpu" "generic" "-mllvm" "-treat-scalable-fixed-error-as-warning" "-D_MT" "-flto-visibility-public-std" "--dependent-lib=libcmt" "--dependent-lib=oldnames" "-stack-protector" "2" "-fms-volatile" "-fdiagnostics-format" "msvc" "-gno-column-info" "-gcodeview" "-debug-info-kind=constructor" "-ffunction-sections" "-fcoverage-compilation-dir=/tmp" "-resource-dir" "/usr/local/google/home/zequanwu/llvm-project/build/release/lib/clang/15.0.0" "-internal-isystem" "/usr/local/google/home/zequanwu/llvm-project/build/release/lib/clang/15.0.0/include" "-internal-isystem" "/usr/local/google/home/zequanwu/chromium/src/third_party/depot_tools/win_toolchain/vs_files/3bda71a11e/DIA SDK/include" "-internal-isystem" 
"/usr/local/google/home/zequanwu/chromium/src/third_party/depot_tools/win_toolchain/vs_files/3bda71a11e/VC/Tools/MSVC/14.26.28801/include" "-internal-isystem" "/usr/local/google/home/zequanwu/chromium/src/third_party/depot_tools/win_toolchain/vs_files/3bda71a11e/VC/Tools/MSVC/14.26.28801/atlmfc/include" "-internal-isystem" "/usr/local/google/home/zequanwu/chromium/src/third_party/depot_tools/win_toolchain/vs_files/3bda71a11e/Windows Kits/10/Include/10.0.19041.0/ucrt" "-internal-isystem" "/usr/local/google/home/zequanwu/chromium/src/third_party/depot_tools/win_toolchain/vs_files/3bda71a11e/Windows Kits/10/Include/10.0.19041.0/shared" "-internal-isystem" "/usr/local/google/home/zequanwu/chromium/src/third_party/depot_tools/win_toolchain/vs_files/3bda71a11e/Windows Kits/10/Include/10.0.19041.0/um" "-internal-isystem" "/usr/local/google/home/zequanwu/chromium/src/third_party/depot_tools/win_toolchain/vs_files/3bda71a11e/Windows Kits/10/Include/10.0.19041.0/winrt" "-internal-isystem" "/usr/local/google/home/zequanwu/chromium/src/third_party/depot_tools/win_toolchain/vs_files/3bda71a11e/Windows Kits/10/Include/10.0.19041.0/cppwinrt" "-Os" "-fdeprecated-macro" "-fdebug-compilation-dir=/tmp" "-ferror-limit" "19" "-fno-use-cxa-atexit" "-fms-extensions" "-fms-compatibility" "-fms-compatibility-version=19.20" "-std=c++14" "-fdelayed-template-parsing" "-fcolor-diagnostics" "-vectorize-loops" "-vectorize-slp" "-faddrsig" "-x" "c++" - .byte 242 - .byte 241 - .addrsig +# Compiled from the following files, but replaced the call to abort with nop. +# clang-cl -fuse-ld=lld-link /Z7 /O1 /Faa.asm /winsysroot~/win_toolchain a.cpp +# a.cpp: +# #include "a.h" +# int main(int argc, char** argv) { +# volatile int main_local = Namespace1::foo(2); +# return 0; +# } +# a.h: +# #include +# #include "b.h" +# namespace Namespace1 { +# inline int foo(int x) { +# volatile int foo_local = x + 1; +# ++foo_local; +# if (!foo_local) +# abort(); +# return Class1::bar(foo_local); +# } +# } // namespace Namespace1 +# b.h: +# #include "c.h" +# class Class1 { +# public: +# inline static int bar(int x) { +# volatile int bar_local = x + 1; +# ++bar_local; +# return Namespace2::Class2::func(bar_local); +# } +# }; +# c.h: +# namespace Namespace2 { +# class Class2 { +# public: +# inline static int func(int x) { +# volatile int func_local = x + 1; +# func_local += x; +# return func_local; +# } +# }; +# } // namespace Namespace2 + + .text + .def @feat.00; + .scl 3; + .type 0; + .endef + .globl @feat.00 +.set @feat.00, 0 + .intel_syntax noprefix + .file "a.cpp" + .def main; + .scl 2; + .type 32; + .endef + .section .text,"xr",one_only,main + .globl main # -- Begin function main +main: # @main +.Lfunc_begin0: + .cv_func_id 0 + .cv_file 1 "/tmp/a.cpp" "4FFB96E5DF1A95CE7DB9732CFFE001D7" 1 + .cv_loc 0 1 2 0 # a.cpp:2:0 +.seh_proc main +# %bb.0: + #DEBUG_VALUE: main:argv <- $rdx + #DEBUG_VALUE: main:argc <- $ecx + #DEBUG_VALUE: foo:x <- 2 + sub rsp, 56 + .seh_stackalloc 56 + .seh_endprologue +.Ltmp0: + .cv_file 2 "/tmp/./a.h" "BBFED90EF093E9C1D032CC9B05B5D167" 1 + .cv_inline_site_id 1 within 0 inlined_at 1 3 0 + .cv_loc 1 2 5 0 # ./a.h:5:0 + mov dword ptr [rsp + 44], 3 + .cv_loc 1 2 6 0 # ./a.h:6:0 + inc dword ptr [rsp + 44] + .cv_loc 1 2 7 0 # ./a.h:7:0 + mov eax, dword ptr [rsp + 44] + test eax, eax + je .LBB0_2 +.Ltmp1: +# %bb.1: + #DEBUG_VALUE: main:argv <- $rdx + #DEBUG_VALUE: main:argc <- $ecx + #DEBUG_VALUE: foo:x <- 2 + .cv_loc 1 2 9 0 # ./a.h:9:0 + mov eax, dword ptr [rsp + 44] +.Ltmp2: + #DEBUG_VALUE: bar:x <- $eax + .cv_file 3 
"/tmp/./b.h" "A26CC743A260115F33AF91AB11F95877" 1 + .cv_inline_site_id 2 within 1 inlined_at 2 9 0 + .cv_loc 2 3 5 0 # ./b.h:5:0 + inc eax +.Ltmp3: + mov dword ptr [rsp + 52], eax + .cv_loc 2 3 6 0 # ./b.h:6:0 + inc dword ptr [rsp + 52] + .cv_loc 2 3 7 0 # ./b.h:7:0 + mov eax, dword ptr [rsp + 52] +.Ltmp4: + #DEBUG_VALUE: func:x <- $eax + .cv_file 4 "/tmp/./c.h" "8AF4613F78624BBE96D1C408ABA39B2D" 1 + .cv_inline_site_id 3 within 2 inlined_at 3 7 0 + .cv_loc 3 4 5 0 # ./c.h:5:0 + lea ecx, [rax + 1] +.Ltmp5: + #DEBUG_VALUE: main:argc <- [DW_OP_LLVM_entry_value 1] $ecx + mov dword ptr [rsp + 48], ecx + .cv_loc 3 4 6 0 # ./c.h:6:0 + add dword ptr [rsp + 48], eax + .cv_loc 3 4 7 0 # ./c.h:7:0 + mov eax, dword ptr [rsp + 48] +.Ltmp6: + .cv_loc 0 1 3 0 # a.cpp:3:0 + mov dword ptr [rsp + 48], eax + .cv_loc 0 1 4 0 # a.cpp:4:0 + xor eax, eax + # Use fake debug info to tests inline info. + .cv_loc 1 2 20 0 + add rsp, 56 + ret +.Ltmp7: +.LBB0_2: + #DEBUG_VALUE: main:argv <- $rdx + #DEBUG_VALUE: main:argc <- $ecx + #DEBUG_VALUE: foo:x <- 2 + .cv_loc 1 2 8 0 # ./a.h:8:0 + nop +.Ltmp8: + int3 +.Ltmp9: + #DEBUG_VALUE: main:argc <- [DW_OP_LLVM_entry_value 1] $ecx + #DEBUG_VALUE: main:argv <- [DW_OP_LLVM_entry_value 1] $rdx +.Lfunc_end0: + .seh_endproc + # -- End function + .section .drectve,"yn" + .ascii " /DEFAULTLIB:libcmt.lib" + .ascii " /DEFAULTLIB:oldnames.lib" + .section .debug$S,"dr" + .p2align 2 + .long 4 # Debug section magic + .long 241 + .long .Ltmp11-.Ltmp10 # Subsection size +.Ltmp10: + .short .Ltmp13-.Ltmp12 # Record length +.Ltmp12: + .short 4353 # Record kind: S_OBJNAME + .long 0 # Signature + .asciz "/tmp/a-2b2ba0.obj" # Object name + .p2align 2 +.Ltmp13: + .short .Ltmp15-.Ltmp14 # Record length +.Ltmp14: + .short 4412 # Record kind: S_COMPILE3 + .long 1 # Flags and language + .short 208 # CPUType + .short 15 # Frontend version + .short 0 + .short 0 + .short 0 + .short 15000 # Backend version + .short 0 + .short 0 + .short 0 + .asciz "clang version 15.0.0" # Null-terminated compiler version string + .p2align 2 +.Ltmp15: +.Ltmp11: + .p2align 2 + .long 246 # Inlinee lines subsection + .long .Ltmp17-.Ltmp16 # Subsection size +.Ltmp16: + .long 0 # Inlinee lines signature + + # Inlined function foo starts at ./a.h:4 + .long 4099 # Type index of inlined function + .cv_filechecksumoffset 2 # Offset into filechecksum table + .long 4 # Starting line number + + # Inlined function bar starts at ./b.h:4 + .long 4106 # Type index of inlined function + .cv_filechecksumoffset 3 # Offset into filechecksum table + .long 4 # Starting line number + + # Inlined function func starts at ./c.h:4 + .long 4113 # Type index of inlined function + .cv_filechecksumoffset 4 # Offset into filechecksum table + .long 4 # Starting line number +.Ltmp17: + .p2align 2 + .section .debug$S,"dr",associative,main + .p2align 2 + .long 4 # Debug section magic + .long 241 # Symbol subsection for main + .long .Ltmp19-.Ltmp18 # Subsection size +.Ltmp18: + .short .Ltmp21-.Ltmp20 # Record length +.Ltmp20: + .short 4423 # Record kind: S_GPROC32_ID + .long 0 # PtrParent + .long 0 # PtrEnd + .long 0 # PtrNext + .long .Lfunc_end0-main # Code size + .long 0 # Offset after prologue + .long 0 # Offset before epilogue + .long 4117 # Function type index + .secrel32 main # Function section relative address + .secidx main # Function section index + .byte 0 # Flags + .asciz "main" # Function name + .p2align 2 +.Ltmp21: + .short .Ltmp23-.Ltmp22 # Record length +.Ltmp22: + .short 4114 # Record kind: S_FRAMEPROC + .long 56 # FrameSize + .long 0 # 
Padding + .long 0 # Offset of padding + .long 0 # Bytes of callee saved registers + .long 0 # Exception handler offset + .short 0 # Exception handler section + .long 81920 # Flags (defines frame register) + .p2align 2 +.Ltmp23: + .short .Ltmp25-.Ltmp24 # Record length +.Ltmp24: + .short 4414 # Record kind: S_LOCAL + .long 116 # TypeIndex + .short 1 # Flags + .asciz "argc" + .p2align 2 +.Ltmp25: + .cv_def_range .Lfunc_begin0 .Ltmp5 .Ltmp7 .Ltmp8, reg, 18 + .short .Ltmp27-.Ltmp26 # Record length +.Ltmp26: + .short 4414 # Record kind: S_LOCAL + .long 4114 # TypeIndex + .short 1 # Flags + .asciz "argv" + .p2align 2 +.Ltmp27: + .cv_def_range .Lfunc_begin0 .Ltmp8, reg, 331 + .short .Ltmp29-.Ltmp28 # Record length +.Ltmp28: + .short 4414 # Record kind: S_LOCAL + .long 4118 # TypeIndex + .short 0 # Flags + .asciz "main_local" + .p2align 2 +.Ltmp29: + .cv_def_range .Ltmp0 .Ltmp9, frame_ptr_rel, 48 + .short .Ltmp31-.Ltmp30 # Record length +.Ltmp30: + .short 4429 # Record kind: S_INLINESITE + .long 0 # PtrParent + .long 0 # PtrEnd + .long 4099 # Inlinee type index + .cv_inline_linetable 1 2 4 .Lfunc_begin0 .Lfunc_end0 + .p2align 2 +.Ltmp31: + .short .Ltmp33-.Ltmp32 # Record length +.Ltmp32: + .short 4414 # Record kind: S_LOCAL + .long 116 # TypeIndex + .short 257 # Flags + .asciz "x" + .p2align 2 +.Ltmp33: + .short .Ltmp35-.Ltmp34 # Record length +.Ltmp34: + .short 4414 # Record kind: S_LOCAL + .long 4118 # TypeIndex + .short 0 # Flags + .asciz "foo_local" + .p2align 2 +.Ltmp35: + .cv_def_range .Ltmp0 .Ltmp6 .Ltmp7 .Ltmp9, frame_ptr_rel, 44 + .short .Ltmp37-.Ltmp36 # Record length +.Ltmp36: + .short 4429 # Record kind: S_INLINESITE + .long 0 # PtrParent + .long 0 # PtrEnd + .long 4106 # Inlinee type index + .cv_inline_linetable 2 3 4 .Lfunc_begin0 .Lfunc_end0 + .p2align 2 +.Ltmp37: + .short .Ltmp39-.Ltmp38 # Record length +.Ltmp38: + .short 4414 # Record kind: S_LOCAL + .long 116 # TypeIndex + .short 1 # Flags + .asciz "x" + .p2align 2 +.Ltmp39: + .cv_def_range .Ltmp2 .Ltmp3, reg, 17 + .short .Ltmp41-.Ltmp40 # Record length +.Ltmp40: + .short 4414 # Record kind: S_LOCAL + .long 4118 # TypeIndex + .short 0 # Flags + .asciz "bar_local" + .p2align 2 +.Ltmp41: + .cv_def_range .Ltmp2 .Ltmp6, frame_ptr_rel, 52 + .short .Ltmp43-.Ltmp42 # Record length +.Ltmp42: + .short 4429 # Record kind: S_INLINESITE + .long 0 # PtrParent + .long 0 # PtrEnd + .long 4113 # Inlinee type index + .cv_inline_linetable 3 4 4 .Lfunc_begin0 .Lfunc_end0 + .p2align 2 +.Ltmp43: + .short .Ltmp45-.Ltmp44 # Record length +.Ltmp44: + .short 4414 # Record kind: S_LOCAL + .long 116 # TypeIndex + .short 1 # Flags + .asciz "x" + .p2align 2 +.Ltmp45: + .cv_def_range .Ltmp4 .Ltmp6, reg, 17 + .short .Ltmp47-.Ltmp46 # Record length +.Ltmp46: + .short 4414 # Record kind: S_LOCAL + .long 4118 # TypeIndex + .short 0 # Flags + .asciz "func_local" + .p2align 2 +.Ltmp47: + .cv_def_range .Ltmp4 .Ltmp6, frame_ptr_rel, 48 + .short 2 # Record length + .short 4430 # Record kind: S_INLINESITE_END + .short 2 # Record length + .short 4430 # Record kind: S_INLINESITE_END + .short 2 # Record length + .short 4430 # Record kind: S_INLINESITE_END + .short 2 # Record length + .short 4431 # Record kind: S_PROC_ID_END +.Ltmp19: + .p2align 2 + .cv_linetable 0, main, .Lfunc_end0 + .section .debug$S,"dr" + .long 241 + .long .Ltmp49-.Ltmp48 # Subsection size +.Ltmp48: + .short .Ltmp51-.Ltmp50 # Record length +.Ltmp50: + .short 4360 # Record kind: S_UDT + .long 4103 # Type + .asciz "Class1" + .p2align 2 +.Ltmp51: + .short .Ltmp53-.Ltmp52 # Record length +.Ltmp52: + 
.short 4360 # Record kind: S_UDT + .long 4110 # Type + .asciz "Namespace2::Class2" + .p2align 2 +.Ltmp53: +.Ltmp49: + .p2align 2 + .cv_filechecksums # File index to string table offset subsection + .cv_stringtable # String table + .long 241 + .long .Ltmp55-.Ltmp54 # Subsection size +.Ltmp54: + .short .Ltmp57-.Ltmp56 # Record length +.Ltmp56: + .short 4428 # Record kind: S_BUILDINFO + .long 4124 # LF_BUILDINFO index + .p2align 2 +.Ltmp57: +.Ltmp55: + .p2align 2 + .section .debug$T,"dr" + .p2align 2 + .long 4 # Debug section magic + # StringId (0x1000) + .short 0x12 # Record length + .short 0x1605 # Record kind: LF_STRING_ID + .long 0x0 # Id + .asciz "Namespace1" # StringData + .byte 241 + # ArgList (0x1001) + .short 0xa # Record length + .short 0x1201 # Record kind: LF_ARGLIST + .long 0x1 # NumArgs + .long 0x74 # Argument: int + # Procedure (0x1002) + .short 0xe # Record length + .short 0x1008 # Record kind: LF_PROCEDURE + .long 0x74 # ReturnType: int + .byte 0x0 # CallingConvention: NearC + .byte 0x0 # FunctionOptions + .short 0x1 # NumParameters + .long 0x1001 # ArgListType: (int) + # FuncId (0x1003) + .short 0xe # Record length + .short 0x1601 # Record kind: LF_FUNC_ID + .long 0x1000 # ParentScope: Namespace1 + .long 0x1002 # FunctionType: int (int) + .asciz "foo" # Name + # Class (0x1004) + .short 0x2a # Record length + .short 0x1504 # Record kind: LF_CLASS + .short 0x0 # MemberCount + .short 0x280 # Properties ( ForwardReference (0x80) | HasUniqueName (0x200) ) + .long 0x0 # FieldList + .long 0x0 # DerivedFrom + .long 0x0 # VShape + .short 0x0 # SizeOf + .asciz "Class1" # Name + .asciz ".?AVClass1@@" # LinkageName + .byte 242 + .byte 241 + # MemberFunction (0x1005) + .short 0x1a # Record length + .short 0x1009 # Record kind: LF_MFUNCTION + .long 0x74 # ReturnType: int + .long 0x1004 # ClassType: Class1 + .long 0x0 # ThisType + .byte 0x0 # CallingConvention: NearC + .byte 0x0 # FunctionOptions + .short 0x1 # NumParameters + .long 0x1001 # ArgListType: (int) + .long 0x0 # ThisAdjustment + # FieldList (0x1006) + .short 0xe # Record length + .short 0x1203 # Record kind: LF_FIELDLIST + .short 0x1511 # Member kind: OneMethod ( LF_ONEMETHOD ) + .short 0xb # Attrs: Public, Static + .long 0x1005 # Type: int Class1::(int) + .asciz "bar" # Name + # Class (0x1007) + .short 0x2a # Record length + .short 0x1504 # Record kind: LF_CLASS + .short 0x1 # MemberCount + .short 0x200 # Properties ( HasUniqueName (0x200) ) + .long 0x1006 # FieldList: + .long 0x0 # DerivedFrom + .long 0x0 # VShape + .short 0x1 # SizeOf + .asciz "Class1" # Name + .asciz ".?AVClass1@@" # LinkageName + .byte 242 + .byte 241 + # StringId (0x1008) + .short 0x12 # Record length + .short 0x1605 # Record kind: LF_STRING_ID + .long 0x0 # Id + .asciz "/tmp/./b.h" # StringData + .byte 241 + # UdtSourceLine (0x1009) + .short 0xe # Record length + .short 0x1606 # Record kind: LF_UDT_SRC_LINE + .long 0x1007 # UDT: Class1 + .long 0x1008 # SourceFile: /tmp/./b.h + .long 0x2 # LineNumber + # MemberFuncId (0x100A) + .short 0xe # Record length + .short 0x1602 # Record kind: LF_MFUNC_ID + .long 0x1004 # ClassType: Class1 + .long 0x1005 # FunctionType: int Class1::(int) + .asciz "bar" # Name + # Class (0x100B) + .short 0x42 # Record length + .short 0x1504 # Record kind: LF_CLASS + .short 0x0 # MemberCount + .short 0x280 # Properties ( ForwardReference (0x80) | HasUniqueName (0x200) ) + .long 0x0 # FieldList + .long 0x0 # DerivedFrom + .long 0x0 # VShape + .short 0x0 # SizeOf + .asciz "Namespace2::Class2" # Name + .asciz ".?AVClass2@Namespace2@@" 
# LinkageName + .byte 243 + .byte 242 + .byte 241 + # MemberFunction (0x100C) + .short 0x1a # Record length + .short 0x1009 # Record kind: LF_MFUNCTION + .long 0x74 # ReturnType: int + .long 0x100b # ClassType: Namespace2::Class2 + .long 0x0 # ThisType + .byte 0x0 # CallingConvention: NearC + .byte 0x0 # FunctionOptions + .short 0x1 # NumParameters + .long 0x1001 # ArgListType: (int) + .long 0x0 # ThisAdjustment + # FieldList (0x100D) + .short 0x12 # Record length + .short 0x1203 # Record kind: LF_FIELDLIST + .short 0x1511 # Member kind: OneMethod ( LF_ONEMETHOD ) + .short 0xb # Attrs: Public, Static + .long 0x100c # Type: int Namespace2::Class2::(int) + .asciz "func" # Name + .byte 243 + .byte 242 + .byte 241 + # Class (0x100E) + .short 0x42 # Record length + .short 0x1504 # Record kind: LF_CLASS + .short 0x1 # MemberCount + .short 0x200 # Properties ( HasUniqueName (0x200) ) + .long 0x100d # FieldList: + .long 0x0 # DerivedFrom + .long 0x0 # VShape + .short 0x1 # SizeOf + .asciz "Namespace2::Class2" # Name + .asciz ".?AVClass2@Namespace2@@" # LinkageName + .byte 243 + .byte 242 + .byte 241 + # StringId (0x100F) + .short 0x12 # Record length + .short 0x1605 # Record kind: LF_STRING_ID + .long 0x0 # Id + .asciz "/tmp/./c.h" # StringData + .byte 241 + # UdtSourceLine (0x1010) + .short 0xe # Record length + .short 0x1606 # Record kind: LF_UDT_SRC_LINE + .long 0x100e # UDT: Namespace2::Class2 + .long 0x100f # SourceFile: /tmp/./c.h + .long 0x2 # LineNumber + # MemberFuncId (0x1011) + .short 0x12 # Record length + .short 0x1602 # Record kind: LF_MFUNC_ID + .long 0x100b # ClassType: Namespace2::Class2 + .long 0x100c # FunctionType: int Namespace2::Class2::(int) + .asciz "func" # Name + .byte 243 + .byte 242 + .byte 241 + # Pointer (0x1012) + .short 0xa # Record length + .short 0x1002 # Record kind: LF_POINTER + .long 0x670 # PointeeType: char* + .long 0x1000c # Attrs: [ Type: Near64, Mode: Pointer, SizeOf: 8 ] + # ArgList (0x1013) + .short 0xe # Record length + .short 0x1201 # Record kind: LF_ARGLIST + .long 0x2 # NumArgs + .long 0x74 # Argument: int + .long 0x1012 # Argument: char** + # Procedure (0x1014) + .short 0xe # Record length + .short 0x1008 # Record kind: LF_PROCEDURE + .long 0x74 # ReturnType: int + .byte 0x0 # CallingConvention: NearC + .byte 0x0 # FunctionOptions + .short 0x2 # NumParameters + .long 0x1013 # ArgListType: (int, char**) + # FuncId (0x1015) + .short 0x12 # Record length + .short 0x1601 # Record kind: LF_FUNC_ID + .long 0x0 # ParentScope + .long 0x1014 # FunctionType: int (int, char**) + .asciz "main" # Name + .byte 243 + .byte 242 + .byte 241 + # Modifier (0x1016) + .short 0xa # Record length + .short 0x1001 # Record kind: LF_MODIFIER + .long 0x74 # ModifiedType: int + .short 0x2 # Modifiers ( Volatile (0x2) ) + .byte 242 + .byte 241 + # StringId (0x1017) + .short 0xe # Record length + .short 0x1605 # Record kind: LF_STRING_ID + .long 0x0 # Id + .asciz "/tmp" # StringData + .byte 243 + .byte 242 + .byte 241 + # StringId (0x1018) + .short 0xe # Record length + .short 0x1605 # Record kind: LF_STRING_ID + .long 0x0 # Id + .asciz "a.cpp" # StringData + .byte 242 + .byte 241 + # StringId (0x1019) + .short 0xa # Record length + .short 0x1605 # Record kind: LF_STRING_ID + .long 0x0 # Id + .byte 0 # StringData + .byte 243 + .byte 242 + .byte 241 + # StringId (0x101A) + .short 0x4e # Record length + .short 0x1605 # Record kind: LF_STRING_ID + .long 0x0 # Id + .asciz "/usr/local/google/home/zequanwu/llvm-project/build/release/bin/clang" # StringData + .byte 243 + .byte 242 + 
.byte 241 + # StringId (0x101B) + .short 0x9f6 # Record length + .short 0x1605 # Record kind: LF_STRING_ID + .long 0x0 # Id + .asciz "\"-cc1\" \"-triple\" \"x86_64-pc-windows-msvc19.20.0\" \"-S\" \"-disable-free\" \"-clear-ast-before-backend\" \"-disable-llvm-verifier\" \"-discard-value-names\" \"-mrelocation-model\" \"pic\" \"-pic-level\" \"2\" \"-mframe-pointer=none\" \"-relaxed-aliasing\" \"-fmath-errno\" \"-ffp-contract=on\" \"-fno-rounding-math\" \"-mconstructor-aliases\" \"-funwind-tables=2\" \"-target-cpu\" \"x86-64\" \"-mllvm\" \"-x86-asm-syntax=intel\" \"-tune-cpu\" \"generic\" \"-mllvm\" \"-treat-scalable-fixed-error-as-warning\" \"-D_MT\" \"-flto-visibility-public-std\" \"--dependent-lib=libcmt\" \"--dependent-lib=oldnames\" \"-stack-protector\" \"2\" \"-fms-volatile\" \"-fdiagnostics-format\" \"msvc\" \"-gno-column-info\" \"-gcodeview\" \"-debug-info-kind=constructor\" \"-ffunction-sections\" \"-fcoverage-compilation-dir=/tmp\" \"-resource-dir\" \"/usr/local/google/home/zequanwu/llvm-project/build/release/lib/clang/15.0.0\" \"-internal-isystem\" \"/usr/local/google/home/zequanwu/llvm-project/build/release/lib/clang/15.0.0/include\" \"-internal-isystem\" \"/usr/local/google/home/zequanwu/chromium/src/third_party/depot_tools/win_toolchain/vs_files/3bda71a11e/DIA SDK/include\" \"-internal-isystem\" \"/usr/local/google/home/zequanwu/chromium/src/third_party/depot_tools/win_toolchain/vs_files/3bda71a11e/VC/Tools/MSVC/14.26.28801/include\" \"-internal-isystem\" \"/usr/local/google/home/zequanwu/chromium/src/third_party/depot_tools/win_toolchain/vs_files/3bda71a11e/VC/Tools/MSVC/14.26.28801/atlmfc/include\" \"-internal-isystem\" \"/usr/local/google/home/zequanwu/chromium/src/third_party/depot_tools/win_toolchain/vs_files/3bda71a11e/Windows Kits/10/Include/10.0.19041.0/ucrt\" \"-internal-isystem\" \"/usr/local/google/home/zequanwu/chromium/src/third_party/depot_tools/win_toolchain/vs_files/3bda71a11e/Windows Kits/10/Include/10.0.19041.0/shared\" \"-internal-isystem\" \"/usr/local/google/home/zequanwu/chromium/src/third_party/depot_tools/win_toolchain/vs_files/3bda71a11e/Windows Kits/10/Include/10.0.19041.0/um\" \"-internal-isystem\" \"/usr/local/google/home/zequanwu/chromium/src/third_party/depot_tools/win_toolchain/vs_files/3bda71a11e/Windows Kits/10/Include/10.0.19041.0/winrt\" \"-internal-isystem\" \"/usr/local/google/home/zequanwu/chromium/src/third_party/depot_tools/win_toolchain/vs_files/3bda71a11e/Windows Kits/10/Include/10.0.19041.0/cppwinrt\" \"-Os\" \"-fdeprecated-macro\" \"-fdebug-compilation-dir=/tmp\" \"-ferror-limit\" \"19\" \"-fno-use-cxa-atexit\" \"-fms-extensions\" \"-fms-compatibility\" \"-fms-compatibility-version=19.20\" \"-std=c++14\" \"-fdelayed-template-parsing\" \"-fcolor-diagnostics\" \"-vectorize-loops\" \"-vectorize-slp\" \"-faddrsig\" \"-x\" \"c++\"" # StringData + .byte 242 + .byte 241 + # BuildInfo (0x101C) + .short 0x1a # Record length + .short 0x1603 # Record kind: LF_BUILDINFO + .short 0x5 # NumArgs + .long 0x1017 # Argument: /tmp + .long 0x101a # Argument: /usr/local/google/home/zequanwu/llvm-project/build/release/bin/clang + .long 0x1018 # Argument: a.cpp + .long 0x1019 # Argument + .long 0x101b # Argument: "-cc1" "-triple" "x86_64-pc-windows-msvc19.20.0" "-S" "-disable-free" "-clear-ast-before-backend" "-disable-llvm-verifier" "-discard-value-names" "-mrelocation-model" "pic" "-pic-level" "2" "-mframe-pointer=none" "-relaxed-aliasing" "-fmath-errno" "-ffp-contract=on" "-fno-rounding-math" "-mconstructor-aliases" "-funwind-tables=2" "-target-cpu" 
"x86-64" "-mllvm" "-x86-asm-syntax=intel" "-tune-cpu" "generic" "-mllvm" "-treat-scalable-fixed-error-as-warning" "-D_MT" "-flto-visibility-public-std" "--dependent-lib=libcmt" "--dependent-lib=oldnames" "-stack-protector" "2" "-fms-volatile" "-fdiagnostics-format" "msvc" "-gno-column-info" "-gcodeview" "-debug-info-kind=constructor" "-ffunction-sections" "-fcoverage-compilation-dir=/tmp" "-resource-dir" "/usr/local/google/home/zequanwu/llvm-project/build/release/lib/clang/15.0.0" "-internal-isystem" "/usr/local/google/home/zequanwu/llvm-project/build/release/lib/clang/15.0.0/include" "-internal-isystem" "/usr/local/google/home/zequanwu/chromium/src/third_party/depot_tools/win_toolchain/vs_files/3bda71a11e/DIA SDK/include" "-internal-isystem" "/usr/local/google/home/zequanwu/chromium/src/third_party/depot_tools/win_toolchain/vs_files/3bda71a11e/VC/Tools/MSVC/14.26.28801/include" "-internal-isystem" "/usr/local/google/home/zequanwu/chromium/src/third_party/depot_tools/win_toolchain/vs_files/3bda71a11e/VC/Tools/MSVC/14.26.28801/atlmfc/include" "-internal-isystem" "/usr/local/google/home/zequanwu/chromium/src/third_party/depot_tools/win_toolchain/vs_files/3bda71a11e/Windows Kits/10/Include/10.0.19041.0/ucrt" "-internal-isystem" "/usr/local/google/home/zequanwu/chromium/src/third_party/depot_tools/win_toolchain/vs_files/3bda71a11e/Windows Kits/10/Include/10.0.19041.0/shared" "-internal-isystem" "/usr/local/google/home/zequanwu/chromium/src/third_party/depot_tools/win_toolchain/vs_files/3bda71a11e/Windows Kits/10/Include/10.0.19041.0/um" "-internal-isystem" "/usr/local/google/home/zequanwu/chromium/src/third_party/depot_tools/win_toolchain/vs_files/3bda71a11e/Windows Kits/10/Include/10.0.19041.0/winrt" "-internal-isystem" "/usr/local/google/home/zequanwu/chromium/src/third_party/depot_tools/win_toolchain/vs_files/3bda71a11e/Windows Kits/10/Include/10.0.19041.0/cppwinrt" "-Os" "-fdeprecated-macro" "-fdebug-compilation-dir=/tmp" "-ferror-limit" "19" "-fno-use-cxa-atexit" "-fms-extensions" "-fms-compatibility" "-fms-compatibility-version=19.20" "-std=c++14" "-fdelayed-template-parsing" "-fcolor-diagnostics" "-vectorize-loops" "-vectorize-slp" "-faddrsig" "-x" "c++" + .byte 242 + .byte 241 + .addrsig diff --git a/lldb/test/Shell/SymbolFile/NativePDB/Inputs/inline_sites_live.lldbinit b/lldb/test/Shell/SymbolFile/NativePDB/Inputs/inline_sites_live.lldbinit index 2291c7c4527175c867b3c6298f10b591a1f1633b..eab5061dafbdcda19b364ff0cb82df858ad31c38 100644 --- a/lldb/test/Shell/SymbolFile/NativePDB/Inputs/inline_sites_live.lldbinit +++ b/lldb/test/Shell/SymbolFile/NativePDB/Inputs/inline_sites_live.lldbinit @@ -1,7 +1,7 @@ -br set -p BP_bar -f inline_sites_live.cpp -br set -p BP_foo -f inline_sites_live.cpp -run -expression param -continue -expression param -expression local +br set -p BP_bar -f inline_sites_live.cpp +br set -p BP_foo -f inline_sites_live.cpp +run +expression param +continue +expression param +expression local diff --git a/lldb/test/Shell/SymbolFile/NativePDB/Inputs/local-variables-registers.lldbinit b/lldb/test/Shell/SymbolFile/NativePDB/Inputs/local-variables-registers.lldbinit index ad080da24dab71d0a673c8b9f0f50c4d154f73c1..feda74856757920085aef6385117332b9d9e0f07 100644 --- a/lldb/test/Shell/SymbolFile/NativePDB/Inputs/local-variables-registers.lldbinit +++ b/lldb/test/Shell/SymbolFile/NativePDB/Inputs/local-variables-registers.lldbinit @@ -1,35 +1,35 @@ -image lookup -a 0x140001000 -v -image lookup -a 0x140001003 -v -image lookup -a 0x140001006 -v - -image lookup -a 0x140001011 -v 
-image lookup -a 0x140001017 -v -image lookup -a 0x140001019 -v -image lookup -a 0x14000101e -v -image lookup -a 0x14000102c -v - -image lookup -a 0x140001031 -v -image lookup -a 0x140001032 -v -image lookup -a 0x140001033 -v -image lookup -a 0x140001034 -v -image lookup -a 0x140001035 -v -image lookup -a 0x140001036 -v -image lookup -a 0x140001037 -v -image lookup -a 0x14000103b -v -image lookup -a 0x14000103d -v -image lookup -a 0x14000103f -v -image lookup -a 0x140001041 -v -image lookup -a 0x140001043 -v -image lookup -a 0x140001045 -v -image lookup -a 0x140001046 -v -image lookup -a 0x140001047 -v -image lookup -a 0x140001048 -v -image lookup -a 0x140001049 -v -image lookup -a 0x14000104a -v -image lookup -a 0x14000104b -v -image lookup -a 0x14000104c -v -image lookup -a 0x14000104e -v -image lookup -a 0x14000104f -v -image lookup -a 0x140001050 -v -image lookup -a 0x140001051 -v -exit +image lookup -a 0x140001000 -v +image lookup -a 0x140001003 -v +image lookup -a 0x140001006 -v + +image lookup -a 0x140001011 -v +image lookup -a 0x140001017 -v +image lookup -a 0x140001019 -v +image lookup -a 0x14000101e -v +image lookup -a 0x14000102c -v + +image lookup -a 0x140001031 -v +image lookup -a 0x140001032 -v +image lookup -a 0x140001033 -v +image lookup -a 0x140001034 -v +image lookup -a 0x140001035 -v +image lookup -a 0x140001036 -v +image lookup -a 0x140001037 -v +image lookup -a 0x14000103b -v +image lookup -a 0x14000103d -v +image lookup -a 0x14000103f -v +image lookup -a 0x140001041 -v +image lookup -a 0x140001043 -v +image lookup -a 0x140001045 -v +image lookup -a 0x140001046 -v +image lookup -a 0x140001047 -v +image lookup -a 0x140001048 -v +image lookup -a 0x140001049 -v +image lookup -a 0x14000104a -v +image lookup -a 0x14000104b -v +image lookup -a 0x14000104c -v +image lookup -a 0x14000104e -v +image lookup -a 0x14000104f -v +image lookup -a 0x140001050 -v +image lookup -a 0x140001051 -v +exit diff --git a/lldb/test/Shell/SymbolFile/NativePDB/Inputs/lookup-by-types.lldbinit b/lldb/test/Shell/SymbolFile/NativePDB/Inputs/lookup-by-types.lldbinit index afe3f2c8b943e398897353ebd513887ff2b4f831..3f639eb2e539bc93f43f7e248e9c318a31d8d433 100644 --- a/lldb/test/Shell/SymbolFile/NativePDB/Inputs/lookup-by-types.lldbinit +++ b/lldb/test/Shell/SymbolFile/NativePDB/Inputs/lookup-by-types.lldbinit @@ -1,4 +1,4 @@ -image lookup -type A -image lookup -type B - +image lookup -type A +image lookup -type B + quit \ No newline at end of file diff --git a/lldb/test/Shell/SymbolFile/NativePDB/Inputs/subfield_register_simple_type.lldbinit b/lldb/test/Shell/SymbolFile/NativePDB/Inputs/subfield_register_simple_type.lldbinit index 3dc33fd789dac0e86fc2a876214e9acf6d74a89e..32758f1fbc51f396ffa3c92f51b0c1e41d754e8d 100644 --- a/lldb/test/Shell/SymbolFile/NativePDB/Inputs/subfield_register_simple_type.lldbinit +++ b/lldb/test/Shell/SymbolFile/NativePDB/Inputs/subfield_register_simple_type.lldbinit @@ -1,2 +1,2 @@ -image lookup -a 0x40102f -v -quit +image lookup -a 0x40102f -v +quit diff --git a/lldb/test/Shell/SymbolFile/NativePDB/function-types-classes.cpp b/lldb/test/Shell/SymbolFile/NativePDB/function-types-classes.cpp index ca2a84de7698a4c41cb098ff4265485ace86c811..f0fac90e5065a129ace391fb46e03bf1c6456834 100644 --- a/lldb/test/Shell/SymbolFile/NativePDB/function-types-classes.cpp +++ b/lldb/test/Shell/SymbolFile/NativePDB/function-types-classes.cpp @@ -113,9 +113,9 @@ auto incomplete = &three; // CHECK: |-CXXRecordDecl {{.*}} union U // CHECK: |-EnumDecl {{.*}} E // CHECK: |-CXXRecordDecl {{.*}} struct 
S -// CHECK: |-VarDecl {{.*}} a 'S (*)(C *, U &, E &&)' -// CHECK: |-VarDecl {{.*}} b 'E (*)(const S *, const C &, const U &&)' -// CHECK: |-VarDecl {{.*}} c 'U (*)(volatile E *, volatile S &, volatile C &&)' +// CHECK: |-VarDecl {{.*}} a 'S (*)(C *, U &, E &&)' +// CHECK: |-VarDecl {{.*}} b 'E (*)(const S *, const C &, const U &&)' +// CHECK: |-VarDecl {{.*}} c 'U (*)(volatile E *, volatile S &, volatile C &&)' // CHECK: |-VarDecl {{.*}} d 'C (*)(const volatile U *, const volatile E &, const volatile S &&)' // CHECK: |-CXXRecordDecl {{.*}} struct B // CHECK: | `-CXXRecordDecl {{.*}} struct A @@ -125,14 +125,14 @@ auto incomplete = &three; // CHECK: | | `-CXXRecordDecl {{.*}} struct S // CHECK: | `-NamespaceDecl {{.*}} B // CHECK: | `-CXXRecordDecl {{.*}} struct S -// CHECK: |-VarDecl {{.*}} e 'A::B::S *(*)(B::A::S *, A::C::S &)' -// CHECK: |-VarDecl {{.*}} f 'A::C::S &(*)(A::B::S *, B::A::S *)' +// CHECK: |-VarDecl {{.*}} e 'A::B::S *(*)(B::A::S *, A::C::S &)' +// CHECK: |-VarDecl {{.*}} f 'A::C::S &(*)(A::B::S *, B::A::S *)' // CHECK: |-VarDecl {{.*}} g 'B::A::S *(*)(A::C::S &, A::B::S *)' // CHECK: |-CXXRecordDecl {{.*}} struct TC // CHECK: |-CXXRecordDecl {{.*}} struct TC> // CHECK: |-CXXRecordDecl {{.*}} struct TC // CHECK: |-CXXRecordDecl {{.*}} struct TC -// CHECK: |-VarDecl {{.*}} h 'TC (*)(TC, TC>, TC)' +// CHECK: |-VarDecl {{.*}} h 'TC (*)(TC, TC>, TC)' // CHECK: |-VarDecl {{.*}} i 'A::B::S (*)()' // CHECK: |-CXXRecordDecl {{.*}} struct Incomplete // CHECK: `-VarDecl {{.*}} incomplete 'Incomplete *(*)(Incomplete **, const Incomplete *)' diff --git a/lldb/test/Shell/SymbolFile/NativePDB/inline_sites_live.cpp b/lldb/test/Shell/SymbolFile/NativePDB/inline_sites_live.cpp index 767149ea18c468f87d46775e1bc7f37bef3c5151..402982726965809181df7c0415796d3d84116597 100644 --- a/lldb/test/Shell/SymbolFile/NativePDB/inline_sites_live.cpp +++ b/lldb/test/Shell/SymbolFile/NativePDB/inline_sites_live.cpp @@ -1,34 +1,34 @@ -// clang-format off -// REQUIRES: system-windows - -// RUN: %build -o %t.exe -- %s -// RUN: env LLDB_USE_NATIVE_PDB_READER=1 %lldb -f %t.exe -s \ -// RUN: %p/Inputs/inline_sites_live.lldbinit 2>&1 | FileCheck %s - -void use(int) {} - -void __attribute__((always_inline)) bar(int param) { - use(param); // BP_bar -} - -void __attribute__((always_inline)) foo(int param) { - int local = param+1; - bar(local); - use(param); - use(local); // BP_foo -} - -int main(int argc, char** argv) { - foo(argc); -} - -// CHECK: * thread #1, stop reason = breakpoint 1 -// CHECK-NEXT: frame #0: {{.*}}`main [inlined] bar(param=2) -// CHECK: (lldb) expression param -// CHECK-NEXT: (int) $0 = 2 -// CHECK: * thread #1, stop reason = breakpoint 2 -// CHECK-NEXT: frame #0: {{.*}}`main [inlined] foo(param=1) -// CHECK: (lldb) expression param -// CHECK-NEXT: (int) $1 = 1 -// CHECK-NEXT: (lldb) expression local -// CHECK-NEXT: (int) $2 = 2 +// clang-format off +// REQUIRES: system-windows + +// RUN: %build -o %t.exe -- %s +// RUN: env LLDB_USE_NATIVE_PDB_READER=1 %lldb -f %t.exe -s \ +// RUN: %p/Inputs/inline_sites_live.lldbinit 2>&1 | FileCheck %s + +void use(int) {} + +void __attribute__((always_inline)) bar(int param) { + use(param); // BP_bar +} + +void __attribute__((always_inline)) foo(int param) { + int local = param+1; + bar(local); + use(param); + use(local); // BP_foo +} + +int main(int argc, char** argv) { + foo(argc); +} + +// CHECK: * thread #1, stop reason = breakpoint 1 +// CHECK-NEXT: frame #0: {{.*}}`main [inlined] bar(param=2) +// CHECK: (lldb) expression param +// CHECK-NEXT: (int) $0 = 
2 +// CHECK: * thread #1, stop reason = breakpoint 2 +// CHECK-NEXT: frame #0: {{.*}}`main [inlined] foo(param=1) +// CHECK: (lldb) expression param +// CHECK-NEXT: (int) $1 = 1 +// CHECK-NEXT: (lldb) expression local +// CHECK-NEXT: (int) $2 = 2 diff --git a/lldb/test/Shell/SymbolFile/NativePDB/lookup-by-types.cpp b/lldb/test/Shell/SymbolFile/NativePDB/lookup-by-types.cpp index f3aea8115f3858c56ba032b9b64940b86e6b5494..cd5bbfc30fa0e16944f0f8222ed6a636d4f81013 100644 --- a/lldb/test/Shell/SymbolFile/NativePDB/lookup-by-types.cpp +++ b/lldb/test/Shell/SymbolFile/NativePDB/lookup-by-types.cpp @@ -1,46 +1,46 @@ -// clang-format off - -// RUN: %build -o %t.exe -- %s -// RUN: env LLDB_USE_NATIVE_PDB_READER=1 %lldb -f %t.exe -s \ -// RUN: %p/Inputs/lookup-by-types.lldbinit 2>&1 | FileCheck %s - -class B; -class A { -public: - static const A constA; - static A a; - static B b; - int val = 1; -}; -class B { -public: - static A a; - int val = 2; -}; -A varA; -B varB; -const A A::constA = varA; -A A::a = varA; -B A::b = varB; -A B::a = varA; - -int main(int argc, char **argv) { - return varA.val + varB.val; -} - -// CHECK: image lookup -type A -// CHECK-NEXT: 1 match found in {{.*}}.exe -// CHECK-NEXT: compiler_type = "class A { -// CHECK-NEXT: static const A constA; -// CHECK-NEXT: static A a; -// CHECK-NEXT: static B b; -// CHECK-NEXT: public: -// CHECK-NEXT: int val; -// CHECK-NEXT: }" -// CHECK: image lookup -type B -// CHECK-NEXT: 1 match found in {{.*}}.exe -// CHECK-NEXT: compiler_type = "class B { -// CHECK-NEXT: static A a; -// CHECK-NEXT: public: -// CHECK-NEXT: int val; -// CHECK-NEXT: }" +// clang-format off + +// RUN: %build -o %t.exe -- %s +// RUN: env LLDB_USE_NATIVE_PDB_READER=1 %lldb -f %t.exe -s \ +// RUN: %p/Inputs/lookup-by-types.lldbinit 2>&1 | FileCheck %s + +class B; +class A { +public: + static const A constA; + static A a; + static B b; + int val = 1; +}; +class B { +public: + static A a; + int val = 2; +}; +A varA; +B varB; +const A A::constA = varA; +A A::a = varA; +B A::b = varB; +A B::a = varA; + +int main(int argc, char **argv) { + return varA.val + varB.val; +} + +// CHECK: image lookup -type A +// CHECK-NEXT: 1 match found in {{.*}}.exe +// CHECK-NEXT: compiler_type = "class A { +// CHECK-NEXT: static const A constA; +// CHECK-NEXT: static A a; +// CHECK-NEXT: static B b; +// CHECK-NEXT: public: +// CHECK-NEXT: int val; +// CHECK-NEXT: }" +// CHECK: image lookup -type B +// CHECK-NEXT: 1 match found in {{.*}}.exe +// CHECK-NEXT: compiler_type = "class B { +// CHECK-NEXT: static A a; +// CHECK-NEXT: public: +// CHECK-NEXT: int val; +// CHECK-NEXT: }" diff --git a/lldb/tools/debugserver/source/RNBRemote.cpp b/lldb/tools/debugserver/source/RNBRemote.cpp index f22d626c4af2c6c81ecddf295a3eda092a6bfeab..07211c6e9db492e91e263219d8c8b49d668de9e7 100644 --- a/lldb/tools/debugserver/source/RNBRemote.cpp +++ b/lldb/tools/debugserver/source/RNBRemote.cpp @@ -176,9 +176,6 @@ void append_hexified_string(std::ostream &ostrm, const std::string &string) { } } -extern void ASLLogCallback(void *baton, uint32_t flags, const char *format, - va_list args); - // from System.framework/Versions/B/PrivateHeaders/sys/codesign.h extern "C" { #define CS_OPS_STATUS 0 /* return status */ @@ -1773,8 +1770,6 @@ static std::string get_value(std::string &line) { extern void FileLogCallback(void *baton, uint32_t flags, const char *format, va_list args); -extern void ASLLogCallback(void *baton, uint32_t flags, const char *format, - va_list args); rnb_err_t RNBRemote::HandlePacket_qRcmd(const char *p) { 
const char *c = p + strlen("qRcmd,"); @@ -1809,8 +1804,8 @@ rnb_err_t RNBRemote::HandlePacket_qRcmd(const char *p) { static_cast<uint32_t>(strtoul(value.c_str(), &end, 0)); if (errno == 0 && end && *end == '\0') { DNBLogSetLogMask(logmask); - if (!DNBLogGetLogCallback()) - DNBLogSetLogCallback(ASLLogCallback, NULL); + if (auto log_callback = OsLogger::GetLogFunction()) + DNBLogSetLogCallback(log_callback, nullptr); return SendPacket("OK"); } errno = 0; @@ -2177,13 +2172,8 @@ rnb_err_t set_logging(const char *p) { // Enable DNB logging. // Use the existing log callback if one was already configured. if (!DNBLogGetLogCallback()) { - // Use the os_log()-based logger if available; otherwise, - // fallback to ASL. - auto log_callback = OsLogger::GetLogFunction(); - if (log_callback) + if (auto log_callback = OsLogger::GetLogFunction()) DNBLogSetLogCallback(log_callback, nullptr); - else - DNBLogSetLogCallback(ASLLogCallback, nullptr); } // Update logging to use the configured log channel bitmask. diff --git a/lldb/tools/debugserver/source/libdebugserver.cpp b/lldb/tools/debugserver/source/libdebugserver.cpp index 6da3708b4240b73fa9991b4ed2c149844f41ef90..17a5c137c1afb1b7bc60d3f6d1706d46c161dc93 100644 --- a/lldb/tools/debugserver/source/libdebugserver.cpp +++ b/lldb/tools/debugserver/source/libdebugserver.cpp @@ -311,13 +311,6 @@ RNBRunLoopMode RNBRunLoopInferiorExecuting(RNBRemoteSP &remote) { return mode; } -void ASLLogCallback(void *baton, uint32_t flags, const char *format, - va_list args) { -#if 0 - vprintf(format, args); -#endif -} - extern "C" int debug_server_main(int fd) { #if 1 g_isatty = 0; @@ -327,7 +320,6 @@ extern "C" int debug_server_main(int fd) { DNBLogSetDebug(1); DNBLogSetVerbose(1); DNBLogSetLogMask(-1); - DNBLogSetLogCallback(ASLLogCallback, NULL); #endif signal(SIGPIPE, signal_handler); diff --git a/lldb/tools/lldb-dap/DAP.cpp b/lldb/tools/lldb-dap/DAP.cpp index 5e75d84cf8243e70afe2abd8e69906d4e0e32f4a..68559e382006db8113f97d5fab65c820330d6f89 100644 --- a/lldb/tools/lldb-dap/DAP.cpp +++ b/lldb/tools/lldb-dap/DAP.cpp @@ -866,42 +866,35 @@ int64_t Variables::InsertVariable(lldb::SBValue variable, bool is_permanent) { bool StartDebuggingRequestHandler::DoExecute( lldb::SBDebugger debugger, char **command, lldb::SBCommandReturnObject &result) { - // Command format like: `startDebugging <launch|attach> <configuration>` + // Command format like: `start-debugging <launch|attach> <configuration>` if (!command) { - result.SetError("Invalid use of startDebugging"); - result.SetStatus(lldb::eReturnStatusFailed); + result.SetError("Invalid use of start-debugging, expected format " + "`start-debugging <launch|attach> <configuration>`."); return false; } if (!command[0] || llvm::StringRef(command[0]).empty()) { - result.SetError("startDebugging request type missing."); - result.SetStatus(lldb::eReturnStatusFailed); + result.SetError("start-debugging request type missing."); return false; } if (!command[1] || llvm::StringRef(command[1]).empty()) { - result.SetError("configuration missing."); - result.SetStatus(lldb::eReturnStatusFailed); + result.SetError("start-debugging debug configuration missing."); return false; } llvm::StringRef request{command[0]}; std::string raw_configuration{command[1]}; - int i = 2; - while (command[i]) { - raw_configuration.append(" ").append(command[i]); - } - llvm::Expected<llvm::json::Value> configuration = llvm::json::parse(raw_configuration); if (!configuration) { llvm::Error err = configuration.takeError(); - std::string msg = - "Failed to parse json configuration: " + llvm::toString(std::move(err)); + std::string msg = "Failed to parse json configuration: " +
llvm::toString(std::move(err)) + "\n\n" + + raw_configuration; result.SetError(msg.c_str()); - result.SetStatus(lldb::eReturnStatusFailed); return false; } @@ -969,6 +962,68 @@ bool ReplModeRequestHandler::DoExecute(lldb::SBDebugger debugger, return true; } +// Sends a DAP event with an optional body. +// +// See +// https://code.visualstudio.com/api/references/vscode-api#debug.onDidReceiveDebugSessionCustomEvent +bool SendEventRequestHandler::DoExecute(lldb::SBDebugger debugger, + char **command, + lldb::SBCommandReturnObject &result) { + // Command format like: `send-event <name> <body>?` + if (!command || !command[0] || llvm::StringRef(command[0]).empty()) { + result.SetError("Not enough arguments found, expected format " + "`lldb-dap send-event <name> <body>?`."); + return false; + } + + llvm::StringRef name{command[0]}; + // Events that are stateful and should be handled by lldb-dap internally. + const std::array<llvm::StringRef, 11> internal_events{"breakpoint", "capabilities", "continued", + "exited", "initialize", "loadedSource", + "module", "process", "stopped", + "terminated", "thread"}; + if (std::find(internal_events.begin(), internal_events.end(), name) != + std::end(internal_events)) { + std::string msg = + llvm::formatv("Invalid use of lldb-dap send-event, event \"{0}\" " + "should be handled by lldb-dap internally.", + name) + .str(); + result.SetError(msg.c_str()); + return false; + } + + llvm::json::Object event(CreateEventObject(name)); + + if (command[1] && !llvm::StringRef(command[1]).empty()) { + // See if we have unused arguments. + if (command[2]) { + result.SetError( + "Additional arguments found, expected `lldb-dap send-event " + "<name> <body>?`."); + return false; + } + + llvm::StringRef raw_body{command[1]}; + + llvm::Expected<llvm::json::Value> body = llvm::json::parse(raw_body); + + if (!body) { + llvm::Error err = body.takeError(); + std::string msg = "Failed to parse custom event body: " + + llvm::toString(std::move(err)); + result.SetError(msg.c_str()); + return false; + } + + event.try_emplace("body", std::move(*body)); + } + + g_dap.SendJSON(llvm::json::Value(std::move(event))); + result.SetStatus(lldb::eReturnStatusSuccessFinishNoResult); + return true; +} + void DAP::SetFrameFormat(llvm::StringRef format) { if (format.empty()) return; diff --git a/lldb/tools/lldb-dap/DAP.h b/lldb/tools/lldb-dap/DAP.h index ba6d3d80410e3d084e12c18c43a78e9c4e4b41f1..acc10ade75fd1470a755c05423b72820a33f2d1a 100644 --- a/lldb/tools/lldb-dap/DAP.h +++ b/lldb/tools/lldb-dap/DAP.h @@ -144,6 +144,11 @@ struct ReplModeRequestHandler : public lldb::SBCommandPluginInterface { lldb::SBCommandReturnObject &result) override; }; +struct SendEventRequestHandler : public lldb::SBCommandPluginInterface { + bool DoExecute(lldb::SBDebugger debugger, char **command, + lldb::SBCommandReturnObject &result) override; +}; + struct DAP { std::string debug_adaptor_path; InputStream input; diff --git a/lldb/tools/lldb-dap/README.md b/lldb/tools/lldb-dap/README.md index 3a7d82e887cca3f6a6b0514541e3f3700ecaa5b0..42b5f501e32c65bd4410863b885e0ef2f5a29367 100644 --- a/lldb/tools/lldb-dap/README.md +++ b/lldb/tools/lldb-dap/README.md @@ -244,9 +244,9 @@ The escape character can be adjusted via the `commandEscapePrefix` configuration The `lldb-dap` tool includes additional custom commands to support the Debug Adapter Protocol features.
-#### `lldb-dap startDebugging` +#### `lldb-dap start-debugging` -Using the command `lldb-dap startDebugging` it is possible to trigger a +Using the command `lldb-dap start-debugging` it is possible to trigger a reverse request to the client requesting a child debug session with the specified configuration. For example, this can be used to attach to forked or spawned processes. For more information see @@ -255,7 +255,7 @@ spawned processes. For more information see The custom command has the following format: ``` -lldb-dap startDebugging <launch|attach> <configuration> +lldb-dap start-debugging <launch|attach> <configuration> ``` This will launch a server and then request a child debug session for a client. @@ -264,7 +264,7 @@ This will launch a server and then request a child debug session for a client. { "program": "server", "postRunCommand": [ - "lldb-dap startDebugging launch '{\"program\":\"client\"}'" + "lldb-dap start-debugging launch '{\"program\":\"client\"}'" ] } ``` @@ -290,6 +290,37 @@ The initial repl-mode can be configured with the cli flag `--repl-mode=<mode>` and may also be adjusted at runtime using the lldb command `lldb-dap repl-mode <mode>`. +#### `lldb-dap send-event` + +lldb-dap includes a command to trigger a Debug Adapter Protocol event +from a script. + +The event may be a custom DAP event or a standard event, if the event is not +handled internally by `lldb-dap`. + +This command has the format: + +``` +lldb-dap send-event <name> <body>? +``` + +For example, you can use a launch configuration hook to trigger custom events like: + +```json +{ + "program": "exe", + "stopCommands": [ + "lldb-dap send-event MyStopEvent", + "lldb-dap send-event MyStopEvent '{\"key\": 321}'" + ] +} +``` + +[See the specification](https://microsoft.github.io/debug-adapter-protocol/specification#Base_Protocol_Event) +for more details on Debug Adapter Protocol events and the VS Code +[debug.onDidReceiveDebugSessionCustomEvent](https://code.visualstudio.com/api/references/vscode-api#debug.onDidReceiveDebugSessionCustomEvent) +API for handling a custom event from an extension. + ## Contributing `lldb-dap` and `lldb` are developed under the umbrella of the [LLVM project](https://llvm.org/).
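On the client side, an event sent this way arrives through the VS Code API referenced above. A minimal extension sketch, assuming the session is registered under the debug type `lldb-dap` and reusing the hypothetical `MyStopEvent` name from the example configuration (both names are illustrative assumptions, not part of this patch):

```typescript
import * as vscode from "vscode";

export function activate(context: vscode.ExtensionContext) {
  // Fires for every DAP event the client does not consume itself,
  // including events emitted with `lldb-dap send-event`.
  context.subscriptions.push(
    vscode.debug.onDidReceiveDebugSessionCustomEvent((e) => {
      // The "lldb-dap" session type and "MyStopEvent" name are illustrative
      // assumptions matching the example configuration; `e.body` carries the
      // optional JSON body (e.g. {"key": 321}) or is undefined if omitted.
      if (e.session.type === "lldb-dap" && e.event === "MyStopEvent") {
        vscode.window.showInformationMessage(
          `MyStopEvent: ${JSON.stringify(e.body ?? {})}`);
      }
    }));
}
```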
diff --git a/lldb/tools/lldb-dap/lldb-dap.cpp b/lldb/tools/lldb-dap/lldb-dap.cpp index a167088c8901cad4f7d4bca6a11fae5583e51d8f..f70b0d3d4cbee07bae9487a5c0f57d084574c8e1 100644 --- a/lldb/tools/lldb-dap/lldb-dap.cpp +++ b/lldb/tools/lldb-dap/lldb-dap.cpp @@ -1889,13 +1889,15 @@ void request_initialize(const llvm::json::Object &request) { "lldb-dap", "Commands for managing lldb-dap."); if (GetBoolean(arguments, "supportsStartDebuggingRequest", false)) { cmd.AddCommand( - "startDebugging", new StartDebuggingRequestHandler(), + "start-debugging", new StartDebuggingRequestHandler(), "Sends a startDebugging request from the debug adapter to the client " "to start a child debug session of the same type as the caller."); } cmd.AddCommand( "repl-mode", new ReplModeRequestHandler(), "Get or set the repl behavior of lldb-dap evaluation requests."); + cmd.AddCommand("send-event", new SendEventRequestHandler(), + "Sends a DAP event to the client."); g_dap.progress_event_thread = std::thread(ProgressEventThreadFunction); diff --git a/lldb/tools/lldb-server/lldb-platform.cpp b/lldb/tools/lldb-server/lldb-platform.cpp index 2ef780578d0a28c9dfb5c82c0b974f025f7edccc..735a558810da582ad02420b384d60026a6e935c3 100644 --- a/lldb/tools/lldb-server/lldb-platform.cpp +++ b/lldb/tools/lldb-server/lldb-platform.cpp @@ -260,8 +260,7 @@ static void client_handle(GDBRemoteCommunicationServerPlatform &platform, static Status spawn_process(const char *progname, const Socket *conn_socket, uint16_t gdb_port, const lldb_private::Args &args, const std::string &log_file, - const StringRef log_channels, MainLoop &main_loop, - std::promise<void> &child_exited) { + const StringRef log_channels, MainLoop &main_loop) { Status error; SharedSocket shared_socket(conn_socket, error); if (error.Fail()) @@ -301,12 +300,10 @@ static Status spawn_process(const char *progname, const Socket *conn_socket, if (g_server) launch_info.SetMonitorProcessCallback([](lldb::pid_t, int, int) {}); else - launch_info.SetMonitorProcessCallback( - [&child_exited, &main_loop](lldb::pid_t, int, int) { - main_loop.AddPendingCallback( - [](MainLoopBase &loop) { loop.RequestTermination(); }); - child_exited.set_value(); - }); + launch_info.SetMonitorProcessCallback([&main_loop](lldb::pid_t, int, int) { + main_loop.AddPendingCallback( + [](MainLoopBase &loop) { loop.RequestTermination(); }); + }); // Copy the current environment.
launch_info.GetEnvironment() = Host::GetEnvironment(); @@ -550,27 +547,24 @@ int main_platform(int argc, char *argv[]) { return socket_error; } - std::promise<void> child_exited; MainLoop main_loop; { llvm::Expected<std::vector<MainLoopBase::ReadHandleUP>> platform_handles = platform_sock->Accept( main_loop, [progname, gdbserver_port, &inferior_arguments, log_file, - log_channels, &main_loop, &child_exited, + log_channels, &main_loop, + &platform_handles](std::unique_ptr<Socket> sock_up) { printf("Connection established.\n"); - Status error = spawn_process( - progname, sock_up.get(), gdbserver_port, inferior_arguments, - log_file, log_channels, main_loop, child_exited); + Status error = spawn_process(progname, sock_up.get(), + gdbserver_port, inferior_arguments, + log_file, log_channels, main_loop); if (error.Fail()) { Log *log = GetLog(LLDBLog::Platform); LLDB_LOGF(log, "spawn_process failed: %s", error.AsCString()); WithColor::error() << "spawn_process failed: " << error.AsCString() << "\n"; - if (!g_server) { + if (!g_server) main_loop.RequestTermination(); - child_exited.set_value(); - } } if (!g_server) platform_handles->clear(); @@ -592,7 +586,6 @@ int main_platform(int argc, char *argv[]) { main_loop.Run(); } - child_exited.get_future().get(); fprintf(stderr, "lldb-server exiting...\n"); diff --git a/lldb/unittests/Breakpoint/CMakeLists.txt b/lldb/unittests/Breakpoint/CMakeLists.txt index 757c2da1a4d9dee85343d098c8699e36d4bd14cd..db985bc82dc5e23b5e1c23c221848284f01cbe27 100644 --- a/lldb/unittests/Breakpoint/CMakeLists.txt +++ b/lldb/unittests/Breakpoint/CMakeLists.txt @@ -1,10 +1,10 @@ -add_lldb_unittest(LLDBBreakpointTests - BreakpointIDTest.cpp - WatchpointAlgorithmsTests.cpp - - LINK_LIBS - lldbBreakpoint - lldbCore - LINK_COMPONENTS - Support - ) +add_lldb_unittest(LLDBBreakpointTests + BreakpointIDTest.cpp + WatchpointAlgorithmsTests.cpp + + LINK_LIBS + lldbBreakpoint + lldbCore + LINK_COMPONENTS + Support + ) diff --git a/lldb/unittests/Host/MainLoopTest.cpp b/lldb/unittests/Host/MainLoopTest.cpp index b8417c9f00aa86bcae8e5d48412fbcbc813388e9..4688d4fed475b61a1fbb10df4e6c1405b9f7c045 100644 --- a/lldb/unittests/Host/MainLoopTest.cpp +++ b/lldb/unittests/Host/MainLoopTest.cpp @@ -194,9 +194,6 @@ TEST_F(MainLoopTest, PendingCallbackTrigger) { add_callback2.set_value(); }); Status error; - auto socket_handle = loop.RegisterReadObject( - socketpair[1], [](MainLoopBase &) {}, error); - ASSERT_TRUE(socket_handle); ASSERT_THAT_ERROR(error.ToError(), llvm::Succeeded()); bool callback2_called = false; std::thread callback2_adder([&]() { @@ -212,15 +209,18 @@ TEST_F(MainLoopTest, PendingCallbackTrigger) { ASSERT_TRUE(callback2_called); } -// Regression test for assertion failure if a lot of callbacks end up -// being queued after loop exits. -TEST_F(MainLoopTest, PendingCallbackAfterLoopExited) { +TEST_F(MainLoopTest, ManyPendingCallbacks) { MainLoop loop; Status error; - ASSERT_TRUE(loop.Run().Success()); - // Try to fill the pipe buffer in. + // Try to fill up the pipe buffer and make sure bad things don't happen. This + // is a regression test for the case where writing to the interrupt pipe + // caused a deadlock when the pipe filled up (either because the main loop was + // not running, because it was slow, or because it was busy/blocked doing + // something else).
for (int i = 0; i < 65536; ++i) - loop.AddPendingCallback([&](MainLoopBase &loop) {}); + loop.AddPendingCallback( + [&](MainLoopBase &loop) { loop.RequestTermination(); }); + ASSERT_TRUE(loop.Run().Success()); } #ifdef LLVM_ON_UNIX diff --git a/lldb/unittests/Utility/DiagnosticsRenderingTest.cpp b/lldb/unittests/Utility/DiagnosticsRenderingTest.cpp index 2bd80796b8074c21a5a41c6e0ae65332b9cb0105..39d8b1d558420d30e834cf5b0f44599884668a45 100644 --- a/lldb/unittests/Utility/DiagnosticsRenderingTest.cpp +++ b/lldb/unittests/Utility/DiagnosticsRenderingTest.cpp @@ -16,12 +16,63 @@ std::string Render(std::vector<DiagnosticDetail> details) { } // namespace TEST_F(ErrorDisplayTest, RenderStatus) { - DiagnosticDetail::SourceLocation inline_loc; - inline_loc.in_user_input = true; + using SourceLocation = DiagnosticDetail::SourceLocation; { + SourceLocation inline_loc; + inline_loc.in_user_input = true; std::string result = Render({DiagnosticDetail{inline_loc, eSeverityError, "foo", ""}}); ASSERT_TRUE(StringRef(result).contains("error:")); ASSERT_TRUE(StringRef(result).contains("foo")); } + + { + // Test that diagnostics on the same column can be handled and all + // three errors are diagnosed. + SourceLocation loc1 = {FileSpec{"a.c"}, 13, 11, 0, false, true}; + SourceLocation loc2 = {FileSpec{"a.c"}, 13, 13, 0, false, true}; + std::string result = + Render({DiagnosticDetail{loc1, eSeverityError, "1", "1"}, + DiagnosticDetail{loc1, eSeverityError, "2", "2"}, + DiagnosticDetail{loc2, eSeverityError, "3", "3"}}); + ASSERT_TRUE(StringRef(result).contains("error: 1")); + ASSERT_TRUE(StringRef(result).contains("error: 2")); + ASSERT_TRUE(StringRef(result).contains("error: 3")); + } + { + // Test that diagnostics in reverse order are emitted correctly. + SourceLocation loc1 = {FileSpec{"a.c"}, 1, 20, 0, false, true}; + SourceLocation loc2 = {FileSpec{"a.c"}, 2, 10, 0, false, true}; + std::string result = + Render({DiagnosticDetail{loc2, eSeverityError, "X", "X"}, + DiagnosticDetail{loc1, eSeverityError, "Y", "Y"}}); + ASSERT_LT(StringRef(result).find("Y"), StringRef(result).find("X")); + } + { + // Test that diagnostics in reverse order are emitted correctly. + SourceLocation loc1 = {FileSpec{"a.c"}, 2, 10, 0, false, true}; + SourceLocation loc2 = {FileSpec{"a.c"}, 1, 20, 0, false, true}; + std::string result = + Render({DiagnosticDetail{loc2, eSeverityError, "X", "X"}, + DiagnosticDetail{loc1, eSeverityError, "Y", "Y"}}); + ASSERT_LT(StringRef(result).find("Y"), StringRef(result).find("X")); + } + { + // Test that range diagnostics are emitted correctly. + SourceLocation loc1 = {FileSpec{"a.c"}, 1, 1, 3, false, true}; + SourceLocation loc2 = {FileSpec{"a.c"}, 1, 5, 3, false, true}; + std::string result = + Render({DiagnosticDetail{loc1, eSeverityError, "X", "X"}, + DiagnosticDetail{loc2, eSeverityError, "Y", "Y"}}); + auto lines = StringRef(result).split('\n'); + auto line1 = lines.first; + lines = lines.second.split('\n'); + auto line2 = lines.first; + lines = lines.second.split('\n'); + auto line3 = lines.first; + // 1234567 + ASSERT_EQ(line1, "^~~ ^~~"); + ASSERT_EQ(line2, "| error: Y"); + ASSERT_EQ(line3, "error: X"); + } } diff --git a/llvm/CMakeLists.txt b/llvm/CMakeLists.txt index 7f35b572fe35659f6bc91075f3bdbc8d6ee6beb6..0cd5ba240db1ee26e79ba60e89e93c15273b3885 100644 --- a/llvm/CMakeLists.txt +++ b/llvm/CMakeLists.txt @@ -117,6 +117,12 @@ endif() # This allows an easy way of setting up a build directory for llvm and another # one for llvm+clang+... using the same sources.
set(LLVM_ALL_PROJECTS "bolt;clang;clang-tools-extra;compiler-rt;cross-project-tests;libc;libclc;lld;lldb;mlir;openmp;polly;pstl") +if (${CMAKE_SYSTEM_NAME} MATCHES "AIX") + # Disallow 'openmp' as an LLVM PROJECT on AIX as the supported way is to use + # LLVM_ENABLE_RUNTIMES. + list(REMOVE_ITEM LLVM_ALL_PROJECTS openmp) +endif() + # The flang project is not yet part of "all" projects (see C++ requirements) set(LLVM_EXTRA_PROJECTS "flang") # List of all known projects in the mono repo diff --git a/llvm/benchmarks/FormatVariadicBM.cpp b/llvm/benchmarks/FormatVariadicBM.cpp index c03ead400d0d5c88aa948061fb0c0c6d54d536e1..e351db338730e9859e9ee5a3bb0e429041f0b98f 100644 --- a/llvm/benchmarks/FormatVariadicBM.cpp +++ b/llvm/benchmarks/FormatVariadicBM.cpp @@ -1,63 +1,63 @@ -//===- FormatVariadicBM.cpp - formatv() benchmark ---------- --------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "benchmark/benchmark.h" -#include "llvm/Support/FormatVariadic.h" -#include <algorithm> -#include <string> -#include <vector> - -using namespace llvm; -using namespace std; - -// Generate a list of format strings that have `NumReplacements` replacements -// by permuting the replacements and some literal text. -static vector<string> getFormatStrings(int NumReplacements) { - vector<string> Components; - for (int I = 0; I < NumReplacements; I++) - Components.push_back("{" + to_string(I) + "}"); - // Intersperse these with some other literal text (_). - const string_view Literal = "____"; - for (char C : Literal) - Components.push_back(string(1, C)); - - vector<string> Formats; - do { - string Concat; - for (const string &C : Components) - Concat += C; - Formats.emplace_back(Concat); - } while (next_permutation(Components.begin(), Components.end())); - return Formats; -} - -// Generate the set of formats to exercise outside the benchmark code. -static const vector<vector<string>> Formats = { - getFormatStrings(1), getFormatStrings(2), getFormatStrings(3), - getFormatStrings(4), getFormatStrings(5), -}; - -// Benchmark formatv() for a variety of format strings and 1-5 replacements. -static void BM_FormatVariadic(benchmark::State &state) { - for (auto _ : state) { - for (const string &Fmt : Formats[0]) - formatv(Fmt.c_str(), 1).str(); - for (const string &Fmt : Formats[1]) - formatv(Fmt.c_str(), 1, 2).str(); - for (const string &Fmt : Formats[2]) - formatv(Fmt.c_str(), 1, 2, 3).str(); - for (const string &Fmt : Formats[3]) - formatv(Fmt.c_str(), 1, 2, 3, 4).str(); - for (const string &Fmt : Formats[4]) - formatv(Fmt.c_str(), 1, 2, 3, 4, 5).str(); - } -} - -BENCHMARK(BM_FormatVariadic); - -BENCHMARK_MAIN(); +//===- FormatVariadicBM.cpp - formatv() benchmark ---------- --------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "benchmark/benchmark.h" +#include "llvm/Support/FormatVariadic.h" +#include <algorithm> +#include <string> +#include <vector> + +using namespace llvm; +using namespace std; + +// Generate a list of format strings that have `NumReplacements` replacements +// by permuting the replacements and some literal text.
+static vector getFormatStrings(int NumReplacements) { + vector Components; + for (int I = 0; I < NumReplacements; I++) + Components.push_back("{" + to_string(I) + "}"); + // Intersperse these with some other literal text (_). + const string_view Literal = "____"; + for (char C : Literal) + Components.push_back(string(1, C)); + + vector Formats; + do { + string Concat; + for (const string &C : Components) + Concat += C; + Formats.emplace_back(Concat); + } while (next_permutation(Components.begin(), Components.end())); + return Formats; +} + +// Generate the set of formats to exercise outside the benchmark code. +static const vector> Formats = { + getFormatStrings(1), getFormatStrings(2), getFormatStrings(3), + getFormatStrings(4), getFormatStrings(5), +}; + +// Benchmark formatv() for a variety of format strings and 1-5 replacements. +static void BM_FormatVariadic(benchmark::State &state) { + for (auto _ : state) { + for (const string &Fmt : Formats[0]) + formatv(Fmt.c_str(), 1).str(); + for (const string &Fmt : Formats[1]) + formatv(Fmt.c_str(), 1, 2).str(); + for (const string &Fmt : Formats[2]) + formatv(Fmt.c_str(), 1, 2, 3).str(); + for (const string &Fmt : Formats[3]) + formatv(Fmt.c_str(), 1, 2, 3, 4).str(); + for (const string &Fmt : Formats[4]) + formatv(Fmt.c_str(), 1, 2, 3, 4, 5).str(); + } +} + +BENCHMARK(BM_FormatVariadic); + +BENCHMARK_MAIN(); diff --git a/llvm/benchmarks/GetIntrinsicForClangBuiltin.cpp b/llvm/benchmarks/GetIntrinsicForClangBuiltin.cpp index fa9c528424c95f7c5adc4404b5134331cfff3a85..953d9125e11ee25515ed4d1a829ab30fecdf5d92 100644 --- a/llvm/benchmarks/GetIntrinsicForClangBuiltin.cpp +++ b/llvm/benchmarks/GetIntrinsicForClangBuiltin.cpp @@ -1,50 +1,50 @@ -#include "benchmark/benchmark.h" -#include "llvm/IR/Intrinsics.h" - -using namespace llvm; -using namespace Intrinsic; - -// Benchmark intrinsic lookup from a variety of targets. -static void BM_GetIntrinsicForClangBuiltin(benchmark::State &state) { - static const char *Builtins[] = { - "__builtin_adjust_trampoline", - "__builtin_trap", - "__builtin_arm_ttest", - "__builtin_amdgcn_cubetc", - "__builtin_amdgcn_udot2", - "__builtin_arm_stc", - "__builtin_bpf_compare", - "__builtin_HEXAGON_A2_max", - "__builtin_lasx_xvabsd_b", - "__builtin_mips_dlsa", - "__nvvm_floor_f", - "__builtin_altivec_vslb", - "__builtin_r600_read_tgid_x", - "__builtin_riscv_aes64im", - "__builtin_s390_vcksm", - "__builtin_ve_vl_pvfmksge_Mvl", - "__builtin_ia32_axor64", - "__builtin_bitrev", - }; - static const char *Targets[] = {"", "aarch64", "amdgcn", "mips", - "nvvm", "r600", "riscv"}; - - for (auto _ : state) { - for (auto Builtin : Builtins) - for (auto Target : Targets) - getIntrinsicForClangBuiltin(Target, Builtin); - } -} - -static void -BM_GetIntrinsicForClangBuiltinHexagonFirst(benchmark::State &state) { - // Exercise the worst case by looking for the first builtin for a target - // that has a lot of builtins. - for (auto _ : state) - getIntrinsicForClangBuiltin("hexagon", "__builtin_HEXAGON_A2_abs"); -} - -BENCHMARK(BM_GetIntrinsicForClangBuiltin); -BENCHMARK(BM_GetIntrinsicForClangBuiltinHexagonFirst); - -BENCHMARK_MAIN(); +#include "benchmark/benchmark.h" +#include "llvm/IR/Intrinsics.h" + +using namespace llvm; +using namespace Intrinsic; + +// Benchmark intrinsic lookup from a variety of targets. 
+static void BM_GetIntrinsicForClangBuiltin(benchmark::State &state) { + static const char *Builtins[] = { + "__builtin_adjust_trampoline", + "__builtin_trap", + "__builtin_arm_ttest", + "__builtin_amdgcn_cubetc", + "__builtin_amdgcn_udot2", + "__builtin_arm_stc", + "__builtin_bpf_compare", + "__builtin_HEXAGON_A2_max", + "__builtin_lasx_xvabsd_b", + "__builtin_mips_dlsa", + "__nvvm_floor_f", + "__builtin_altivec_vslb", + "__builtin_r600_read_tgid_x", + "__builtin_riscv_aes64im", + "__builtin_s390_vcksm", + "__builtin_ve_vl_pvfmksge_Mvl", + "__builtin_ia32_axor64", + "__builtin_bitrev", + }; + static const char *Targets[] = {"", "aarch64", "amdgcn", "mips", + "nvvm", "r600", "riscv"}; + + for (auto _ : state) { + for (auto Builtin : Builtins) + for (auto Target : Targets) + getIntrinsicForClangBuiltin(Target, Builtin); + } +} + +static void +BM_GetIntrinsicForClangBuiltinHexagonFirst(benchmark::State &state) { + // Exercise the worst case by looking for the first builtin for a target + // that has a lot of builtins. + for (auto _ : state) + getIntrinsicForClangBuiltin("hexagon", "__builtin_HEXAGON_A2_abs"); +} + +BENCHMARK(BM_GetIntrinsicForClangBuiltin); +BENCHMARK(BM_GetIntrinsicForClangBuiltinHexagonFirst); + +BENCHMARK_MAIN(); diff --git a/llvm/benchmarks/GetIntrinsicInfoTableEntriesBM.cpp b/llvm/benchmarks/GetIntrinsicInfoTableEntriesBM.cpp index 7f3bd3bc9eb6b3df884a7f1239a1a7e6ebef0b29..758291274675d6acb2083c41ffb820272d410f9b 100644 --- a/llvm/benchmarks/GetIntrinsicInfoTableEntriesBM.cpp +++ b/llvm/benchmarks/GetIntrinsicInfoTableEntriesBM.cpp @@ -1,30 +1,30 @@ -//===- GetIntrinsicInfoTableEntries.cpp - IIT signature benchmark ---------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "benchmark/benchmark.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/IR/Intrinsics.h" - -using namespace llvm; -using namespace Intrinsic; - -static void BM_GetIntrinsicInfoTableEntries(benchmark::State &state) { - SmallVector Table; - for (auto _ : state) { - for (ID ID = 1; ID < num_intrinsics; ++ID) { - // This makes sure the vector does not keep growing, as well as after the - // first iteration does not result in additional allocations. - Table.clear(); - getIntrinsicInfoTableEntries(ID, Table); - } - } -} - -BENCHMARK(BM_GetIntrinsicInfoTableEntries); - -BENCHMARK_MAIN(); +//===- GetIntrinsicInfoTableEntries.cpp - IIT signature benchmark ---------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "benchmark/benchmark.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/IR/Intrinsics.h" + +using namespace llvm; +using namespace Intrinsic; + +static void BM_GetIntrinsicInfoTableEntries(benchmark::State &state) { + SmallVector Table; + for (auto _ : state) { + for (ID ID = 1; ID < num_intrinsics; ++ID) { + // This makes sure the vector does not keep growing, as well as after the + // first iteration does not result in additional allocations. 
+      Table.clear();
+      getIntrinsicInfoTableEntries(ID, Table);
+    }
+  }
+}
+
+BENCHMARK(BM_GetIntrinsicInfoTableEntries);
+
+BENCHMARK_MAIN();
diff --git a/llvm/docs/DeveloperPolicy.rst b/llvm/docs/DeveloperPolicy.rst
index caa4b31b949c9204ed1d1dc2000a6dcb92826718..0ecf1423e6037115fa669a197639f03903d4dc64 100644
--- a/llvm/docs/DeveloperPolicy.rst
+++ b/llvm/docs/DeveloperPolicy.rst
@@ -105,6 +105,13 @@ When submitting patches, please do not add confidentiality or non-disclosure
 notices to the patches themselves. These notices conflict with the LLVM
 licensing terms and may result in your contribution being excluded.
 
+The LLVM project uses email to communicate to contributors outside of the
+GitHub platform about their past contributions. Primarily, our buildbot
+infrastructure uses emails to contact contributors about build and test
+failures. Therefore, the LLVM community requires contributors to have a public
+email address associated with their GitHub commits, so please ensure that "Keep
+my email addresses private" is disabled in your
+`account settings <https://github.com/settings/emails>`_.
 
 .. _code review:
 
diff --git a/llvm/docs/ReleaseNotes.md b/llvm/docs/ReleaseNotes.md
index dcdd7a25c7fbeeb621dd53a565d77492fdd6cf83..f8bc7e79239b64bfda19417692a2309d1a28bb28 100644
--- a/llvm/docs/ReleaseNotes.md
+++ b/llvm/docs/ReleaseNotes.md
@@ -125,6 +125,12 @@ Changes to the ARM Backend
   the required alignment space with a sequence of `0x0` bytes (the requested
   fill value) rather than NOPs.
 
+* The default behavior for frame pointers in leaf functions has been updated.
+  When the `-fno-omit-frame-pointer` option is specified, `FPKeepKindStr` is
+  set to `-mframe-pointer=all`, meaning the frame pointer (FP) is now retained
+  in leaf functions by default. To eliminate the frame pointer in leaf
+  functions, you must explicitly use the `-momit-leaf-frame-pointer` option.
+
 Changes to the AVR Backend
 --------------------------
 
diff --git a/llvm/docs/TestingGuide.rst b/llvm/docs/TestingGuide.rst
index 08617933519fdbca3cf3c4e4ab7da0932d6ded34..344a295226f6ae4d63806119cb75c5547f601858 100644
--- a/llvm/docs/TestingGuide.rst
+++ b/llvm/docs/TestingGuide.rst
@@ -360,6 +360,12 @@ Best practices for regression tests
 - Try to give values (including variables, blocks and functions) meaningful
   names, and avoid retaining complex names generated by the optimization
   pipeline (such as ``%foo.0.0.0.0.0.0``).
+- If your tests depend on specific input file encodings, beware of line-ending
+  issues across different platforms and in the project's history. Before you
+  commit tests that depend on explicit encodings, consider adding filetype or
+  specific line-ending annotations to a `.gitattributes
+  <https://git-scm.com/docs/gitattributes#_effects>`_ file in the appropriate
+  directory in the repository.
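As a concrete illustration of that last point (hypothetical paths, not
directories from this patch), such an annotation file might contain::

    test/tools/sometool/Inputs/crlf-input.txt  text eol=crlf
    test/tools/sometool/Inputs/raw-input.bin   -text

The first entry pins CRLF endings on checkout on every platform; the second
marks the file as binary so Git never rewrites its bytes.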
Extra files ----------- diff --git a/llvm/docs/_static/LoopOptWG_invite.ics b/llvm/docs/_static/LoopOptWG_invite.ics index 65597d90a9c85292110ff5b652eccf4bfb53f027..7c92e4048cc3d1c1a6e5de6c16df5e23d4078177 100644 --- a/llvm/docs/_static/LoopOptWG_invite.ics +++ b/llvm/docs/_static/LoopOptWG_invite.ics @@ -1,80 +1,80 @@ -BEGIN:VCALENDAR -PRODID:-//Google Inc//Google Calendar 70.9054//EN -VERSION:2.0 -CALSCALE:GREGORIAN -METHOD:PUBLISH -X-WR-CALNAME:LLVM Loop Optimization Discussion -X-WR-TIMEZONE:Europe/Berlin -BEGIN:VTIMEZONE -TZID:America/New_York -X-LIC-LOCATION:America/New_York -BEGIN:DAYLIGHT -TZOFFSETFROM:-0500 -TZOFFSETTO:-0400 -TZNAME:EDT -DTSTART:19700308T020000 -RRULE:FREQ=YEARLY;BYMONTH=3;BYDAY=2SU -END:DAYLIGHT -BEGIN:STANDARD -TZOFFSETFROM:-0400 -TZOFFSETTO:-0500 -TZNAME:EST -DTSTART:19701101T020000 -RRULE:FREQ=YEARLY;BYMONTH=11;BYDAY=1SU -END:STANDARD -END:VTIMEZONE -BEGIN:VEVENT -DTSTART;TZID=America/New_York:20240904T110000 -DTEND;TZID=America/New_York:20240904T120000 -RRULE:FREQ=MONTHLY;BYDAY=1WE -DTSTAMP:20240821T160951Z -UID:58h3f0kd3aooohmeii0johh23c@google.com -X-GOOGLE-CONFERENCE:https://meet.google.com/fmz-gspu-odg -CREATED:20240821T151507Z -DESCRIPTION:LLVM Loop Optimization Discussion
Video call link:
https://meet.google.c - om/fmz-gspu-odg
Agenda/Minutes/Discussion: https://docs.google.com/document/d/1sdzoyB - 11s0ccTZ3fobqctDpgJmRoFcz0sviKxqczs4g/edit?usp=sharing\n\n-::~:~::~:~:~ - :~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~::~:~::-\ - nJoin with Google Meet: https://meet.google.com/fmz-gspu-odg\nOr dial: (DE) - +49 40 8081617343 PIN: 948106286#\nMore phone numbers: https://tel.meet/fm - z-gspu-odg?pin=6273693382184&hs=7\n\nLearn more about Meet at: https://supp - ort.google.com/a/users/answer/9282720\n\nPlease do not edit this section.\n - -::~:~::~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~ - :~:~::~:~::- -LAST-MODIFIED:20240821T160941Z -SEQUENCE:0 -STATUS:CONFIRMED -SUMMARY:LLVM Loop Optimization Discussion -TRANSP:OPAQUE -END:VEVENT -BEGIN:VEVENT -DTSTART;TZID=America/New_York:20240904T110000 -DTEND;TZID=America/New_York:20240904T120000 -DTSTAMP:20240821T160951Z -UID:58h3f0kd3aooohmeii0johh23c@google.com -X-GOOGLE-CONFERENCE:https://meet.google.com/fmz-gspu-odg -RECURRENCE-ID;TZID=America/New_York:20240904T110000 -CREATED:20240821T151507Z -DESCRIPTION:LLVM Loop Optimization Discussion
Video call link: https://meet.google.c - om/fmz-gspu-odg
Agenda/Minutes/Discussion: https://docs.google.com/document/d/1sdzoyB - 11s0ccTZ3fobqctDpgJmRoFcz0sviKxqczs4g/edit?usp=sharing\n\n-::~:~::~:~:~ - :~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~::~:~::-\ - nJoin with Google Meet: https://meet.google.com/fmz-gspu-odg\nOr dial: (DE) - +49 40 8081617343 PIN: 948106286#\nMore phone numbers: https://tel.meet/fm - z-gspu-odg?pin=6273693382184&hs=7\n\nLearn more about Meet at: https://supp - ort.google.com/a/users/answer/9282720\n\nPlease do not edit this section.\n - -::~:~::~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~ - :~:~::~:~::- -LAST-MODIFIED:20240821T160941Z -SEQUENCE:0 -STATUS:CONFIRMED -SUMMARY:LLVM Loop Optimization Discussion -TRANSP:OPAQUE -END:VEVENT -END:VCALENDAR +BEGIN:VCALENDAR +PRODID:-//Google Inc//Google Calendar 70.9054//EN +VERSION:2.0 +CALSCALE:GREGORIAN +METHOD:PUBLISH +X-WR-CALNAME:LLVM Loop Optimization Discussion +X-WR-TIMEZONE:Europe/Berlin +BEGIN:VTIMEZONE +TZID:America/New_York +X-LIC-LOCATION:America/New_York +BEGIN:DAYLIGHT +TZOFFSETFROM:-0500 +TZOFFSETTO:-0400 +TZNAME:EDT +DTSTART:19700308T020000 +RRULE:FREQ=YEARLY;BYMONTH=3;BYDAY=2SU +END:DAYLIGHT +BEGIN:STANDARD +TZOFFSETFROM:-0400 +TZOFFSETTO:-0500 +TZNAME:EST +DTSTART:19701101T020000 +RRULE:FREQ=YEARLY;BYMONTH=11;BYDAY=1SU +END:STANDARD +END:VTIMEZONE +BEGIN:VEVENT +DTSTART;TZID=America/New_York:20240904T110000 +DTEND;TZID=America/New_York:20240904T120000 +RRULE:FREQ=MONTHLY;BYDAY=1WE +DTSTAMP:20240821T160951Z +UID:58h3f0kd3aooohmeii0johh23c@google.com +X-GOOGLE-CONFERENCE:https://meet.google.com/fmz-gspu-odg +CREATED:20240821T151507Z +DESCRIPTION:LLVM Loop Optimization Discussion
Video call link: https://meet.google.c + om/fmz-gspu-odg
Agenda/Minutes/Discussion: https://docs.google.com/document/d/1sdzoyB + 11s0ccTZ3fobqctDpgJmRoFcz0sviKxqczs4g/edit?usp=sharing\n\n-::~:~::~:~:~ + :~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~::~:~::-\ + nJoin with Google Meet: https://meet.google.com/fmz-gspu-odg\nOr dial: (DE) + +49 40 8081617343 PIN: 948106286#\nMore phone numbers: https://tel.meet/fm + z-gspu-odg?pin=6273693382184&hs=7\n\nLearn more about Meet at: https://supp + ort.google.com/a/users/answer/9282720\n\nPlease do not edit this section.\n + -::~:~::~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~ + :~:~::~:~::- +LAST-MODIFIED:20240821T160941Z +SEQUENCE:0 +STATUS:CONFIRMED +SUMMARY:LLVM Loop Optimization Discussion +TRANSP:OPAQUE +END:VEVENT +BEGIN:VEVENT +DTSTART;TZID=America/New_York:20240904T110000 +DTEND;TZID=America/New_York:20240904T120000 +DTSTAMP:20240821T160951Z +UID:58h3f0kd3aooohmeii0johh23c@google.com +X-GOOGLE-CONFERENCE:https://meet.google.com/fmz-gspu-odg +RECURRENCE-ID;TZID=America/New_York:20240904T110000 +CREATED:20240821T151507Z +DESCRIPTION:LLVM Loop Optimization Discussion
Video call link: https://meet.google.c + om/fmz-gspu-odg
Agenda/Minutes/Discussion: https://docs.google.com/document/d/1sdzoyB + 11s0ccTZ3fobqctDpgJmRoFcz0sviKxqczs4g/edit?usp=sharing\n\n-::~:~::~:~:~ + :~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~::~:~::-\ + nJoin with Google Meet: https://meet.google.com/fmz-gspu-odg\nOr dial: (DE) + +49 40 8081617343 PIN: 948106286#\nMore phone numbers: https://tel.meet/fm + z-gspu-odg?pin=6273693382184&hs=7\n\nLearn more about Meet at: https://supp + ort.google.com/a/users/answer/9282720\n\nPlease do not edit this section.\n + -::~:~::~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~ + :~:~::~:~::- +LAST-MODIFIED:20240821T160941Z +SEQUENCE:0 +STATUS:CONFIRMED +SUMMARY:LLVM Loop Optimization Discussion +TRANSP:OPAQUE +END:VEVENT +END:VCALENDAR diff --git a/llvm/include/llvm/ADT/APFixedPoint.h b/llvm/include/llvm/ADT/APFixedPoint.h index e4aa82d7a41c313aaf6f3950778ef34381e580f7..70d7f325702cf525c0c5776712849dc03ca3608f 100644 --- a/llvm/include/llvm/ADT/APFixedPoint.h +++ b/llvm/include/llvm/ADT/APFixedPoint.h @@ -168,7 +168,9 @@ public: } APFixedPoint(uint64_t Val, const FixedPointSemantics &Sema) - : APFixedPoint(APInt(Sema.getWidth(), Val, Sema.isSigned()), Sema) {} + : APFixedPoint(APInt(Sema.getWidth(), Val, Sema.isSigned(), + /*implicitTrunc=*/true), + Sema) {} // Zero initialization. APFixedPoint(const FixedPointSemantics &Sema) : APFixedPoint(0, Sema) {} diff --git a/llvm/include/llvm/BinaryFormat/Minidump.h b/llvm/include/llvm/BinaryFormat/Minidump.h index 8054e81322a92a394668dcad6f1effc57f083246..addff42982352f38fa3e565da12db2f7d3bd7b50 100644 --- a/llvm/include/llvm/BinaryFormat/Minidump.h +++ b/llvm/include/llvm/BinaryFormat/Minidump.h @@ -246,6 +246,8 @@ static_assert(sizeof(Thread) == 48); struct Exception { static constexpr size_t MaxParameters = 15; + static constexpr size_t MaxParameterBytes = MaxParameters * sizeof(uint64_t); + static const uint32_t LLDB_FLAG = 'LLDB'; support::ulittle32_t ExceptionCode; support::ulittle32_t ExceptionFlags; diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h index d4c0128714ce335f6981cf291e2f823c1ad12cda..a5eb87c2b5f3b69d9a60cca25822eb6a2e709ad9 100644 --- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h +++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h @@ -2023,6 +2023,9 @@ public: case Intrinsic::atan: ISD = ISD::FATAN; break; + case Intrinsic::atan2: + ISD = ISD::FATAN2; + break; case Intrinsic::sinh: ISD = ISD::FSINH; break; diff --git a/llvm/include/llvm/CodeGen/EarlyIfConversion.h b/llvm/include/llvm/CodeGen/EarlyIfConversion.h new file mode 100644 index 0000000000000000000000000000000000000000..78bf12ade02c3d2ed03557962b0d8d11bcff08b6 --- /dev/null +++ b/llvm/include/llvm/CodeGen/EarlyIfConversion.h @@ -0,0 +1,24 @@ +//===- llvm/CodeGen/EarlyIfConversion.h -------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CODEGEN_EARLYIFCONVERSION_H +#define LLVM_CODEGEN_EARLYIFCONVERSION_H + +#include "llvm/CodeGen/MachinePassManager.h" + +namespace llvm { + +class EarlyIfConverterPass : public PassInfoMixin { +public: + PreservedAnalyses run(MachineFunction &MF, + MachineFunctionAnalysisManager &MFAM); +}; + +} // namespace llvm + +#endif // LLVM_CODEGEN_EARLYIFCONVERSION_H diff --git a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h index 82e713f30ea31c89aba6f8ef30f66138bf47a5df..bcd44abb2088a0434258d3c95b7c346c8890de94 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h @@ -599,11 +599,22 @@ public: LegalizeRuleSet &legalFor(std::initializer_list Types) { return actionFor(LegalizeAction::Legal, Types); } + LegalizeRuleSet &legalFor(bool Pred, std::initializer_list Types) { + if (!Pred) + return *this; + return actionFor(LegalizeAction::Legal, Types); + } /// The instruction is legal when type indexes 0 and 1 is any type pair in the /// given list. LegalizeRuleSet &legalFor(std::initializer_list> Types) { return actionFor(LegalizeAction::Legal, Types); } + LegalizeRuleSet &legalFor(bool Pred, + std::initializer_list> Types) { + if (!Pred) + return *this; + return actionFor(LegalizeAction::Legal, Types); + } /// The instruction is legal when type index 0 is any type in the given list /// and imm index 0 is anything. LegalizeRuleSet &legalForTypeWithAnyImm(std::initializer_list Types) { @@ -749,6 +760,12 @@ public: return actionFor(LegalizeAction::Libcall, Types); } LegalizeRuleSet & + libcallFor(bool Pred, std::initializer_list> Types) { + if (!Pred) + return *this; + return actionFor(LegalizeAction::Libcall, Types); + } + LegalizeRuleSet & libcallForCartesianProduct(std::initializer_list Types) { return actionForCartesianProduct(LegalizeAction::Libcall, Types); } @@ -846,12 +863,23 @@ public: LegalizeRuleSet &customFor(std::initializer_list Types) { return actionFor(LegalizeAction::Custom, Types); } + LegalizeRuleSet &customFor(bool Pred, std::initializer_list Types) { + if (!Pred) + return *this; + return actionFor(LegalizeAction::Custom, Types); + } - /// The instruction is custom when type indexes 0 and 1 is any type pair in the - /// given list. + /// The instruction is custom when type indexes 0 and 1 is any type pair in + /// the given list. LegalizeRuleSet &customFor(std::initializer_list> Types) { return actionFor(LegalizeAction::Custom, Types); } + LegalizeRuleSet &customFor(bool Pred, + std::initializer_list> Types) { + if (!Pred) + return *this; + return actionFor(LegalizeAction::Custom, Types); + } LegalizeRuleSet &customForCartesianProduct(std::initializer_list Types) { return actionForCartesianProduct(LegalizeAction::Custom, Types); @@ -990,6 +1018,11 @@ public: scalarNarrowerThan(TypeIdx, Ty.getSizeInBits()), changeTo(typeIdx(TypeIdx), Ty)); } + LegalizeRuleSet &minScalar(bool Pred, unsigned TypeIdx, const LLT Ty) { + if (!Pred) + return *this; + return minScalar(TypeIdx, Ty); + } /// Ensure the scalar is at least as wide as Ty if condition is met. 
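The predicated overloads added above (`legalFor`, `customFor`, `libcallFor`,
and `minScalar` taking a leading `bool`) let a target fold a feature check into
the rule chain instead of branching around it. A minimal sketch of how target
legalization rules might use them, where `HasFP16` stands in for whatever
subtarget query the target actually has (illustrative only, not code from this
patch):

```cpp
// Inside a hypothetical target's LegalizerInfo constructor.
const LLT s16 = LLT::scalar(16);
const LLT s32 = LLT::scalar(32);
const LLT s64 = LLT::scalar(64);
const bool HasFP16 = /* e.g. */ ST.hasFullFP16(); // assumed subtarget query

getActionDefinitionsBuilder(G_FADD)
    .legalFor({s32, s64})
    .legalFor(HasFP16, {s16})      // no-op when the predicate is false
    .minScalar(!HasFP16, 0, s32);  // widen s16 only when FP16 is missing
```

Each predicated call simply returns `*this` unchanged when the predicate is
false, so the rules compose the same way as the unconditional variants.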
LegalizeRuleSet &minScalarIf(LegalityPredicate Predicate, unsigned TypeIdx, diff --git a/llvm/include/llvm/CodeGen/ISDOpcodes.h b/llvm/include/llvm/CodeGen/ISDOpcodes.h index da43f5be10ff3bd6479a98ad203fdb58a60a1e26..0b6d155b6d161e569facda3449708c497bc674cc 100644 --- a/llvm/include/llvm/CodeGen/ISDOpcodes.h +++ b/llvm/include/llvm/CodeGen/ISDOpcodes.h @@ -425,6 +425,7 @@ enum NodeType { STRICT_FASIN, STRICT_FACOS, STRICT_FATAN, + STRICT_FATAN2, STRICT_FSINH, STRICT_FCOSH, STRICT_FTANH, @@ -994,6 +995,8 @@ enum NodeType { FPOWI, /// FLDEXP - ldexp, inspired by libm (op0 * 2**op1). FLDEXP, + /// FATAN2 - atan2, inspired by libm. + FATAN2, /// FFREXP - frexp, extract fractional and exponent component of a /// floating-point value. Returns the two components as separate return diff --git a/llvm/include/llvm/CodeGen/MIRParser/MIParser.h b/llvm/include/llvm/CodeGen/MIRParser/MIParser.h index 4d93213de5e07012247efd24aed91c725bc1bbed..0f2898d3554d0620fe86acab3f7142d88fb78a38 100644 --- a/llvm/include/llvm/CodeGen/MIRParser/MIParser.h +++ b/llvm/include/llvm/CodeGen/MIRParser/MIParser.h @@ -45,7 +45,7 @@ struct VRegInfo { } D; Register VReg; Register PreferredReg; - std::vector Flags; + uint8_t Flags = 0; }; using Name2RegClassMap = StringMap; diff --git a/llvm/include/llvm/CodeGen/MachineTraceMetrics.h b/llvm/include/llvm/CodeGen/MachineTraceMetrics.h index c7d97597d551cdfad2c9a2547fa0044a99db1083..d51de24d64e8d7d998919eb3d062775b7a8be93e 100644 --- a/llvm/include/llvm/CodeGen/MachineTraceMetrics.h +++ b/llvm/include/llvm/CodeGen/MachineTraceMetrics.h @@ -46,12 +46,13 @@ #ifndef LLVM_CODEGEN_MACHINETRACEMETRICS_H #define LLVM_CODEGEN_MACHINETRACEMETRICS_H -#include "llvm/ADT/SparseSet.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/SparseSet.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachinePassManager.h" #include "llvm/CodeGen/TargetSchedule.h" namespace llvm { @@ -93,7 +94,7 @@ enum class MachineTraceStrategy { TS_NumStrategies }; -class MachineTraceMetrics : public MachineFunctionPass { +class MachineTraceMetrics { const MachineFunction *MF = nullptr; const TargetInstrInfo *TII = nullptr; const TargetRegisterInfo *TRI = nullptr; @@ -102,19 +103,25 @@ class MachineTraceMetrics : public MachineFunctionPass { TargetSchedModel SchedModel; public: + friend class MachineTraceMetricsWrapperPass; friend class Ensemble; friend class Trace; class Ensemble; - static char ID; + // For legacy pass. + MachineTraceMetrics() = default; + + explicit MachineTraceMetrics(MachineFunction &MF, const MachineLoopInfo &LI) { + init(MF, LI); + } - MachineTraceMetrics(); + MachineTraceMetrics(MachineTraceMetrics &&) = default; - void getAnalysisUsage(AnalysisUsage&) const override; - bool runOnMachineFunction(MachineFunction&) override; - void releaseMemory() override; - void verifyAnalysis() const override; + ~MachineTraceMetrics(); + + void init(MachineFunction &Func, const MachineLoopInfo &LI); + void clear(); /// Per-basic block information that doesn't depend on the trace through the /// block. @@ -400,6 +407,12 @@ public: /// Call Ensemble::getTrace() again to update any trace handles. void invalidate(const MachineBasicBlock *MBB); + /// Handle invalidation explicitly. 
+ bool invalidate(MachineFunction &, const PreservedAnalyses &PA, + MachineFunctionAnalysisManager::Invalidator &); + + void verifyAnalysis() const; + private: // One entry per basic block, indexed by block number. SmallVector BlockInfo; @@ -412,8 +425,8 @@ private: SmallVector ProcReleaseAtCycles; // One ensemble per strategy. - Ensemble - *Ensembles[static_cast(MachineTraceStrategy::TS_NumStrategies)]; + std::unique_ptr + Ensembles[static_cast(MachineTraceStrategy::TS_NumStrategies)]; // Convert scaled resource usage to a cycle count that can be compared with // latencies. @@ -435,6 +448,38 @@ inline raw_ostream &operator<<(raw_ostream &OS, return OS; } +class MachineTraceMetricsAnalysis + : public AnalysisInfoMixin { + friend AnalysisInfoMixin; + static AnalysisKey Key; + +public: + using Result = MachineTraceMetrics; + Result run(MachineFunction &MF, MachineFunctionAnalysisManager &MFAM); +}; + +/// Verifier pass for \c MachineTraceMetrics. +struct MachineTraceMetricsVerifierPass + : PassInfoMixin { + PreservedAnalyses run(MachineFunction &MF, + MachineFunctionAnalysisManager &MFAM); + static bool isRequired() { return true; } +}; + +class MachineTraceMetricsWrapperPass : public MachineFunctionPass { +public: + static char ID; + MachineTraceMetrics MTM; + + MachineTraceMetricsWrapperPass(); + + void getAnalysisUsage(AnalysisUsage &) const override; + bool runOnMachineFunction(MachineFunction &) override; + void releaseMemory() override { MTM.clear(); } + void verifyAnalysis() const override { MTM.verifyAnalysis(); } + MachineTraceMetrics &getMTM() { return MTM; } +}; + } // end namespace llvm #endif // LLVM_CODEGEN_MACHINETRACEMETRICS_H diff --git a/llvm/include/llvm/CodeGen/Passes.h b/llvm/include/llvm/CodeGen/Passes.h index 99421bdf769ffa2b43282314d2877457a11875c5..bbbf99626098a6181c22ccf5772fbbf0990a6d3e 100644 --- a/llvm/include/llvm/CodeGen/Passes.h +++ b/llvm/include/llvm/CodeGen/Passes.h @@ -273,7 +273,7 @@ namespace llvm { /// EarlyIfConverter - This pass performs if-conversion on SSA form by /// inserting cmov instructions. - extern char &EarlyIfConverterID; + extern char &EarlyIfConverterLegacyID; /// EarlyIfPredicator - This pass performs if-conversion on SSA form by /// predicating if/else block and insert select at the join point. 
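Tying the MachineTraceMetrics and EarlyIfConverter changes above together:
under the new pass manager the metrics are an analysis result rather than a
pass, so consumers pull them out of the analysis manager. A rough sketch of a
consumer (hypothetical pass name, not part of this patch):

```cpp
#include "llvm/CodeGen/MachineTraceMetrics.h"

using namespace llvm;

// Hypothetical consumer illustrating the new analysis plumbing.
struct TraceQueryPass : PassInfoMixin<TraceQueryPass> {
  PreservedAnalyses run(MachineFunction &MF,
                        MachineFunctionAnalysisManager &MFAM) {
    MachineTraceMetrics &MTM =
        MFAM.getResult<MachineTraceMetricsAnalysis>(MF);
    // Pick a trace strategy and query per-block traces from the ensemble.
    MachineTraceMetrics::Ensemble *MinInstr =
        MTM.getEnsemble(MachineTraceStrategy::TS_MinInstrCount);
    (void)MinInstr;
    return PreservedAnalyses::all();
  }
};
```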
diff --git a/llvm/include/llvm/CodeGen/SDPatternMatch.h b/llvm/include/llvm/CodeGen/SDPatternMatch.h index 0af4f73b869c3ca2490372d6b0f6ad6dcabf3412..b3e249b7ebd5c4aec4d6937fdcbd0a6970d7744c 100644 --- a/llvm/include/llvm/CodeGen/SDPatternMatch.h +++ b/llvm/include/llvm/CodeGen/SDPatternMatch.h @@ -759,6 +759,16 @@ inline BinaryOpc_match m_Srl(const LHS &L, const RHS &R) { return BinaryOpc_match(ISD::SRL, L, R); } +template +inline BinaryOpc_match m_Rotl(const LHS &L, const RHS &R) { + return BinaryOpc_match(ISD::ROTL, L, R); +} + +template +inline BinaryOpc_match m_Rotr(const LHS &L, const RHS &R) { + return BinaryOpc_match(ISD::ROTR, L, R); +} + template inline BinaryOpc_match m_FAdd(const LHS &L, const RHS &R) { return BinaryOpc_match(ISD::FADD, L, R); @@ -822,6 +832,11 @@ inline UnaryOpc_match m_ChainedUnaryOp(unsigned Opc, return UnaryOpc_match(Opc, Op); } +template +inline UnaryOpc_match m_BSwap(const Opnd &Op) { + return UnaryOpc_match(ISD::BSWAP, Op); +} + template inline UnaryOpc_match m_BitReverse(const Opnd &Op) { return UnaryOpc_match(ISD::BITREVERSE, Op); @@ -892,10 +907,18 @@ template inline UnaryOpc_match m_FPToSI(const Opnd &Op) { return UnaryOpc_match(ISD::FP_TO_SINT, Op); } +template inline UnaryOpc_match m_Ctpop(const Opnd &Op) { + return UnaryOpc_match(ISD::CTPOP, Op); +} + template inline UnaryOpc_match m_Ctlz(const Opnd &Op) { return UnaryOpc_match(ISD::CTLZ, Op); } +template inline UnaryOpc_match m_Cttz(const Opnd &Op) { + return UnaryOpc_match(ISD::CTTZ, Op); +} + // === Constants === struct ConstantInt_match { APInt *BindVal; diff --git a/llvm/include/llvm/CodeGen/TargetFrameLowering.h b/llvm/include/llvm/CodeGen/TargetFrameLowering.h index 3df9e56db38a436bf02eaac541c5fd2ec7729236..97de0197da9b400ac0ae02067ba207abdda34220 100644 --- a/llvm/include/llvm/CodeGen/TargetFrameLowering.h +++ b/llvm/include/llvm/CodeGen/TargetFrameLowering.h @@ -277,16 +277,14 @@ public: return false; } - /// Return true if the target wants to keep the frame pointer regardless of - /// the function attribute "frame-pointer". - virtual bool keepFramePointer(const MachineFunction &MF) const { - return false; - } - /// hasFP - Return true if the specified function should have a dedicated /// frame pointer register. For most targets this is true only if the function /// has variable sized allocas or if frame pointer elimination is disabled. - virtual bool hasFP(const MachineFunction &MF) const = 0; + /// For all targets, this is false if the function has the naked attribute + /// since there is no prologue to set up the frame pointer. + bool hasFP(const MachineFunction &MF) const { + return !MF.getFunction().hasFnAttribute(Attribute::Naked) && hasFPImpl(MF); + } /// hasReservedCallFrame - Under normal circumstances, when a frame pointer is /// not required, we reserve argument space for call sites in the function @@ -483,6 +481,9 @@ public: /// targets can emit remarks based on the final frame layout. 
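The `hasFP()` refactor above inverts the customization point: targets now
override the protected `hasFPImpl()` hook, and the `naked`-attribute check is
done once in the non-virtual base-class `hasFP()`. A sketch of what a target
override might look like after this change (illustrative class and logic, not
from this patch):

```cpp
class MyTargetFrameLowering : public TargetFrameLowering {
public:
  using TargetFrameLowering::TargetFrameLowering; // reuse base ctor (sketch)

protected:
  // hasFP() now rejects naked functions before ever calling this hook.
  bool hasFPImpl(const MachineFunction &MF) const override {
    const MachineFrameInfo &MFI = MF.getFrameInfo();
    return MF.getTarget().Options.DisableFramePointerElim(MF) ||
           MFI.hasVarSizedObjects() || MFI.isFrameAddressTaken();
  }
};
```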
virtual void emitRemarks(const MachineFunction &MF, MachineOptimizationRemarkEmitter *ORE) const {}; + +protected: + virtual bool hasFPImpl(const MachineFunction &MF) const = 0; }; } // End llvm namespace diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h index 5ab31a687ec5e91c0f53f021a37418aa176b8c6e..61615cb0f7b30142a7c2d377a0ec95748b56bd12 100644 --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -5567,9 +5567,7 @@ public: /// If this function returns true, SelectionDAGBuilder emits a /// LOAD_STACK_GUARD node when it is lowering Intrinsic::stackprotector. - virtual bool useLoadStackGuardNode() const { - return false; - } + virtual bool useLoadStackGuardNode(const Module &M) const { return false; } virtual SDValue emitStackGuardXorFP(SelectionDAG &DAG, SDValue Val, const SDLoc &DL) const { diff --git a/llvm/include/llvm/ExecutionEngine/JITLink/i386.h b/llvm/include/llvm/ExecutionEngine/JITLink/i386.h index f8d24d8bf31ca0606146d10c9e590b775f01c6a3..efe8182934dd760311d1355c821b4e6dac100826 100644 --- a/llvm/include/llvm/ExecutionEngine/JITLink/i386.h +++ b/llvm/include/llvm/ExecutionEngine/JITLink/i386.h @@ -39,12 +39,8 @@ enum EdgeKind_i386 : Edge::Kind { /// Represents a data/control flow instruction using PC-relative addressing /// to a target. /// - /// The fixup expression for this kind includes an implicit offset to account - /// for the PC (unlike the Delta edges) so that a PCRel32 with a target - /// T and addend zero is a call/branch to the start (offset zero) of T. - /// /// Fixup expression: - /// Fixup <- Target - (Fixup + 4) + Addend : int32 + /// Fixup <- Target - Fixup + Addend : int32 /// /// Errors: /// - The result of the fixup expression must fit into an int32, otherwise @@ -68,12 +64,8 @@ enum EdgeKind_i386 : Edge::Kind { /// Represents a data/control flow instruction using PC-relative addressing /// to a target. /// - /// The fixup expression for this kind includes an implicit offset to account - /// for the PC (unlike the Delta edges) so that a PCRel16 with a target - /// T and addend zero is a call/branch to the start (offset zero) of T. - /// /// Fixup expression: - /// Fixup <- Target - (Fixup + 4) + Addend : int16 + /// Fixup <- Target - Fixup + Addend : int16 /// /// Errors: /// - The result of the fixup expression must fit into an int16, otherwise @@ -86,7 +78,7 @@ enum EdgeKind_i386 : Edge::Kind { /// Delta from the fixup to the target. /// /// Fixup expression: - /// Fixup <- Target - Fixup + Addend : int64 + /// Fixup <- Target - Fixup + Addend : int32 /// /// Errors: /// - The result of the fixup expression must fit into an int32, otherwise @@ -130,12 +122,8 @@ enum EdgeKind_i386 : Edge::Kind { /// Represents a PC-relative call or branch to a target. This can be used to /// identify, record, and/or patch call sites. /// - /// The fixup expression for this kind includes an implicit offset to account - /// for the PC (unlike the Delta edges) so that a Branch32PCRel with a target - /// T and addend zero is a call/branch to the start (offset zero) of T. - /// /// Fixup expression: - /// Fixup <- Target - (Fixup + 4) + Addend : int32 + /// Fixup <- Target - Fixup + Addend : int32 /// /// Errors: /// - The result of the fixup expression must fit into an int32, otherwise @@ -164,7 +152,7 @@ enum EdgeKind_i386 : Edge::Kind { /// target may be recorded to allow manipulation at runtime. 
/// /// Fixup expression: - /// Fixup <- Target - Fixup + Addend - 4 : int32 + /// Fixup <- Target - Fixup + Addend : int32 /// /// Errors: /// - The result of the fixup expression must fit into an int32, otherwise @@ -180,7 +168,7 @@ enum EdgeKind_i386 : Edge::Kind { /// is within range of the fixup location. /// /// Fixup expression: - /// Fixup <- Target - Fixup + Addend - 4: int32 + /// Fixup <- Target - Fixup + Addend : int32 /// /// Errors: /// - The result of the fixup expression must fit into an int32, otherwise @@ -215,8 +203,7 @@ inline Error applyFixup(LinkGraph &G, Block &B, const Edge &E, } case i386::PCRel32: { - int32_t Value = - E.getTarget().getAddress() - (FixupAddress + 4) + E.getAddend(); + int32_t Value = E.getTarget().getAddress() - FixupAddress + E.getAddend(); *(little32_t *)FixupPtr = Value; break; } @@ -231,8 +218,7 @@ inline Error applyFixup(LinkGraph &G, Block &B, const Edge &E, } case i386::PCRel16: { - int32_t Value = - E.getTarget().getAddress() - (FixupAddress + 4) + E.getAddend(); + int32_t Value = E.getTarget().getAddress() - FixupAddress + E.getAddend(); if (LLVM_LIKELY(isInt<16>(Value))) *(little16_t *)FixupPtr = Value; else @@ -257,8 +243,7 @@ inline Error applyFixup(LinkGraph &G, Block &B, const Edge &E, case i386::BranchPCRel32: case i386::BranchPCRel32ToPtrJumpStub: case i386::BranchPCRel32ToPtrJumpStubBypassable: { - int32_t Value = - E.getTarget().getAddress() - (FixupAddress + 4) + E.getAddend(); + int32_t Value = E.getTarget().getAddress() - FixupAddress + E.getAddend(); *(little32_t *)FixupPtr = Value; break; } diff --git a/llvm/include/llvm/ExecutionEngine/JITLink/x86_64.h b/llvm/include/llvm/ExecutionEngine/JITLink/x86_64.h index 24cf982fc3ab0fe90dda5fee88831a17aed7f199..0d7e0fdb5820b51c9c5c17364e91e6f194644343 100644 --- a/llvm/include/llvm/ExecutionEngine/JITLink/x86_64.h +++ b/llvm/include/llvm/ExecutionEngine/JITLink/x86_64.h @@ -142,6 +142,24 @@ enum EdgeKind_x86_64 : Edge::Kind { /// an out-of-range error will be returned. NegDelta32, + /// A 64-bit size relocation. + /// + /// Fixup expression: + /// Fixup <- Size + Addend : uint64 + /// + Size64, + + /// A 32-bit size relocation. + /// + /// Fixup expression: + /// Fixup <- Size + Addend : uint32 + /// + /// Errors: + /// - The result of the fixup expression must fit into an uint32, otherwise + /// an out-of-range error will be returned. + /// + Size32, + /// A 64-bit GOT delta. /// /// Delta from the global offset table to the target @@ -531,6 +549,22 @@ inline Error applyFixup(LinkGraph &G, Block &B, const Edge &E, return makeTargetOutOfRangeError(G, B, E); break; } + + case Size64: { + uint64_t Value = E.getTarget().getSize() + E.getAddend(); + *(ulittle64_t *)FixupPtr = Value; + break; + } + + case Size32: { + uint64_t Value = E.getTarget().getSize() + E.getAddend(); + if (LLVM_LIKELY(isUInt<32>(Value))) + *(ulittle32_t *)FixupPtr = Value; + else + return makeTargetOutOfRangeError(G, B, E); + break; + } + case Delta64FromGOT: { assert(GOTSymbol && "No GOT section symbol"); int64_t Value = diff --git a/llvm/include/llvm/ExecutionEngine/Orc/COFFPlatform.h b/llvm/include/llvm/ExecutionEngine/Orc/COFFPlatform.h index 6bbc9b211333d54de7d551463c43868afc24b296..f44b6b3860fc314479274b6b2a92599201abf466 100644 --- a/llvm/include/llvm/ExecutionEngine/Orc/COFFPlatform.h +++ b/llvm/include/llvm/ExecutionEngine/Orc/COFFPlatform.h @@ -40,18 +40,16 @@ public: /// Try to create a COFFPlatform instance, adding the ORC runtime to the /// given JITDylib. 
  static Expected<std::unique_ptr<COFFPlatform>>
- Create(ExecutionSession &ES, ObjectLinkingLayer &ObjLinkingLayer,
-        JITDylib &PlatformJD,
+ Create(ObjectLinkingLayer &ObjLinkingLayer, JITDylib &PlatformJD,
         std::unique_ptr<MemoryBuffer> OrcRuntimeArchiveBuffer,
         LoadDynamicLibrary LoadDynLibrary, bool StaticVCRuntime = false,
         const char *VCRuntimePath = nullptr,
         std::optional<SymbolAliasMap> RuntimeAliases = std::nullopt);

  static Expected<std::unique_ptr<COFFPlatform>>
- Create(ExecutionSession &ES, ObjectLinkingLayer &ObjLinkingLayer,
-        JITDylib &PlatformJD, const char *OrcRuntimePath,
-        LoadDynamicLibrary LoadDynLibrary, bool StaticVCRuntime = false,
-        const char *VCRuntimePath = nullptr,
+ Create(ObjectLinkingLayer &ObjLinkingLayer, JITDylib &PlatformJD,
+        const char *OrcRuntimePath, LoadDynamicLibrary LoadDynLibrary,
+        bool StaticVCRuntime = false, const char *VCRuntimePath = nullptr,
         std::optional<SymbolAliasMap> RuntimeAliases = std::nullopt);

  ExecutionSession &getExecutionSession() const { return ES; }
@@ -138,8 +136,7 @@ private:
  static bool supportedTarget(const Triple &TT);

  COFFPlatform(
-     ExecutionSession &ES, ObjectLinkingLayer &ObjLinkingLayer,
-     JITDylib &PlatformJD,
+     ObjectLinkingLayer &ObjLinkingLayer, JITDylib &PlatformJD,
      std::unique_ptr<DefinitionGenerator> OrcRuntimeGenerator,
      std::unique_ptr<MemoryBuffer> OrcRuntimeArchiveBuffer,
      std::unique_ptr<object::Archive> OrcRuntimeArchive,
diff --git a/llvm/include/llvm/ExecutionEngine/Orc/ExecutionUtils.h b/llvm/include/llvm/ExecutionEngine/Orc/ExecutionUtils.h
index 44db455f218a4758b46865cf97650b83a8305ec8..741dcc236b300e25c8f479a13fef4992f511484b 100644
--- a/llvm/include/llvm/ExecutionEngine/Orc/ExecutionUtils.h
+++ b/llvm/include/llvm/ExecutionEngine/Orc/ExecutionUtils.h
@@ -268,11 +268,21 @@ private:
///        the containing object being added to the JITDylib.
class StaticLibraryDefinitionGenerator : public DefinitionGenerator {
public:
-  // Interface builder function for objects loaded from this archive.
+  /// Interface builder function for objects loaded from this archive.
  using GetObjectFileInterface =
      unique_function<Expected<MaterializationUnit::Interface>(
          ExecutionSession &ES, MemoryBufferRef ObjBuffer)>;

+  /// Callback for visiting archive members at construction time.
+  /// Can be used to pre-load archive members.
+  using VisitMembersFunction = unique_function<Error(MemoryBufferRef)>;
+
+  /// A VisitMembersFunction that unconditionally loads all object files from
+  /// the archive.
+  /// Archive members that are not valid object files will be skipped.
+  static VisitMembersFunction loadAllObjectFileMembers(ObjectLayer &L,
+                                                       JITDylib &JD);
+
  /// Try to create a StaticLibraryDefinitionGenerator from the given path.
  ///
  /// This call will succeed if the file at the given path is a static library
@@ -280,6 +290,7 @@
  /// with the ExecutionSession's triple. Otherwise it will return an error.
  static Expected<std::unique_ptr<StaticLibraryDefinitionGenerator>>
  Load(ObjectLayer &L, const char *FileName,
+      VisitMembersFunction VisitMembers = VisitMembersFunction(),
       GetObjectFileInterface GetObjFileInterface = GetObjectFileInterface());

  /// Try to create a StaticLibrarySearchGenerator from the given memory buffer
  /// and Archive object.
  static Expected<std::unique_ptr<StaticLibraryDefinitionGenerator>>
  Create(ObjectLayer &L, std::unique_ptr<MemoryBuffer> ArchiveBuffer,
         std::unique_ptr<object::Archive> Archive,
+        VisitMembersFunction VisitMembers = VisitMembersFunction(),
         GetObjectFileInterface GetObjFileInterface = GetObjectFileInterface());

  /// Try to create a StaticLibrarySearchGenerator from the given memory buffer.
@@ -298,6 +310,7 @@
  /// This call will succeed if the buffer contains a valid archive, otherwise it
  /// will return an error.
  ///
  /// This call will succeed if the file at the given path is a static library
  /// or a MachO universal binary containing a static library that is compatible
  /// with the ExecutionSession's triple. Otherwise it will return an error.
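Usage sketch for the new hook, eagerly loading every object in an archive as
it is attached (hypothetical layer, dylib, and archive path):

```cpp
// ObjLayer and JD are an existing ObjectLayer and JITDylib (assumed names).
auto Gen = StaticLibraryDefinitionGenerator::Load(
    ObjLayer, "/tmp/libFoo.a",
    StaticLibraryDefinitionGenerator::loadAllObjectFileMembers(ObjLayer, JD));
if (!Gen)
  return Gen.takeError();
JD.addGenerator(std::move(*Gen));
```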
static Expected> Create(ObjectLayer &L, std::unique_ptr ArchiveBuffer, + VisitMembersFunction VisitMembers = VisitMembersFunction(), GetObjectFileInterface GetObjFileInterface = GetObjectFileInterface()); /// Returns a list of filenames of dynamic libraries that this archive has diff --git a/llvm/include/llvm/ExecutionEngine/Orc/ExecutorProcessControl.h b/llvm/include/llvm/ExecutionEngine/Orc/ExecutorProcessControl.h index 6468f2dfc11ad0b58b3da6592aab5172269d5123..06bc85dc40a8d5c141c9fceab5da3621fabd621f 100644 --- a/llvm/include/llvm/ExecutionEngine/Orc/ExecutorProcessControl.h +++ b/llvm/include/llvm/ExecutionEngine/Orc/ExecutorProcessControl.h @@ -235,8 +235,6 @@ public: } /// Look up and SPS-deserialize a bootstrap map value. - /// - /// template Error getBootstrapMapValue(StringRef Key, std::optional &Val) const { Val = std::nullopt; diff --git a/llvm/include/llvm/ExecutionEngine/Orc/MachO.h b/llvm/include/llvm/ExecutionEngine/Orc/MachO.h index 0ae7fc80acada535a94ed46b6fccacf725581398..a9d34a82d53d2646668499d4faf2fba0df901e91 100644 --- a/llvm/include/llvm/ExecutionEngine/Orc/MachO.h +++ b/llvm/include/llvm/ExecutionEngine/Orc/MachO.h @@ -28,6 +28,9 @@ class MachOUniversalBinary; namespace orc { +class JITDylib; +class ObjectLayer; + /// Check that the given buffer contains a MachO object file compatible with the /// given triple. /// ObjIsSlice should be set to true if Obj is a slice of a universal binary @@ -72,6 +75,20 @@ getMachOSliceRangeForTriple(object::MachOUniversalBinary &UB, const Triple &TT); Expected> getMachOSliceRangeForTriple(MemoryBufferRef UBBuf, const Triple &TT); +/// For use with StaticLibraryDefinitionGenerators. +class ForceLoadMachOArchiveMembers { +public: + ForceLoadMachOArchiveMembers(ObjectLayer &L, JITDylib &JD, bool ObjCOnly) + : L(L), JD(JD), ObjCOnly(ObjCOnly) {} + + Error operator()(MemoryBufferRef MemberBuf); + +private: + ObjectLayer &L; + JITDylib &JD; + bool ObjCOnly; +}; + } // namespace orc } // namespace llvm diff --git a/llvm/include/llvm/Frontend/OpenMP/OMP.td b/llvm/include/llvm/Frontend/OpenMP/OMP.td index 5449c2f4e98bd836e2be614146a9160eb957f72a..e5ffbcccb85d5664849614a99e63b3390aaa068a 100644 --- a/llvm/include/llvm/Frontend/OpenMP/OMP.td +++ b/llvm/include/llvm/Frontend/OpenMP/OMP.td @@ -1187,7 +1187,7 @@ def OMP_Workshare : Directive<"workshare"> { let category = CA_Executable; } def OMP_EndWorkshare : Directive<"end workshare"> { - let allowedClauses = [ + let allowedOnceClauses = [ VersionedClause, ]; let leafConstructs = OMP_Workshare.leafConstructs; diff --git a/llvm/include/llvm/IR/Argument.h b/llvm/include/llvm/IR/Argument.h index 0ffcb05519d44fc243f076fbf298dd9d4077e8a2..5be58d7eca0606bcfaa2d10c60aa62297c924ffa 100644 --- a/llvm/include/llvm/IR/Argument.h +++ b/llvm/include/llvm/IR/Argument.h @@ -182,6 +182,8 @@ public: Attribute getAttribute(Attribute::AttrKind Kind) const; + AttributeSet getAttributes() const; + /// Method for support type inquiry through isa, cast, and dyn_cast. static bool classof(const Value *V) { return V->getValueID() == ArgumentVal; diff --git a/llvm/include/llvm/IR/Attributes.h b/llvm/include/llvm/IR/Attributes.h index 57db52e4879b5bd7304b6a36335f3ed948c245df..2755ced404dddb5fb63c3ed697464395deb607e2 100644 --- a/llvm/include/llvm/IR/Attributes.h +++ b/llvm/include/llvm/IR/Attributes.h @@ -947,6 +947,9 @@ public: /// arg. uint64_t getParamDereferenceableOrNullBytes(unsigned ArgNo) const; + /// Get range (or std::nullopt if unknown) of an arg. 
+ std::optional getParamRange(unsigned ArgNo) const; + /// Get the disallowed floating-point classes of the return value. FPClassTest getRetNoFPClass() const; @@ -1123,6 +1126,10 @@ public: /// invalid if the Kind is not present in the builder. Attribute getAttribute(StringRef Kind) const; + /// Retrieve the range if the attribute exists (std::nullopt is returned + /// otherwise). + std::optional getRange() const; + /// Return raw (possibly packed/encoded) value of integer attribute or /// std::nullopt if not set. std::optional getRawIntAttr(Attribute::AttrKind Kind) const; @@ -1288,11 +1295,17 @@ enum AttributeSafetyKind : uint8_t { /// follows the same type rules as FPMathOperator. bool isNoFPClassCompatibleType(Type *Ty); -/// Which attributes cannot be applied to a type. The argument \p ASK indicates, -/// if only attributes that are known to be safely droppable are contained in -/// the mask; only attributes that might be unsafe to drop (e.g., ABI-related -/// attributes) are in the mask; or both. -AttributeMask typeIncompatible(Type *Ty, AttributeSafetyKind ASK = ASK_ALL); +/// Which attributes cannot be applied to a type. The argument \p AS +/// is used as a hint for the attributes whose compatibility is being +/// checked against \p Ty. This does not mean the return will be a +/// subset of \p AS, just that attributes that have specific dynamic +/// type compatibilities (i.e `range`) will be checked against what is +/// contained in \p AS. The argument \p ASK indicates, if only +/// attributes that are known to be safely droppable are contained in +/// the mask; only attributes that might be unsafe to drop (e.g., +/// ABI-related attributes) are in the mask; or both. +AttributeMask typeIncompatible(Type *Ty, AttributeSet AS, + AttributeSafetyKind ASK = ASK_ALL); /// Get param/return attributes which imply immediate undefined behavior if an /// invalid value is passed. For example, this includes noundef (where undef diff --git a/llvm/include/llvm/IR/ConstrainedOps.def b/llvm/include/llvm/IR/ConstrainedOps.def index 56304c377b8393d8bc54ea5966bcb6d3b76a4925..30a82bf633d5757c756c4725f57a187151e6bb94 100644 --- a/llvm/include/llvm/IR/ConstrainedOps.def +++ b/llvm/include/llvm/IR/ConstrainedOps.def @@ -72,6 +72,7 @@ CMP_INSTRUCTION(FCmp, 2, 0, experimental_constrained_fcmps, FSETCCS DAG_FUNCTION(acos, 1, 1, experimental_constrained_acos, FACOS) DAG_FUNCTION(asin, 1, 1, experimental_constrained_asin, FASIN) DAG_FUNCTION(atan, 1, 1, experimental_constrained_atan, FATAN) +DAG_FUNCTION(atan2, 2, 1, experimental_constrained_atan2, FATAN2) DAG_FUNCTION(ceil, 1, 0, experimental_constrained_ceil, FCEIL) DAG_FUNCTION(cos, 1, 1, experimental_constrained_cos, FCOS) DAG_FUNCTION(cosh, 1, 1, experimental_constrained_cosh, FCOSH) diff --git a/llvm/include/llvm/IR/InstrTypes.h b/llvm/include/llvm/IR/InstrTypes.h index 86d88da3d9460ec1bd61b0103de6fc81886f2079..99f72792ce4024ceabdd88d554a3f4bb0441d629 100644 --- a/llvm/include/llvm/IR/InstrTypes.h +++ b/llvm/include/llvm/IR/InstrTypes.h @@ -1453,14 +1453,22 @@ public: /// looking through to the attributes on the called function when necessary). ///@{ - /// Return the parameter attributes for this call. - /// + /// Return the attributes for this call. AttributeList getAttributes() const { return Attrs; } - /// Set the parameter attributes for this call. - /// + /// Set the attributes for this call. void setAttributes(AttributeList A) { Attrs = A; } + /// Return the return attributes for this call. 
+ AttributeSet getRetAttributes() const { + return getAttributes().getRetAttrs(); + } + + /// Return the param attributes for this call. + AttributeSet getParamAttributes(unsigned ArgNo) const { + return getAttributes().getParamAttrs(ArgNo); + } + /// Try to intersect the attributes from 'this' CallBase and the /// 'Other' CallBase. Sets the intersected attributes to 'this' and /// return true if successful. Doesn't modify 'this' and returns diff --git a/llvm/include/llvm/IR/Instructions.h b/llvm/include/llvm/IR/Instructions.h index 63d93ba5f5bc077c2b120c948dedc6f8306fadb2..aca0025ce3ca78fe944be0c9f3b1bd5ba48999c6 100644 --- a/llvm/include/llvm/IR/Instructions.h +++ b/llvm/include/llvm/IR/Instructions.h @@ -4961,6 +4961,16 @@ inline Align getLoadStoreAlignment(const Value *I) { return cast(I)->getAlign(); } +/// A helper function that set the alignment of load or store instruction. +inline void setLoadStoreAlignment(Value *I, Align NewAlign) { + assert((isa(I) || isa(I)) && + "Expected Load or Store instruction"); + if (auto *LI = dyn_cast(I)) + LI->setAlignment(NewAlign); + else + cast(I)->setAlignment(NewAlign); +} + /// A helper function that returns the address space of the pointer operand of /// load or store instruction. inline unsigned getLoadStoreAddressSpace(const Value *I) { diff --git a/llvm/include/llvm/IR/Intrinsics.h b/llvm/include/llvm/IR/Intrinsics.h index 49f4fe4c5c3d7f1a3407c3ff0a32b69a5ec283c0..e893295e3272b908d6f38330750ce164b421e74f 100644 --- a/llvm/include/llvm/IR/Intrinsics.h +++ b/llvm/include/llvm/IR/Intrinsics.h @@ -102,6 +102,16 @@ namespace Intrinsic { inline Function *getDeclaration(Module *M, ID id, ArrayRef Tys = {}) { return getOrInsertDeclaration(M, id, Tys); } + + /// Look up the Function declaration of the intrinsic \p id in the Module + /// \p M and return it if it exists. Otherwise, return nullptr. This version + /// supports non-overloaded intrinsics. + Function *getDeclarationIfExists(const Module *M, ID id); + + /// This version supports overloaded intrinsics. + Function *getDeclarationIfExists(Module *M, ID id, ArrayRef Tys, + FunctionType *FT = nullptr); + /// Looks up Name in NameTable via binary search. NameTable must be sorted /// and all entries must start with "llvm.". 
If NameTable contains an exact /// match for Name or a prefix of Name followed by a dot, its index in diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td index 3821a1869741decc5fce67082d9955d91a72af07..c70271cc496148168abf66f1e781e72c3c70b422 100644 --- a/llvm/include/llvm/IR/Intrinsics.td +++ b/llvm/include/llvm/IR/Intrinsics.td @@ -1235,6 +1235,11 @@ let IntrProperties = [IntrInaccessibleMemOnly, IntrWillReturn, IntrStrictFP] in [ LLVMMatchType<0>, llvm_metadata_ty, llvm_metadata_ty ]>; + def int_experimental_constrained_atan2 : DefaultAttrsIntrinsic<[ llvm_anyfloat_ty ], + [ LLVMMatchType<0>, + LLVMMatchType<0>, + llvm_metadata_ty, + llvm_metadata_ty ]>; def int_experimental_constrained_sin : DefaultAttrsIntrinsic<[ llvm_anyfloat_ty ], [ LLVMMatchType<0>, llvm_metadata_ty, diff --git a/llvm/include/llvm/IR/IntrinsicsDirectX.td b/llvm/include/llvm/IR/IntrinsicsDirectX.td index 45aea1ccdb6d4c46377da060c29af9649da28333..27a437a83be6dd6de8270e8eec9a04047ae3ffc3 100644 --- a/llvm/include/llvm/IR/IntrinsicsDirectX.td +++ b/llvm/include/llvm/IR/IntrinsicsDirectX.td @@ -86,6 +86,7 @@ def int_dx_normalize : DefaultAttrsIntrinsic<[LLVMMatchType<0>], [llvm_anyfloat_ def int_dx_rsqrt : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>; def int_dx_wave_getlaneindex : DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrConvergent, IntrNoMem]>; def int_dx_wave_is_first_lane : DefaultAttrsIntrinsic<[llvm_i1_ty], [], [IntrConvergent]>; +def int_dx_wave_readlane : DefaultAttrsIntrinsic<[llvm_any_ty], [LLVMMatchType<0>, llvm_i32_ty], [IntrConvergent, IntrNoMem]>; def int_dx_sign : DefaultAttrsIntrinsic<[LLVMScalarOrSameVectorWidth<0, llvm_i32_ty>], [llvm_any_ty], [IntrNoMem]>; def int_dx_step : DefaultAttrsIntrinsic<[LLVMMatchType<0>], [llvm_anyfloat_ty, LLVMMatchType<0>], [IntrNoMem]>; def int_dx_radians : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>; diff --git a/llvm/include/llvm/IR/IntrinsicsSPIRV.td b/llvm/include/llvm/IR/IntrinsicsSPIRV.td index 3d61456589ee0d1845022e479d68ddce875f88dc..6df2eb156a077494686ab19212358744340a38a9 100644 --- a/llvm/include/llvm/IR/IntrinsicsSPIRV.td +++ b/llvm/include/llvm/IR/IntrinsicsSPIRV.td @@ -84,6 +84,7 @@ let TargetPrefix = "spv" in { [llvm_anyint_ty, LLVMScalarOrSameVectorWidth<0, LLVMVectorElementType<0>>], [IntrNoMem, Commutative] >; def int_spv_wave_is_first_lane : DefaultAttrsIntrinsic<[llvm_i1_ty], [], [IntrConvergent]>; + def int_spv_wave_readlane : DefaultAttrsIntrinsic<[llvm_any_ty], [LLVMMatchType<0>, llvm_i32_ty], [IntrConvergent, IntrNoMem]>; def int_spv_sign : DefaultAttrsIntrinsic<[LLVMScalarOrSameVectorWidth<0, llvm_i32_ty>], [llvm_any_ty], [IntrNoMem]>; def int_spv_radians : DefaultAttrsIntrinsic<[LLVMMatchType<0>], [llvm_anyfloat_ty], [IntrNoMem]>; diff --git a/llvm/include/llvm/IR/PatternMatch.h b/llvm/include/llvm/IR/PatternMatch.h index 8c6b7895470b8ddfde2ea91ef3fb5721e96297d7..c3349c9772c7adb040cd80b19ff825604be2e285 100644 --- a/llvm/include/llvm/IR/PatternMatch.h +++ b/llvm/include/llvm/IR/PatternMatch.h @@ -2387,6 +2387,32 @@ m_UnordFMin(const LHS &L, const RHS &R) { return MaxMin_match(L, R); } +/// Match an 'ordered' or 'unordered' floating point maximum function. +/// Floating point has one special value 'NaN'. Therefore, there is no total +/// order. However, if we can ignore the 'NaN' value (for example, because of a +/// 'no-nans-float-math' flag) a combination of a fcmp and select has 'maximum' +/// semantics. 
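Concretely, the combined matcher defined next accepts either select/fcmp
spelling of a maximum. A small sketch of a caller (hypothetical helper name):

```cpp
// Recognizes either max idiom:
//   %c = fcmp ogt float %a, %b ; %m = select i1 %c, float %a, float %b
//   %c = fcmp ult float %a, %b ; %m = select i1 %c, float %b, float %a
static bool matchSelectFMax(llvm::Value *V, llvm::Value *&A, llvm::Value *&B) {
  using namespace llvm::PatternMatch;
  return match(V, m_OrdOrUnordFMax(m_Value(A), m_Value(B)));
}
```

The first form has "ordered" semantics (NaN falls through to %b), the second
"unordered" semantics; both compute a maximum once NaNs can be ruled out.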
+template <typename LHS, typename RHS>
+inline match_combine_or<MaxMin_match<FCmpInst, LHS, RHS, ofmax_pred_ty>,
+                        MaxMin_match<FCmpInst, LHS, RHS, ufmax_pred_ty>>
+m_OrdOrUnordFMax(const LHS &L, const RHS &R) {
+  return m_CombineOr(MaxMin_match<FCmpInst, LHS, RHS, ofmax_pred_ty>(L, R),
+                     MaxMin_match<FCmpInst, LHS, RHS, ufmax_pred_ty>(L, R));
+}
+
+/// Match an 'ordered' or 'unordered' floating point minimum function.
+/// Floating point has one special value 'NaN'. Therefore, there is no total
+/// order. However, if we can ignore the 'NaN' value (for example, because of a
+/// 'no-nans-float-math' flag) a combination of a fcmp and select has 'minimum'
+/// semantics.
+template <typename LHS, typename RHS>
+inline match_combine_or<MaxMin_match<FCmpInst, LHS, RHS, ofmin_pred_ty>,
+                        MaxMin_match<FCmpInst, LHS, RHS, ufmin_pred_ty>>
+m_OrdOrUnordFMin(const LHS &L, const RHS &R) {
+  return m_CombineOr(MaxMin_match<FCmpInst, LHS, RHS, ofmin_pred_ty>(L, R),
+                     MaxMin_match<FCmpInst, LHS, RHS, ufmin_pred_ty>(L, R));
+}
+
 /// Matches a 'Not' as 'xor V, -1' or 'xor -1, V'.
 /// NOTE: we first match the 'Not' (by matching '-1'),
 /// and only then match the inner matcher!
diff --git a/llvm/include/llvm/IR/RuntimeLibcalls.def b/llvm/include/llvm/IR/RuntimeLibcalls.def
index b775a067f090c4792b32717ab37737738631cf67..94f7dcf21a3812de039dacf25771764cab567adc 100644
--- a/llvm/include/llvm/IR/RuntimeLibcalls.def
+++ b/llvm/include/llvm/IR/RuntimeLibcalls.def
@@ -232,6 +232,11 @@ HANDLE_LIBCALL(ATAN_F64, "atan")
 HANDLE_LIBCALL(ATAN_F80, "atanl")
 HANDLE_LIBCALL(ATAN_F128,"atanl")
 HANDLE_LIBCALL(ATAN_PPCF128, "atanl")
+HANDLE_LIBCALL(ATAN2_F32, "atan2f")
+HANDLE_LIBCALL(ATAN2_F64, "atan2")
+HANDLE_LIBCALL(ATAN2_F80, "atan2l")
+HANDLE_LIBCALL(ATAN2_F128,"atan2l")
+HANDLE_LIBCALL(ATAN2_PPCF128, "atan2l")
 HANDLE_LIBCALL(SINCOS_F32, nullptr)
 HANDLE_LIBCALL(SINCOS_F64, nullptr)
 HANDLE_LIBCALL(SINCOS_F80, nullptr)
diff --git a/llvm/include/llvm/InitializePasses.h b/llvm/include/llvm/InitializePasses.h
index a8d7d3d3a84216971133376099a568f767711acd..7c1ee70af42949fa696cf97ea11fee7a476b2990 100644
--- a/llvm/include/llvm/InitializePasses.h
+++ b/llvm/include/llvm/InitializePasses.h
@@ -98,7 +98,7 @@ void initializeDominatorTreeWrapperPassPass(PassRegistry &);
 void initializeDwarfEHPrepareLegacyPassPass(PassRegistry &);
 void initializeEarlyCSELegacyPassPass(PassRegistry &);
 void initializeEarlyCSEMemSSALegacyPassPass(PassRegistry &);
-void initializeEarlyIfConverterPass(PassRegistry &);
+void initializeEarlyIfConverterLegacyPass(PassRegistry &);
 void initializeEarlyIfPredicatorPass(PassRegistry &);
 void initializeEarlyMachineLICMPass(PassRegistry &);
 void initializeEarlyTailDuplicatePass(PassRegistry &);
@@ -209,7 +209,7 @@ void initializeMachineRegionInfoPassPass(PassRegistry &);
 void initializeMachineSanitizerBinaryMetadataPass(PassRegistry &);
 void initializeMachineSchedulerPass(PassRegistry &);
 void initializeMachineSinkingPass(PassRegistry &);
-void initializeMachineTraceMetricsPass(PassRegistry &);
+void initializeMachineTraceMetricsWrapperPassPass(PassRegistry &);
 void initializeMachineUniformityInfoPrinterPassPass(PassRegistry &);
 void initializeMachineUniformityAnalysisPassPass(PassRegistry &);
 void initializeMachineVerifierLegacyPassPass(PassRegistry &);
diff --git a/llvm/include/llvm/LTO/legacy/LTOModule.h b/llvm/include/llvm/LTO/legacy/LTOModule.h
index 1b2de3b333855c8a5d1c5440886cea66daf73817..e861a56bcbac1dfb222b8b23b30c802fec21f80a 100644
--- a/llvm/include/llvm/LTO/legacy/LTOModule.h
+++ b/llvm/include/llvm/LTO/legacy/LTOModule.h
@@ -195,7 +195,7 @@ private:
   /// Add a function symbol as defined to the list.
   void addDefinedFunctionSymbol(ModuleSymbolTable::Symbol Sym);
-  void addDefinedFunctionSymbol(StringRef Name, const Function *F);
+  void addDefinedFunctionSymbol(StringRef Name, const GlobalValue *F);

   /// Add a global symbol from module-level ASM to the defined list.
   void addAsmGlobalSymbol(StringRef, lto_symbol_attributes scope);
diff --git a/llvm/include/llvm/Passes/CodeGenPassBuilder.h b/llvm/include/llvm/Passes/CodeGenPassBuilder.h
index 0d45df08cb0ca7b9d4faf093b419b6e4b002351c..9ef6e39dbb1cdd3262731bf704c7f2d4b736d665 100644
--- a/llvm/include/llvm/Passes/CodeGenPassBuilder.h
+++ b/llvm/include/llvm/Passes/CodeGenPassBuilder.h
@@ -27,6 +27,7 @@
 #include "llvm/CodeGen/CodeGenPrepare.h"
 #include "llvm/CodeGen/DeadMachineInstructionElim.h"
 #include "llvm/CodeGen/DwarfEHPrepare.h"
+#include "llvm/CodeGen/EarlyIfConversion.h"
 #include "llvm/CodeGen/ExpandLargeDivRem.h"
 #include "llvm/CodeGen/ExpandLargeFpConvert.h"
 #include "llvm/CodeGen/ExpandMemCmp.h"
diff --git a/llvm/include/llvm/Passes/MachinePassRegistry.def b/llvm/include/llvm/Passes/MachinePassRegistry.def
index 2aa5f4fc176aba56ec3df375944fa753b5a3e320..4e44d0312ede47341f031969c1e91a865ca2c7b7 100644
--- a/llvm/include/llvm/Passes/MachinePassRegistry.def
+++ b/llvm/include/llvm/Passes/MachinePassRegistry.def
@@ -106,6 +106,7 @@ MACHINE_FUNCTION_ANALYSIS("machine-opt-remark-emitter",
                           MachineOptimizationRemarkEmitterAnalysis())
 MACHINE_FUNCTION_ANALYSIS("machine-post-dom-tree",
                           MachinePostDominatorTreeAnalysis())
+MACHINE_FUNCTION_ANALYSIS("machine-trace-metrics", MachineTraceMetricsAnalysis())
 MACHINE_FUNCTION_ANALYSIS("pass-instrumentation", PassInstrumentationAnalysis(PIC))
 MACHINE_FUNCTION_ANALYSIS("slot-indexes", SlotIndexesAnalysis())
 // MACHINE_FUNCTION_ANALYSIS("live-stacks", LiveStacksPass())
@@ -119,8 +120,6 @@ MACHINE_FUNCTION_ANALYSIS("slot-indexes", SlotIndexesAnalysis())
 //                           MachinePostDominatorTreeAnalysis())
 // MACHINE_FUNCTION_ANALYSIS("machine-region-info",
 //                           MachineRegionInfoPassAnalysis())
-// MACHINE_FUNCTION_ANALYSIS("machine-trace-metrics",
-//                           MachineTraceMetricsAnalysis())
 MACHINE_FUNCTION_ANALYSIS("reaching-def", // ReachingDefAnalysisAnalysis())
 MACHINE_FUNCTION_ANALYSIS("live-reg-matrix", // LiveRegMatrixAnalysis())
 MACHINE_FUNCTION_ANALYSIS("gc-analysis", // GCMachineCodeAnalysisPass())
@@ -130,6 +129,7 @@
 #define MACHINE_FUNCTION_PASS(NAME, CREATE_PASS)
 #endif
 MACHINE_FUNCTION_PASS("dead-mi-elimination", DeadMachineInstructionElimPass())
+MACHINE_FUNCTION_PASS("early-ifcvt", EarlyIfConverterPass())
 MACHINE_FUNCTION_PASS("early-machinelicm", EarlyMachineLICMPass())
 MACHINE_FUNCTION_PASS("finalize-isel", FinalizeISelPass())
 MACHINE_FUNCTION_PASS("localstackalloc", LocalStackSlotAllocationPass())
@@ -156,6 +156,7 @@ MACHINE_FUNCTION_PASS("stack-coloring", StackColoringPass())
 MACHINE_FUNCTION_PASS("trigger-verifier-error", TriggerVerifierErrorPass())
 MACHINE_FUNCTION_PASS("two-address-instruction", TwoAddressInstructionPass())
 MACHINE_FUNCTION_PASS("verify", MachineVerifierPass())
+MACHINE_FUNCTION_PASS("verify<machine-trace-metrics>", MachineTraceMetricsVerifierPass())
 #undef MACHINE_FUNCTION_PASS

 #ifndef MACHINE_FUNCTION_PASS_WITH_PARAMS
@@ -205,7 +206,6 @@ DUMMY_MACHINE_FUNCTION_PASS("cfi-fixup", CFIFixupPass)
 DUMMY_MACHINE_FUNCTION_PASS("cfi-instr-inserter", CFIInstrInserterPass)
 DUMMY_MACHINE_FUNCTION_PASS("detect-dead-lanes", DetectDeadLanesPass)
 DUMMY_MACHINE_FUNCTION_PASS("dot-machine-cfg", MachineCFGPrinter)
-DUMMY_MACHINE_FUNCTION_PASS("early-ifcvt",
EarlyIfConverterPass) DUMMY_MACHINE_FUNCTION_PASS("early-tailduplication", EarlyTailDuplicatePass) DUMMY_MACHINE_FUNCTION_PASS("fentry-insert", FEntryInserterPass) DUMMY_MACHINE_FUNCTION_PASS("fixup-statepoint-caller-saved", FixupStatepointCallerSavedPass) diff --git a/llvm/include/llvm/ProfileData/InstrProfData.inc b/llvm/include/llvm/ProfileData/InstrProfData.inc index b9df3266fbcf8fa188ec5748aefaa768f9803d60..c66b0465a0b54854c28dd92729bb812c49278135 100644 --- a/llvm/include/llvm/ProfileData/InstrProfData.inc +++ b/llvm/include/llvm/ProfileData/InstrProfData.inc @@ -303,6 +303,18 @@ COVMAP_HEADER(uint32_t, Int32Ty, Version, \ #undef COVMAP_HEADER /* COVMAP_HEADER end. */ +/* COVINIT_FUNC start */ +#ifndef COVINIT_FUNC +#define COVINIT_FUNC(Type, LLVMType, Name, Initializer) +#else +#define INSTR_PROF_DATA_DEFINED +#endif +COVINIT_FUNC(IntPtrT, llvm::PointerType::getUnqual(Ctx), WriteoutFunction, \ + WriteoutF) +COVINIT_FUNC(IntPtrT, llvm::PointerType::getUnqual(Ctx), ResetFunction, \ + ResetF) +#undef COVINIT_FUNC +/* COVINIT_FUNC end */ #ifdef INSTR_PROF_SECT_ENTRY #define INSTR_PROF_DATA_DEFINED @@ -345,6 +357,9 @@ INSTR_PROF_SECT_ENTRY(IPSK_covdata, \ INSTR_PROF_SECT_ENTRY(IPSK_covname, \ INSTR_PROF_QUOTE(INSTR_PROF_COVNAME_COMMON), \ INSTR_PROF_COVNAME_COFF, "__LLVM_COV,") +INSTR_PROF_SECT_ENTRY(IPSK_covinit, \ + INSTR_PROF_QUOTE(INSTR_PROF_COVINIT_COMMON), \ + INSTR_PROF_COVINIT_COFF, "__LLVM_COV,") #undef INSTR_PROF_SECT_ENTRY #endif @@ -761,6 +776,8 @@ serializeValueProfDataFrom(ValueProfRecordClosure *Closure, #define INSTR_PROF_COVDATA_COMMON __llvm_covdata #define INSTR_PROF_COVNAME_COMMON __llvm_covnames #define INSTR_PROF_ORDERFILE_COMMON __llvm_orderfile +#define INSTR_PROF_COVINIT_COMMON __llvm_covinit + /* Windows section names. Because these section names contain dollar characters, * they must be quoted. */ @@ -781,6 +798,10 @@ serializeValueProfDataFrom(ValueProfRecordClosure *Closure, #define INSTR_PROF_COVNAME_COFF ".lcovn" #define INSTR_PROF_ORDERFILE_COFF ".lorderfile$M" +// FIXME: Placeholder for Windows. Windows currently does not initialize +// the GCOV functions in the runtime. +#define INSTR_PROF_COVINIT_COFF ".lcovd$M" + #ifdef _WIN32 /* Runtime section names and name strings. */ #define INSTR_PROF_DATA_SECT_NAME INSTR_PROF_DATA_COFF @@ -800,6 +821,7 @@ serializeValueProfDataFrom(ValueProfRecordClosure *Closure, #define INSTR_PROF_COVDATA_SECT_NAME INSTR_PROF_COVDATA_COFF #define INSTR_PROF_COVNAME_SECT_NAME INSTR_PROF_COVNAME_COFF #define INSTR_PROF_ORDERFILE_SECT_NAME INSTR_PROF_ORDERFILE_COFF +#define INSTR_PROF_COVINIT_SECT_NAME INSTR_PROF_COVINIT_COFF #else /* Runtime section names and name strings. */ #define INSTR_PROF_DATA_SECT_NAME INSTR_PROF_QUOTE(INSTR_PROF_DATA_COMMON) @@ -821,6 +843,7 @@ serializeValueProfDataFrom(ValueProfRecordClosure *Closure, /* Order file instrumentation. */ #define INSTR_PROF_ORDERFILE_SECT_NAME \ INSTR_PROF_QUOTE(INSTR_PROF_ORDERFILE_COMMON) +#define INSTR_PROF_COVINIT_SECT_NAME INSTR_PROF_QUOTE(INSTR_PROF_COVINIT_COMMON) #endif #define INSTR_PROF_ORDERFILE_BUFFER_NAME _llvm_order_file_buffer diff --git a/llvm/include/llvm/SandboxIR/Pass.h b/llvm/include/llvm/SandboxIR/Pass.h index 211f10f5d57c5c8ef2551f38da37f6b9fcece566..5ed9d7442ee70cde30b825a65851c4da1a7304c4 100644 --- a/llvm/include/llvm/SandboxIR/Pass.h +++ b/llvm/include/llvm/SandboxIR/Pass.h @@ -43,7 +43,7 @@ public: LLVM_DUMP_METHOD virtual void dump() const; #endif /// Similar to print() but adds a newline. Used for testing. 
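The COVINIT_FUNC rows added above follow the usual InstrProfData.inc x-macro protocol: a consumer defines the macro, includes the file, and gets one expansion per row. A hedged sketch of such a consumer (the struct name and typedef are illustrative, not from the patch):

    #include <cstdint>
    using IntPtrT = intptr_t; // stands in for the Type column used above

    // Expands to: IntPtrT WriteoutFunction; IntPtrT ResetFunction;
    struct CovInitData {
    #define COVINIT_FUNC(Type, LLVMType, Name, Initializer) Type Name;
    #include "llvm/ProfileData/InstrProfData.inc"
    };

The unrelated sections of the .inc file expand to nothing because each one defines a no-op default when its macro is left undefined.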
-  void printPipeline(raw_ostream &OS) const { OS << Name << "\n"; }
+  virtual void printPipeline(raw_ostream &OS) const { OS << Name << "\n"; }
 };
 
 /// A pass that runs on a sandbox::Function.
diff --git a/llvm/include/llvm/SandboxIR/PassManager.h b/llvm/include/llvm/SandboxIR/PassManager.h
index 247c43615f5766f8045a25e34ee3dc9e29384141..e8221996bc8f0496f50b63b8bdc8aa5996f2c54c 100644
--- a/llvm/include/llvm/SandboxIR/PassManager.h
+++ b/llvm/include/llvm/SandboxIR/PassManager.h
@@ -32,11 +32,20 @@ class Value;
 
 /// Base class.
 template <typename ParentPass, typename ContainedPass>
 class PassManager : public ParentPass {
+public:
+  // CreatePassFunc(StringRef PassName, StringRef PassArgs).
+  using CreatePassFunc =
+      std::function<std::unique_ptr<ContainedPass>(StringRef, StringRef)>;
+
 protected:
   /// The list of passes that this pass manager will run.
   SmallVector<std::unique_ptr<ContainedPass>> Passes;
 
   PassManager(StringRef Name) : ParentPass(Name) {}
+  PassManager(StringRef Name, StringRef Pipeline, CreatePassFunc CreatePass)
+      : ParentPass(Name) {
+    setPassPipeline(Pipeline, CreatePass);
+  }
   PassManager(const PassManager &) = delete;
   PassManager(PassManager &&) = default;
   virtual ~PassManager() = default;
@@ -49,41 +58,125 @@ public:
     Passes.push_back(std::move(Pass));
   }
 
-  using CreatePassFunc =
-      std::function<std::unique_ptr<ContainedPass>(StringRef)>;
-
   /// Parses \p Pipeline as a comma-separated sequence of pass names and sets
   /// the pass pipeline, using \p CreatePass to instantiate passes by name.
   ///
-  /// After calling this function, the PassManager contains only the specified
-  /// pipeline, any previously added passes are cleared.
+  /// Passes can have arguments, for example:
+  ///   "pass1<arg1,arg2>,pass2,pass3<arg3>"
+  ///
+  /// The arguments between angle brackets are treated as a mostly opaque string
+  /// and each pass is responsible for parsing its arguments. The exception to
+  /// this are nested angle brackets, which must match pair-wise to allow
+  /// arguments to contain nested pipelines, like:
+  ///
+  ///   "pass1<subpass1,subpass2<arg1,arg2>,subpass3>"
+  ///
+  /// An empty args string is treated the same as no args, so "pass" and
+  /// "pass<>" are equivalent.
   void setPassPipeline(StringRef Pipeline, CreatePassFunc CreatePass) {
     static constexpr const char EndToken = '\0';
+    static constexpr const char BeginArgsToken = '<';
+    static constexpr const char EndArgsToken = '>';
     static constexpr const char PassDelimToken = ',';
 
     assert(Passes.empty() &&
            "setPassPipeline called on a non-empty sandboxir::PassManager");
+
+    // Accept an empty pipeline as a special case. This can be useful, for
+    // example, to test conversion to SandboxIR without running any passes on
+    // it.
+    if (Pipeline.empty())
+      return;
+
     // Add EndToken to the end to ease parsing.
     std::string PipelineStr = std::string(Pipeline) + EndToken;
-    int FlagBeginIdx = 0;
-
-    for (auto [Idx, C] : enumerate(PipelineStr)) {
-      // Keep moving Idx until we find the end of the pass name.
-      bool FoundDelim = C == EndToken || C == PassDelimToken;
-      if (!FoundDelim)
-        continue;
-      unsigned Sz = Idx - FlagBeginIdx;
-      std::string PassName(&PipelineStr[FlagBeginIdx], Sz);
-      FlagBeginIdx = Idx + 1;
+    Pipeline = StringRef(PipelineStr);
 
+    auto AddPass = [this, CreatePass](StringRef PassName, StringRef PassArgs) {
+      if (PassName.empty()) {
+        errs() << "Found empty pass name.\n";
+        exit(1);
+      }
       // Get the pass that corresponds to PassName and add it to the pass
       // manager.
-      auto Pass = CreatePass(PassName);
+      auto Pass = CreatePass(PassName, PassArgs);
       if (Pass == nullptr) {
         errs() << "Pass '" << PassName << "' not registered!\n";
         exit(1);
       }
       addPass(std::move(Pass));
+    };
+
+    enum class State {
+      ScanName, // reading a pass name
+      ScanArgs, // reading a list of args
+      ArgsEnded, // read the last '>' in an args list, must read delimiter next
+    } CurrentState = State::ScanName;
+    int PassBeginIdx = 0;
+    int ArgsBeginIdx;
+    StringRef PassName;
+    int NestedArgs = 0;
+    for (auto [Idx, C] : enumerate(Pipeline)) {
+      switch (CurrentState) {
+      case State::ScanName:
+        if (C == BeginArgsToken) {
+          // Save pass name for later and begin scanning args.
+          PassName = Pipeline.slice(PassBeginIdx, Idx);
+          ArgsBeginIdx = Idx + 1;
+          ++NestedArgs;
+          CurrentState = State::ScanArgs;
+          break;
+        }
+        if (C == EndArgsToken) {
+          errs() << "Unexpected '>' in pass pipeline.\n";
+          exit(1);
+        }
+        if (C == EndToken || C == PassDelimToken) {
+          // Delimiter found, add the pass (with empty args), stay in the
+          // ScanName state.
+          AddPass(Pipeline.slice(PassBeginIdx, Idx), StringRef());
+          PassBeginIdx = Idx + 1;
+        }
+        break;
+      case State::ScanArgs:
+        // While scanning args, we only care about making sure nesting of angle
+        // brackets is correct.
+        if (C == BeginArgsToken) {
+          ++NestedArgs;
+          break;
+        }
+        if (C == EndArgsToken) {
+          --NestedArgs;
+          if (NestedArgs == 0) {
+            // Done scanning args.
+            AddPass(PassName, Pipeline.slice(ArgsBeginIdx, Idx));
+            CurrentState = State::ArgsEnded;
+          } else if (NestedArgs < 0) {
+            errs() << "Unexpected '>' in pass pipeline.\n";
+            exit(1);
+          }
+          break;
+        }
+        if (C == EndToken) {
+          errs() << "Missing '>' in pass pipeline. End-of-string reached while "
+                    "reading arguments for pass '"
+                 << PassName << "'.\n";
+          exit(1);
+        }
+        break;
+      case State::ArgsEnded:
+        // Once we're done scanning args, only a delimiter is valid. This avoids
+        // accepting strings like "foo<args1><args2>" or "foo<args>bar".
+        if (C == EndToken || C == PassDelimToken) {
+          PassBeginIdx = Idx + 1;
+          CurrentState = State::ScanName;
+        } else {
+          errs() << "Expected delimiter or end-of-string after pass "
                    "arguments.\n";
+          exit(1);
+        }
+        break;
+      }
     }
   }
 
@@ -101,7 +194,7 @@ public:
   }
 #endif
   /// Similar to print() but prints one pass per line. Used for testing.
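A hedged sketch of what the parser above accepts and rejects (pass names are illustrative; `createFunctionPass` is the factory declared later in this patch):

    sandboxir::FunctionPassManager FPM("fpm");
    // One pass carrying a nested region-pass pipeline, then a plain pass.
    FPM.setPassPipeline(
        "bottom-up-vec<region-pass-a,region-pass-b>,other-pass",
        sandboxir::SandboxVectorizerPassBuilder::createFunctionPass);
    // Rejected inputs: "foo>bar" (stray '>'), "foo<bar" (unterminated args),
    // "foo<x>y" (text after the closing '>'). "pass" and "pass<>" are equal.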
-  void printPipeline(raw_ostream &OS) const {
+  void printPipeline(raw_ostream &OS) const override {
     OS << this->getName() << "\n";
     for (const auto &PassPtr : Passes)
       PassPtr->printPipeline(OS);
@@ -112,12 +205,18 @@ class FunctionPassManager final : public PassManager<FunctionPass> {
 public:
   FunctionPassManager(StringRef Name) : PassManager(Name) {}
+  FunctionPassManager(StringRef Name, StringRef Pipeline,
+                      CreatePassFunc CreatePass)
+      : PassManager(Name, Pipeline, CreatePass) {}
   bool runOnFunction(Function &F) final;
 };
 
 class RegionPassManager final : public PassManager<RegionPass> {
 public:
   RegionPassManager(StringRef Name) : PassManager(Name) {}
+  RegionPassManager(StringRef Name, StringRef Pipeline,
+                    CreatePassFunc CreatePass)
+      : PassManager(Name, Pipeline, CreatePass) {}
   bool runOnRegion(Region &R) final;
 };
 
diff --git a/llvm/include/llvm/Target/TargetSelectionDAG.td b/llvm/include/llvm/Target/TargetSelectionDAG.td
index adf8a75f620225141589fb9295df733eb059fef7..fa516fc9b101754cde8cb9e5ed9ed6e31cfdefe3 100644
--- a/llvm/include/llvm/Target/TargetSelectionDAG.td
+++ b/llvm/include/llvm/Target/TargetSelectionDAG.td
@@ -534,6 +534,7 @@ def ftan : SDNode<"ISD::FTAN" , SDTFPUnaryOp>;
 def fasin : SDNode<"ISD::FASIN" , SDTFPUnaryOp>;
 def facos : SDNode<"ISD::FACOS" , SDTFPUnaryOp>;
 def fatan : SDNode<"ISD::FATAN" , SDTFPUnaryOp>;
+def fatan2 : SDNode<"ISD::FATAN2" , SDTFPBinOp>;
 def fsinh : SDNode<"ISD::FSINH" , SDTFPUnaryOp>;
 def fcosh : SDNode<"ISD::FCOSH" , SDTFPUnaryOp>;
 def ftanh : SDNode<"ISD::FTANH" , SDTFPUnaryOp>;
@@ -602,6 +603,8 @@ def strict_facos : SDNode<"ISD::STRICT_FACOS",
                           SDTFPUnaryOp, [SDNPHasChain]>;
 def strict_fatan : SDNode<"ISD::STRICT_FATAN",
                           SDTFPUnaryOp, [SDNPHasChain]>;
+def strict_fatan2 : SDNode<"ISD::STRICT_FATAN2",
+                           SDTFPBinOp, [SDNPHasChain]>;
 def strict_fsinh : SDNode<"ISD::STRICT_FSINH",
                           SDTFPUnaryOp, [SDNPHasChain]>;
 def strict_fcosh : SDNode<"ISD::STRICT_FCOSH",
@@ -1588,6 +1591,9 @@ def any_facos : PatFrags<(ops node:$src),
 def any_fatan : PatFrags<(ops node:$src),
                          [(strict_fatan node:$src),
                           (fatan node:$src)]>;
+def any_fatan2 : PatFrags<(ops node:$src1, node:$src2),
+                          [(strict_fatan2 node:$src1, node:$src2),
+                           (fatan2 node:$src1, node:$src2)]>;
 def any_fsinh : PatFrags<(ops node:$src),
                          [(strict_fsinh node:$src),
                           (fsinh node:$src)]>;
diff --git a/llvm/include/llvm/Transforms/Coroutines/SuspendCrossingInfo.h b/llvm/include/llvm/Transforms/Coroutines/SuspendCrossingInfo.h
index 49cae6dde47e54c0dc88f34d2c5468418c1eceea..88cbf88acc4cd0cecf7332324a361240b29eb1fa 100644
--- a/llvm/include/llvm/Transforms/Coroutines/SuspendCrossingInfo.h
+++ b/llvm/include/llvm/Transforms/Coroutines/SuspendCrossingInfo.h
@@ -25,6 +25,8 @@
 
 namespace llvm {
 
+class ModuleSlotTracker;
+
 // Provides two way mapping between the blocks and numbers.
 class BlockToIndexMapping {
   SmallVector<BasicBlock *, 32> V;
@@ -96,7 +98,8 @@ public:
   // Print order is in RPO
   void dump() const;
   void dump(StringRef Label, BitVector const &BV,
-            const ReversePostOrderTraversal<Function *> &RPOT) const;
+            const ReversePostOrderTraversal<Function *> &RPOT,
+            ModuleSlotTracker &MST) const;
 #endif
 
   SuspendCrossingInfo(Function &F,
diff --git a/llvm/include/llvm/Transforms/IPO/SampleProfileMatcher.h b/llvm/include/llvm/Transforms/IPO/SampleProfileMatcher.h
index 076d91adfd1dea55d5abbf7fdef4aa33ca17fd74..4e757b299618178cebdadbcdb526bd8c77f7772d 100644
--- a/llvm/include/llvm/Transforms/IPO/SampleProfileMatcher.h
+++ b/llvm/include/llvm/Transforms/IPO/SampleProfileMatcher.h
@@ -201,9 +201,7 @@ private:
   void UpdateWithSalvagedProfiles();
 
   LocToLocMap &getIRToProfileLocationMap(const Function &F) {
-    auto Ret = FuncMappings.try_emplace(
-        FunctionSamples::getCanonicalFnName(F.getName()), LocToLocMap());
-    return Ret.first->second;
+    return FuncMappings[FunctionSamples::getCanonicalFnName(F.getName())];
   }
   void distributeIRToProfileLocationMap();
   void distributeIRToProfileLocationMap(FunctionSamples &FS);
diff --git a/llvm/include/llvm/Transforms/Utils/Instrumentation.h b/llvm/include/llvm/Transforms/Utils/Instrumentation.h
index 1a4824a806dc6e022033b33561ce01cb08d5ef84..4f67d079d146963f5faf09072ad91476804a91a2 100644
--- a/llvm/include/llvm/Transforms/Utils/Instrumentation.h
+++ b/llvm/include/llvm/Transforms/Utils/Instrumentation.h
@@ -161,6 +161,7 @@ struct SanitizerCoverageOptions {
   bool TraceLoads = false;
   bool TraceStores = false;
   bool CollectControlFlow = false;
+  bool GatedCallbacks = false;
 
   SanitizerCoverageOptions() = default;
 };
diff --git a/llvm/include/llvm/Transforms/Utils/ScalarEvolutionExpander.h b/llvm/include/llvm/Transforms/Utils/ScalarEvolutionExpander.h
index 5697d983c9ad4801a54a2c959847c203c91e9530..7dd754a2bc0deff0f52d0fa37f30a1e4584d4aa2 100644
--- a/llvm/include/llvm/Transforms/Utils/ScalarEvolutionExpander.h
+++ b/llvm/include/llvm/Transforms/Utils/ScalarEvolutionExpander.h
@@ -126,6 +126,11 @@ class SCEVExpander : public SCEVVisitor<SCEVExpander, Value *> {
   /// "expanded" form.
   bool LSRMode;
 
+  /// When true, rewrite any divisors of UDiv expressions that may be 0 to
+  /// umax(Divisor, 1) to avoid introducing UB. If the divisor may be poison,
+  /// freeze it first.
+  bool SafeUDivMode = false;
+
   typedef IRBuilder<InstSimplifyFolder, IRBuilderCallbackInserter> BuilderType;
   BuilderType Builder;
 
diff --git a/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h b/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h
index 95531544a1c8176b6137fb85083350646e8a8ea9..877c83291170bfe3ed68b32149161f551e224189 100644
--- a/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h
+++ b/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h
@@ -122,15 +122,13 @@ private:
   /// or a horizontal reduction was not matched or not possible.
   bool vectorizeHorReduction(PHINode *P, Instruction *Root, BasicBlock *BB,
                              slpvectorizer::BoUpSLP &R,
-                             TargetTransformInfo *TTI,
                              SmallVectorImpl<WeakTrackingVH> &PostponedInsts);
 
   /// Make an attempt to vectorize reduction and then try to vectorize
   /// postponed binary operations.
   /// \returns true on any successfull vectorization.
   bool vectorizeRootInstruction(PHINode *P, Instruction *Root, BasicBlock *BB,
-                                slpvectorizer::BoUpSLP &R,
-                                TargetTransformInfo *TTI);
+                                slpvectorizer::BoUpSLP &R);
 
   /// Try to vectorize trees that start at insertvalue instructions.
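The SafeUDivMode comment above can be made concrete with a few IRBuilder calls; a minimal sketch, assuming `Builder`, `Dividend`, and `Divisor` are in scope (illustrative, not the expander's actual code path):

    // Divide by umax(freeze(Divisor), 1): freezing strips poison, and the
    // umax keeps the divisor non-zero, so the UDiv cannot introduce UB.
    Value *Frozen = Builder.CreateFreeze(Divisor);
    Value *One = ConstantInt::get(Frozen->getType(), 1);
    Value *Safe = Builder.CreateBinaryIntrinsic(Intrinsic::umax, Frozen, One);
    Value *Quot = Builder.CreateUDiv(Dividend, Safe);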
bool vectorizeInsertValueInst(InsertValueInst *IVI, BasicBlock *BB, diff --git a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.h b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.h index 02abdf0a1ef0df4ac5cedc259a9dfb725d8bef04..5cd47efd6b34620259b154c7352cbc0f90aa82d2 100644 --- a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.h +++ b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.h @@ -13,15 +13,15 @@ #define LLVM_TRANSFORMS_VECTORIZE_SANDBOXVECTORIZER_PASSES_BOTTOMUPVEC_H #include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/StringRef.h" #include "llvm/SandboxIR/Constant.h" #include "llvm/SandboxIR/Pass.h" #include "llvm/SandboxIR/PassManager.h" +#include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Vectorize/SandboxVectorizer/Legality.h" namespace llvm::sandboxir { -class RegionPassManager; - class BottomUpVec final : public FunctionPass { bool Change = false; LegalityAnalysis Legality; @@ -32,8 +32,12 @@ class BottomUpVec final : public FunctionPass { RegionPassManager RPM; public: - BottomUpVec(); + BottomUpVec(StringRef Pipeline); bool runOnFunction(Function &F) final; + void printPipeline(raw_ostream &OS) const final { + OS << getName() << "\n"; + RPM.printPipeline(OS); + } }; } // namespace llvm::sandboxir diff --git a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Passes/PrintInstructionCount.h b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Passes/PrintInstructionCount.h new file mode 100644 index 0000000000000000000000000000000000000000..9d88bc8280384796be3c36ce4cc88c785cb41a71 --- /dev/null +++ b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Passes/PrintInstructionCount.h @@ -0,0 +1,23 @@ +#ifndef LLVM_TRANSFORMS_VECTORIZE_SANDBOXVECTORIZER_PASSES_PRINTINSTRUCTIONCOUNT_H +#define LLVM_TRANSFORMS_VECTORIZE_SANDBOXVECTORIZER_PASSES_PRINTINSTRUCTIONCOUNT_H + +#include "llvm/SandboxIR/Pass.h" +#include "llvm/SandboxIR/Region.h" + +namespace llvm::sandboxir { + +/// A Region pass that prints the instruction count for the region to stdout. +/// Used to test -sbvec-passes while we don't have any actual optimization +/// passes. +class PrintInstructionCount final : public RegionPass { +public: + PrintInstructionCount() : RegionPass("null") {} + bool runOnRegion(Region &R) final { + outs() << "InstructionCount: " << std::distance(R.begin(), R.end()) << "\n"; + return false; + } +}; + +} // namespace llvm::sandboxir + +#endif // LLVM_TRANSFORMS_VECTORIZE_SANDBOXVECTORIZER_PASSES_PRINTINSTRUCTIONCOUNTPASS_H diff --git a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Passes/RegionsFromMetadata.h b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Passes/RegionsFromMetadata.h new file mode 100644 index 0000000000000000000000000000000000000000..3d82a61c90153aa5e702aae4b50a0d5932f6cbfd --- /dev/null +++ b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Passes/RegionsFromMetadata.h @@ -0,0 +1,38 @@ +//===- RegionsFromMetadata.h ------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// A SandboxIR function pass that builds regions from IR metadata and then runs +// a pipeline of region passes on them. 
This is useful to test region passes in +// isolation without relying on the output of the bottom-up vectorizer. +// + +#ifndef LLVM_TRANSFORMS_VECTORIZE_SANDBOXVECTORIZER_PASSES_REGIONSFROMMETADATA_H +#define LLVM_TRANSFORMS_VECTORIZE_SANDBOXVECTORIZER_PASSES_REGIONSFROMMETADATA_H + +#include "llvm/ADT/StringRef.h" +#include "llvm/SandboxIR/Pass.h" +#include "llvm/SandboxIR/PassManager.h" + +namespace llvm::sandboxir { + +class RegionsFromMetadata final : public FunctionPass { + // The PM containing the pipeline of region passes. + RegionPassManager RPM; + +public: + RegionsFromMetadata(StringRef Pipeline); + bool runOnFunction(Function &F) final; + void printPipeline(raw_ostream &OS) const final { + OS << getName() << "\n"; + RPM.printPipeline(OS); + } +}; + +} // namespace llvm::sandboxir + +#endif // LLVM_TRANSFORMS_VECTORIZE_SANDBOXVECTORIZER_PASSES_REGIONSFROMMETADATA_H diff --git a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizer.h b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizer.h index b7cb418c00326aaade307f8b4707225c646e46cc..b83744cf9e6cb680ebd038b78f1f5cbcb0b2487f 100644 --- a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizer.h +++ b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizer.h @@ -11,7 +11,7 @@ #include #include "llvm/IR/PassManager.h" -#include "llvm/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.h" +#include "llvm/SandboxIR/PassManager.h" namespace llvm { @@ -20,8 +20,8 @@ class TargetTransformInfo; class SandboxVectorizerPass : public PassInfoMixin { TargetTransformInfo *TTI = nullptr; - // The main vectorizer pass. - sandboxir::BottomUpVec BottomUpVecPass; + // A pipeline of SandboxIR function passes run by the vectorizer. + sandboxir::FunctionPassManager FPM; bool runImpl(Function &F); diff --git a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizerPassBuilder.h b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizerPassBuilder.h new file mode 100644 index 0000000000000000000000000000000000000000..e3d6ecae836fce9f5fef4cae35df0386f4977820 --- /dev/null +++ b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizerPassBuilder.h @@ -0,0 +1,32 @@ +//===- SandboxVectorizerPassBuilder.h ---------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Utility functions so passes with sub-pipelines can create SandboxVectorizer +// passes without replicating the same logic in each pass. 
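For orientation, a hedged sketch of how a RegionsFromMetadata::runOnFunction could drive the region pipeline declared above (this assumes a Region::createRegionsFromMD helper with roughly this shape; the real definition lives in the .cpp file, which is not part of this hunk):

    bool sandboxir::RegionsFromMetadata::runOnFunction(Function &F) {
      // Build regions from IR metadata, then run the pipeline on each one.
      SmallVector<std::unique_ptr<sandboxir::Region>> Regions =
          sandboxir::Region::createRegionsFromMD(F);
      for (auto &R : Regions)
        RPM.runOnRegion(*R);
      return false;
    }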
+// +#ifndef LLVM_TRANSFORMS_VECTORIZE_SANDBOXVECTORIZER_SANDBOXVECTORIZERPASSBUILDER_H +#define LLVM_TRANSFORMS_VECTORIZE_SANDBOXVECTORIZER_SANDBOXVECTORIZERPASSBUILDER_H + +#include "llvm/ADT/StringRef.h" +#include "llvm/SandboxIR/Pass.h" + +#include + +namespace llvm::sandboxir { + +class SandboxVectorizerPassBuilder { +public: + static std::unique_ptr createFunctionPass(StringRef Name, + StringRef Args); + static std::unique_ptr createRegionPass(StringRef Name, + StringRef Args); +}; + +} // namespace llvm::sandboxir + +#endif // LLVM_TRANSFORMS_VECTORIZE_SANDBOXVECTORIZER_SANDBOXVECTORIZERPASSBUILDER_H diff --git a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/SeedCollector.h b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/SeedCollector.h index 6bad38b637d2a840ab958547f0b210521583a4ef..a4512862136a8b9decba027075f89313cf55ed06 100644 --- a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/SeedCollector.h +++ b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/SeedCollector.h @@ -279,6 +279,7 @@ public: unsigned size() const { return Bundles.size(); } #ifndef NDEBUG + void print(raw_ostream &OS) const; LLVM_DUMP_METHOD void dump() const; #endif // NDEBUG }; diff --git a/llvm/lib/Analysis/ConstantFolding.cpp b/llvm/lib/Analysis/ConstantFolding.cpp index 3af4ae31ffe8b02802743ab861bd0ca2ccd7f456..da0fd1f07c83e47f300e20ced445a59a422d26ff 100644 --- a/llvm/lib/Analysis/ConstantFolding.cpp +++ b/llvm/lib/Analysis/ConstantFolding.cpp @@ -888,7 +888,8 @@ Constant *SymbolicallyEvaluateGEP(const GEPOperator *GEP, APInt Offset = APInt( BitWidth, DL.getIndexedOffsetInType( - SrcElemTy, ArrayRef((Value *const *)Ops.data() + 1, Ops.size() - 1))); + SrcElemTy, ArrayRef((Value *const *)Ops.data() + 1, Ops.size() - 1)), + /*isSigned=*/true, /*implicitTrunc=*/true); std::optional InRange = GEP->getInRange(); if (InRange) diff --git a/llvm/lib/Analysis/IVDescriptors.cpp b/llvm/lib/Analysis/IVDescriptors.cpp index 53001421ce6f7acf1228907a15bb516dd1f3a531..76cde01782bb02c8ec8f44fa22d3a9f44a582ce3 100644 --- a/llvm/lib/Analysis/IVDescriptors.cpp +++ b/llvm/lib/Analysis/IVDescriptors.cpp @@ -690,13 +690,9 @@ RecurrenceDescriptor::isMinMaxPattern(Instruction *I, RecurKind Kind, return InstDesc(Kind == RecurKind::SMax, I); if (match(I, m_SMin(m_Value(), m_Value()))) return InstDesc(Kind == RecurKind::SMin, I); - if (match(I, m_OrdFMin(m_Value(), m_Value()))) + if (match(I, m_OrdOrUnordFMin(m_Value(), m_Value()))) return InstDesc(Kind == RecurKind::FMin, I); - if (match(I, m_OrdFMax(m_Value(), m_Value()))) - return InstDesc(Kind == RecurKind::FMax, I); - if (match(I, m_UnordFMin(m_Value(), m_Value()))) - return InstDesc(Kind == RecurKind::FMin, I); - if (match(I, m_UnordFMax(m_Value(), m_Value()))) + if (match(I, m_OrdOrUnordFMax(m_Value(), m_Value()))) return InstDesc(Kind == RecurKind::FMax, I); if (match(I, m_Intrinsic(m_Value(), m_Value()))) return InstDesc(Kind == RecurKind::FMin, I); diff --git a/llvm/lib/Analysis/LazyValueInfo.cpp b/llvm/lib/Analysis/LazyValueInfo.cpp index 30dc4ae30dbfa5281a78afeaad0e5d2ce8cafc8a..10ad4708596cb38174bf5fdca5163c5d23a2cd75 100644 --- a/llvm/lib/Analysis/LazyValueInfo.cpp +++ b/llvm/lib/Analysis/LazyValueInfo.cpp @@ -1613,7 +1613,7 @@ LazyValueInfoImpl &LazyValueInfo::getOrCreateImpl(const Module *M) { assert(M && "getCache() called with a null Module"); const DataLayout &DL = M->getDataLayout(); Function *GuardDecl = - M->getFunction(Intrinsic::getName(Intrinsic::experimental_guard)); + Intrinsic::getDeclarationIfExists(M, 
Intrinsic::experimental_guard); PImpl = new LazyValueInfoImpl(AC, DL, GuardDecl); } return *static_cast(PImpl); diff --git a/llvm/lib/Analysis/Loads.cpp b/llvm/lib/Analysis/Loads.cpp index f4b202791a708128092ba989bcb39f987d4a8bd7..820b8e96c1d3ae3061255baa22e87b9e44367c4b 100644 --- a/llvm/lib/Analysis/Loads.cpp +++ b/llvm/lib/Analysis/Loads.cpp @@ -95,10 +95,8 @@ static bool isDereferenceableAndAlignedPointer( auto IsKnownDeref = [&]() { bool CheckForNonNull, CheckForFreed; - APInt KnownDerefBytes(Size.getBitWidth(), - V->getPointerDereferenceableBytes(DL, CheckForNonNull, - CheckForFreed)); - if (!KnownDerefBytes.getBoolValue() || !KnownDerefBytes.uge(Size) || + if (!Size.ule(V->getPointerDereferenceableBytes(DL, CheckForNonNull, + CheckForFreed)) || CheckForFreed) return false; if (CheckForNonNull && diff --git a/llvm/lib/Analysis/MemoryBuiltins.cpp b/llvm/lib/Analysis/MemoryBuiltins.cpp index e1abf5e4d885eceee906496d444208a0ecbcb854..dc2dc4c1733b5e7a0a89d95eadc89bfff60babd5 100644 --- a/llvm/lib/Analysis/MemoryBuiltins.cpp +++ b/llvm/lib/Analysis/MemoryBuiltins.cpp @@ -767,6 +767,8 @@ SizeOffsetAPInt ObjectSizeOffsetVisitor::visitAllocaInst(AllocaInst &I) { TypeSize ElemSize = DL.getTypeAllocSize(I.getAllocatedType()); if (ElemSize.isScalable() && Options.EvalMode != ObjectSizeOpts::Mode::Min) return ObjectSizeOffsetVisitor::unknown(); + if (!isUIntN(IntTyBits, ElemSize.getKnownMinValue())) + return ObjectSizeOffsetVisitor::unknown(); APInt Size(IntTyBits, ElemSize.getKnownMinValue()); if (!I.isArrayAllocation()) return SizeOffsetAPInt(align(Size, I.getAlign()), Zero); diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp index 97ea405a5267aebfdf821578205165763afbb9b7..58e23e9556f1447a0e6b52b6a75d2a0d013ead78 100644 --- a/llvm/lib/Analysis/ScalarEvolution.cpp +++ b/llvm/lib/Analysis/ScalarEvolution.cpp @@ -6883,7 +6883,7 @@ const ConstantRange &ScalarEvolution::getRangeRef( bool CanBeNull, CanBeFreed; uint64_t DerefBytes = V->getPointerDereferenceableBytes(DL, CanBeNull, CanBeFreed); - if (DerefBytes > 1) { + if (DerefBytes > 1 && isUIntN(BitWidth, DerefBytes)) { // The highest address the object can start is DerefBytes bytes before // the end (unsigned max value). If this value is not a multiple of the // alignment, the last possible start value is the next lowest multiple @@ -9662,14 +9662,14 @@ Constant * ScalarEvolution::getConstantEvolutionLoopExitValue(PHINode *PN, const APInt &BEs, const Loop *L) { - auto I = ConstantEvolutionLoopExitValue.find(PN); - if (I != ConstantEvolutionLoopExitValue.end()) + auto [I, Inserted] = ConstantEvolutionLoopExitValue.try_emplace(PN); + if (!Inserted) return I->second; if (BEs.ugt(MaxBruteForceIterations)) - return ConstantEvolutionLoopExitValue[PN] = nullptr; // Not going to evaluate it. + return nullptr; // Not going to evaluate it. - Constant *&RetVal = ConstantEvolutionLoopExitValue[PN]; + Constant *&RetVal = I->second; DenseMap CurrentIterVals; BasicBlock *Header = L->getHeader(); @@ -11665,8 +11665,8 @@ bool ScalarEvolution::isBasicBlockEntryGuardedByCond(const BasicBlock *BB, } // Check conditions due to any @llvm.experimental.guard intrinsics. 
- auto *GuardDecl = F.getParent()->getFunction( - Intrinsic::getName(Intrinsic::experimental_guard)); + auto *GuardDecl = Intrinsic::getDeclarationIfExists( + F.getParent(), Intrinsic::experimental_guard); if (GuardDecl) for (const auto *GU : GuardDecl->users()) if (const auto *Guard = dyn_cast(GU)) @@ -13615,8 +13615,8 @@ ScalarEvolution::ScalarEvolution(Function &F, TargetLibraryInfo &TLI, // ScalarEvolution to optimize based on those guards. For now we prefer to be // efficient in lieu of being smart in that rather obscure case. - auto *GuardDecl = F.getParent()->getFunction( - Intrinsic::getName(Intrinsic::experimental_guard)); + auto *GuardDecl = Intrinsic::getDeclarationIfExists( + F.getParent(), Intrinsic::experimental_guard); HasGuards = GuardDecl && !GuardDecl->use_empty(); } @@ -15593,8 +15593,8 @@ ScalarEvolution::LoopGuards::collect(const Loop *L, ScalarEvolution &SE) { } // Second, collect information from llvm.experimental.guards dominating the loop. - auto *GuardDecl = SE.F.getParent()->getFunction( - Intrinsic::getName(Intrinsic::experimental_guard)); + auto *GuardDecl = Intrinsic::getDeclarationIfExists( + SE.F.getParent(), Intrinsic::experimental_guard); if (GuardDecl) for (const auto *GU : GuardDecl->users()) if (const auto *Guard = dyn_cast(GU)) diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp index 0a7679639dabdfc795d65153ddcf536426b44d2b..4397a569c948b7a9c9b4da3e540b41abd6941147 100644 --- a/llvm/lib/Analysis/ValueTracking.cpp +++ b/llvm/lib/Analysis/ValueTracking.cpp @@ -8256,9 +8256,7 @@ static SelectPatternResult matchFastFloatClamp(CmpInst::Predicate Pred, case CmpInst::FCMP_OLE: case CmpInst::FCMP_ULT: case CmpInst::FCMP_ULE: - if (match(FalseVal, - m_CombineOr(m_OrdFMin(m_Specific(CmpLHS), m_APFloat(FC2)), - m_UnordFMin(m_Specific(CmpLHS), m_APFloat(FC2)))) && + if (match(FalseVal, m_OrdOrUnordFMin(m_Specific(CmpLHS), m_APFloat(FC2))) && *FC1 < *FC2) return {SPF_FMAXNUM, SPNB_RETURNS_ANY, false}; break; @@ -8266,9 +8264,7 @@ static SelectPatternResult matchFastFloatClamp(CmpInst::Predicate Pred, case CmpInst::FCMP_OGE: case CmpInst::FCMP_UGT: case CmpInst::FCMP_UGE: - if (match(FalseVal, - m_CombineOr(m_OrdFMax(m_Specific(CmpLHS), m_APFloat(FC2)), - m_UnordFMax(m_Specific(CmpLHS), m_APFloat(FC2)))) && + if (match(FalseVal, m_OrdOrUnordFMax(m_Specific(CmpLHS), m_APFloat(FC2))) && *FC1 > *FC2) return {SPF_FMINNUM, SPNB_RETURNS_ANY, false}; break; @@ -8534,6 +8530,10 @@ bool llvm::isKnownInversion(const Value *X, const Value *Y) { !match(Y, m_c_ICmp(Pred2, m_Specific(A), m_Value(C)))) return false; + // They must both have samesign flag or not. + if (cast(X)->hasSameSign() != cast(Y)->hasSameSign()) + return false; + if (B == C) return Pred1 == ICmpInst::getInversePredicate(Pred2); @@ -8542,6 +8542,11 @@ bool llvm::isKnownInversion(const Value *X, const Value *Y) { if (!match(B, m_APInt(RHSC1)) || !match(C, m_APInt(RHSC2))) return false; + // Sign bits of two RHSCs should match. 
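The repeated change above swaps name-based lookup for a direct query: `Intrinsic::getDeclarationIfExists` returns the declaration only if the module already has one, and never creates it. A hedged sketch of the idiom (`handleGuard` is a hypothetical callback):

    // Only walk users when a declaration already exists; nothing is inserted.
    if (Function *GuardDecl = Intrinsic::getDeclarationIfExists(
            F.getParent(), Intrinsic::experimental_guard))
      for (User *U : GuardDecl->users())
        if (auto *Guard = dyn_cast<IntrinsicInst>(U))
          handleGuard(Guard); // hypothetical callback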
+ if (cast(X)->hasSameSign() && + RHSC1->isNonNegative() != RHSC2->isNonNegative()) + return false; + const auto CR1 = ConstantRange::makeExactICmpRegion(Pred1, *RHSC1); const auto CR2 = ConstantRange::makeExactICmpRegion(Pred2, *RHSC2); diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp index 9b18fba8e37c433991b992c0819042a243029812..5257153d26e704f0d955f54e6df47f228e825218 100644 --- a/llvm/lib/Analysis/VectorUtils.cpp +++ b/llvm/lib/Analysis/VectorUtils.cpp @@ -1425,7 +1425,7 @@ void InterleavedAccessInfo::analyzeInterleaving( auto InvalidateGroupIfMemberMayWrap = [&](InterleaveGroup *Group, int Index, - std::string FirstOrLast) -> bool { + const char *FirstOrLast) -> bool { Instruction *Member = Group->getMember(Index); assert(Member && "Group member does not exist"); Value *MemberPtr = getLoadStorePointerOperand(Member); @@ -1466,11 +1466,10 @@ void InterleavedAccessInfo::analyzeInterleaving( // So we check only group member 0 (which is always guaranteed to exist), // and group member Factor - 1; If the latter doesn't exist we rely on // peeling (if it is a non-reversed access -- see Case 3). - if (InvalidateGroupIfMemberMayWrap(Group, 0, std::string("first"))) + if (InvalidateGroupIfMemberMayWrap(Group, 0, "first")) continue; if (Group->getMember(Group->getFactor() - 1)) - InvalidateGroupIfMemberMayWrap(Group, Group->getFactor() - 1, - std::string("last")); + InvalidateGroupIfMemberMayWrap(Group, Group->getFactor() - 1, "last"); else { // Case 3: A non-reversed interleaved load group with gaps: We need // to execute at least one scalar epilogue iteration. This will ensure @@ -1514,11 +1513,11 @@ void InterleavedAccessInfo::analyzeInterleaving( // and the last group member. Case 3 (scalar epilog) is not relevant for // stores with gaps, which are implemented with masked-store (rather than // speculative access, as in loads). - if (InvalidateGroupIfMemberMayWrap(Group, 0, std::string("first"))) + if (InvalidateGroupIfMemberMayWrap(Group, 0, "first")) continue; for (int Index = Group->getFactor() - 1; Index > 0; Index--) if (Group->getMember(Index)) { - InvalidateGroupIfMemberMayWrap(Group, Index, std::string("last")); + InvalidateGroupIfMemberMayWrap(Group, Index, "last"); break; } } diff --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp index e09532dc4735fb8cf7fb75182ac227260cccfe86..5a6fb5064b31668ae70bffdf8d0a499125a8042f 100644 --- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp +++ b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp @@ -876,7 +876,8 @@ private: } else { int64_t Start = BitcodeReader::decodeSignRotatedValue(Record[OpNum++]); int64_t End = BitcodeReader::decodeSignRotatedValue(Record[OpNum++]); - return ConstantRange(APInt(BitWidth, Start), APInt(BitWidth, End)); + return ConstantRange(APInt(BitWidth, Start, true), + APInt(BitWidth, End, true)); } } @@ -7039,11 +7040,12 @@ Error BitcodeReader::materialize(GlobalValue *GV) { // Remove incompatible attributes on function calls. 
if (auto *CI = dyn_cast(&I)) { CI->removeRetAttrs(AttributeFuncs::typeIncompatible( - CI->getFunctionType()->getReturnType())); + CI->getFunctionType()->getReturnType(), CI->getRetAttributes())); for (unsigned ArgNo = 0; ArgNo < CI->arg_size(); ++ArgNo) CI->removeParamAttrs(ArgNo, AttributeFuncs::typeIncompatible( - CI->getArgOperand(ArgNo)->getType())); + CI->getArgOperand(ArgNo)->getType(), + CI->getParamAttributes(ArgNo))); } } diff --git a/llvm/lib/CodeGen/CodeGen.cpp b/llvm/lib/CodeGen/CodeGen.cpp index 48cc21ee20f0af5990472a011132937dd4252eff..2d7f351de54ecb1cf30fc9abcf123f451295feac 100644 --- a/llvm/lib/CodeGen/CodeGen.cpp +++ b/llvm/lib/CodeGen/CodeGen.cpp @@ -35,7 +35,7 @@ void llvm::initializeCodeGen(PassRegistry &Registry) { initializeDebugifyMachineModulePass(Registry); initializeDetectDeadLanesPass(Registry); initializeDwarfEHPrepareLegacyPassPass(Registry); - initializeEarlyIfConverterPass(Registry); + initializeEarlyIfConverterLegacyPass(Registry); initializeEarlyIfPredicatorPass(Registry); initializeEarlyMachineLICMPass(Registry); initializeEarlyTailDuplicatePass(Registry); diff --git a/llvm/lib/CodeGen/EarlyIfConversion.cpp b/llvm/lib/CodeGen/EarlyIfConversion.cpp index 8d9813edd7e52acca32eaa17c15901ff72848c9b..3e73995846176e6da83ce2548c2460700a8d4422 100644 --- a/llvm/lib/CodeGen/EarlyIfConversion.cpp +++ b/llvm/lib/CodeGen/EarlyIfConversion.cpp @@ -15,6 +15,7 @@ // //===----------------------------------------------------------------------===// +#include "llvm/CodeGen/EarlyIfConversion.h" #include "llvm/ADT/BitVector.h" #include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/SmallPtrSet.h" @@ -760,7 +761,7 @@ void SSAIfConv::convertIf(SmallVectorImpl &RemoveBlocks, //===----------------------------------------------------------------------===// namespace { -class EarlyIfConverter : public MachineFunctionPass { +class EarlyIfConverter { const TargetInstrInfo *TII = nullptr; const TargetRegisterInfo *TRI = nullptr; MCSchedModel SchedModel; @@ -772,38 +773,48 @@ class EarlyIfConverter : public MachineFunctionPass { SSAIfConv IfConv; public: - static char ID; - EarlyIfConverter() : MachineFunctionPass(ID) {} - void getAnalysisUsage(AnalysisUsage &AU) const override; - bool runOnMachineFunction(MachineFunction &MF) override; - StringRef getPassName() const override { return "Early If-Conversion"; } + EarlyIfConverter(MachineDominatorTree &DT, MachineLoopInfo &LI, + MachineTraceMetrics &MTM) + : DomTree(&DT), Loops(&LI), Traces(&MTM) {} + EarlyIfConverter() = delete; + + bool run(MachineFunction &MF); private: bool tryConvertIf(MachineBasicBlock *); void invalidateTraces(); bool shouldConvertIf(); }; + +class EarlyIfConverterLegacy : public MachineFunctionPass { +public: + static char ID; + EarlyIfConverterLegacy() : MachineFunctionPass(ID) {} + void getAnalysisUsage(AnalysisUsage &AU) const override; + bool runOnMachineFunction(MachineFunction &MF) override; + StringRef getPassName() const override { return "Early If-Conversion"; } +}; } // end anonymous namespace -char EarlyIfConverter::ID = 0; -char &llvm::EarlyIfConverterID = EarlyIfConverter::ID; +char EarlyIfConverterLegacy::ID = 0; +char &llvm::EarlyIfConverterLegacyID = EarlyIfConverterLegacy::ID; -INITIALIZE_PASS_BEGIN(EarlyIfConverter, DEBUG_TYPE, - "Early If Converter", false, false) +INITIALIZE_PASS_BEGIN(EarlyIfConverterLegacy, DEBUG_TYPE, "Early If Converter", + false, false) INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass) 
-INITIALIZE_PASS_DEPENDENCY(MachineTraceMetrics)
-INITIALIZE_PASS_END(EarlyIfConverter, DEBUG_TYPE,
-                    "Early If Converter", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineTraceMetricsWrapperPass)
+INITIALIZE_PASS_END(EarlyIfConverterLegacy, DEBUG_TYPE, "Early If Converter",
+                    false, false)
 
-void EarlyIfConverter::getAnalysisUsage(AnalysisUsage &AU) const {
+void EarlyIfConverterLegacy::getAnalysisUsage(AnalysisUsage &AU) const {
   AU.addRequired<MachineBranchProbabilityInfoWrapperPass>();
   AU.addRequired<MachineDominatorTreeWrapperPass>();
   AU.addPreserved<MachineDominatorTreeWrapperPass>();
   AU.addRequired<MachineLoopInfoWrapperPass>();
   AU.addPreserved<MachineLoopInfoWrapperPass>();
-  AU.addRequired<MachineTraceMetrics>();
-  AU.addPreserved<MachineTraceMetrics>();
+  AU.addRequired<MachineTraceMetricsWrapperPass>();
+  AU.addPreserved<MachineTraceMetricsWrapperPass>();
   MachineFunctionPass::getAnalysisUsage(AU);
 }
 
@@ -1076,11 +1087,9 @@ bool EarlyIfConverter::tryConvertIf(MachineBasicBlock *MBB) {
   return Changed;
 }
 
-bool EarlyIfConverter::runOnMachineFunction(MachineFunction &MF) {
+bool EarlyIfConverter::run(MachineFunction &MF) {
   LLVM_DEBUG(dbgs() << "********** EARLY IF-CONVERSION **********\n"
                     << "********** Function: " << MF.getName() << '\n');
-  if (skipFunction(MF.getFunction()))
-    return false;
 
   // Only run if conversion if the target wants it.
   const TargetSubtargetInfo &STI = MF.getSubtarget();
@@ -1091,9 +1100,6 @@ bool EarlyIfConverter::runOnMachineFunction(MachineFunction &MF) {
   TRI = STI.getRegisterInfo();
   SchedModel = STI.getSchedModel();
   MRI = &MF.getRegInfo();
-  DomTree = &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
-  Loops = &getAnalysis<MachineLoopInfoWrapperPass>().getLI();
-  Traces = &getAnalysis<MachineTraceMetrics>();
   MinInstr = nullptr;
 
   bool Changed = false;
@@ -1110,6 +1116,41 @@ bool EarlyIfConverter::runOnMachineFunction(MachineFunction &MF) {
   return Changed;
 }
 
+PreservedAnalyses
+EarlyIfConverterPass::run(MachineFunction &MF,
+                          MachineFunctionAnalysisManager &MFAM) {
+  if (MF.getFunction().hasOptNone())
+    return PreservedAnalyses::all();
+
+  MachineDominatorTree &MDT = MFAM.getResult<MachineDominatorTreeAnalysis>(MF);
+  MachineLoopInfo &LI = MFAM.getResult<MachineLoopAnalysis>(MF);
+  MachineTraceMetrics &MTM = MFAM.getResult<MachineTraceMetricsAnalysis>(MF);
+
+  EarlyIfConverter Impl(MDT, LI, MTM);
+  bool Changed = Impl.run(MF);
+  if (!Changed)
+    return PreservedAnalyses::all();
+
+  auto PA = getMachineFunctionPassPreservedAnalyses();
+  PA.preserve<MachineDominatorTreeAnalysis>();
+  PA.preserve<MachineLoopAnalysis>();
+  PA.preserve<MachineTraceMetricsAnalysis>();
+  return PA;
+}
+
+bool EarlyIfConverterLegacy::runOnMachineFunction(MachineFunction &MF) {
+  if (skipFunction(MF.getFunction()))
+    return false;
+
+  MachineDominatorTree &MDT =
+      getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
+  MachineLoopInfo &LI = getAnalysis<MachineLoopInfoWrapperPass>().getLI();
+  MachineTraceMetrics &MTM =
+      getAnalysis<MachineTraceMetricsWrapperPass>().getMTM();
+
+  return EarlyIfConverter(MDT, LI, MTM).run(MF);
+}
+
 //===----------------------------------------------------------------------===//
 //                           EarlyIfPredicator Pass
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/CodeGen/ExpandVectorPredication.cpp b/llvm/lib/CodeGen/ExpandVectorPredication.cpp
index 32ba3e91822ddb8981a6ab84f0a8c61fdf8c284a..dd18b524e3f9c157b56f74f691d2d31a0b1a9049 100644
--- a/llvm/lib/CodeGen/ExpandVectorPredication.cpp
+++ b/llvm/lib/CodeGen/ExpandVectorPredication.cpp
@@ -235,13 +235,12 @@ Value *CachingVPExpander::convertEVLToMask(IRBuilder<> &Builder,
   // TODO add caching
   // Scalable vector %evl conversion.
   if (ElemCount.isScalable()) {
-    auto *M = Builder.GetInsertBlock()->getModule();
     Type *BoolVecTy = VectorType::get(Builder.getInt1Ty(), ElemCount);
-    Function *ActiveMaskFunc = Intrinsic::getOrInsertDeclaration(
-        M, Intrinsic::get_active_lane_mask, {BoolVecTy, EVLParam->getType()});
     // `get_active_lane_mask` performs an implicit less-than comparison.
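The ExpandVectorPredication change here and the hunks that follow all apply one mechanical rewrite: the two-step getOrInsertDeclaration plus CreateCall becomes a single CreateIntrinsic call. A hedged before/after sketch (`M`, `Ty`, `A`, and `B` are illustrative):

    // Before: materialize the declaration, then call it.
    Function *Fn = Intrinsic::getOrInsertDeclaration(M, Intrinsic::umax, {Ty});
    Value *V = Builder.CreateCall(Fn, {A, B});
    // After: one IRBuilder call; the declaration is created on demand.
    Value *W = Builder.CreateIntrinsic(Intrinsic::umax, {Ty}, {A, B});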
Value *ConstZero = Builder.getInt32(0); - return Builder.CreateCall(ActiveMaskFunc, {ConstZero, EVLParam}); + return Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask, + {BoolVecTy, EVLParam->getType()}, + {ConstZero, EVLParam}); } // Fixed vector %evl conversion. @@ -299,18 +298,18 @@ Value *CachingVPExpander::expandPredicationToIntCall( case Intrinsic::umin: { Value *Op0 = VPI.getOperand(0); Value *Op1 = VPI.getOperand(1); - Function *Fn = Intrinsic::getOrInsertDeclaration( - VPI.getModule(), UnpredicatedIntrinsicID, {VPI.getType()}); - Value *NewOp = Builder.CreateCall(Fn, {Op0, Op1}, VPI.getName()); + Value *NewOp = Builder.CreateIntrinsic( + UnpredicatedIntrinsicID, {VPI.getType()}, {Op0, Op1}, + /*FMFSource=*/nullptr, VPI.getName()); replaceOperation(*NewOp, VPI); return NewOp; } case Intrinsic::bswap: case Intrinsic::bitreverse: { Value *Op = VPI.getOperand(0); - Function *Fn = Intrinsic::getOrInsertDeclaration( - VPI.getModule(), UnpredicatedIntrinsicID, {VPI.getType()}); - Value *NewOp = Builder.CreateCall(Fn, {Op}, VPI.getName()); + Value *NewOp = + Builder.CreateIntrinsic(UnpredicatedIntrinsicID, {VPI.getType()}, {Op}, + /*FMFSource=*/nullptr, VPI.getName()); replaceOperation(*NewOp, VPI); return NewOp; } @@ -327,9 +326,9 @@ Value *CachingVPExpander::expandPredicationToFPCall( case Intrinsic::fabs: case Intrinsic::sqrt: { Value *Op0 = VPI.getOperand(0); - Function *Fn = Intrinsic::getOrInsertDeclaration( - VPI.getModule(), UnpredicatedIntrinsicID, {VPI.getType()}); - Value *NewOp = Builder.CreateCall(Fn, {Op0}, VPI.getName()); + Value *NewOp = + Builder.CreateIntrinsic(UnpredicatedIntrinsicID, {VPI.getType()}, {Op0}, + /*FMFSource=*/nullptr, VPI.getName()); replaceOperation(*NewOp, VPI); return NewOp; } @@ -337,9 +336,9 @@ Value *CachingVPExpander::expandPredicationToFPCall( case Intrinsic::minnum: { Value *Op0 = VPI.getOperand(0); Value *Op1 = VPI.getOperand(1); - Function *Fn = Intrinsic::getOrInsertDeclaration( - VPI.getModule(), UnpredicatedIntrinsicID, {VPI.getType()}); - Value *NewOp = Builder.CreateCall(Fn, {Op0, Op1}, VPI.getName()); + Value *NewOp = Builder.CreateIntrinsic( + UnpredicatedIntrinsicID, {VPI.getType()}, {Op0, Op1}, + /*FMFSource=*/nullptr, VPI.getName()); replaceOperation(*NewOp, VPI); return NewOp; } @@ -592,12 +591,10 @@ bool CachingVPExpander::discardEVLParameter(VPIntrinsic &VPI) { Type *Int32Ty = Type::getInt32Ty(VPI.getContext()); if (StaticElemCount.isScalable()) { // TODO add caching - auto *M = VPI.getModule(); - Function *VScaleFunc = - Intrinsic::getOrInsertDeclaration(M, Intrinsic::vscale, Int32Ty); IRBuilder<> Builder(VPI.getParent(), VPI.getIterator()); Value *FactorConst = Builder.getInt32(StaticElemCount.getKnownMinValue()); - Value *VScale = Builder.CreateCall(VScaleFunc, {}, "vscale"); + Value *VScale = Builder.CreateIntrinsic(Intrinsic::vscale, Int32Ty, {}, + /*FMFSource=*/nullptr, "vscale"); MaxEVL = Builder.CreateMul(VScale, FactorConst, "scalable_size", /*NUW*/ true, /*NSW*/ false); } else { diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp index 14e94d48bf8362352a47a8c3b9674154df964de6..f9b1621955c21731593df50466e04c1a25185fb9 100644 --- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp @@ -178,7 +178,7 @@ void CombinerHelper::replaceRegWith(MachineRegisterInfo &MRI, Register FromReg, if (MRI.constrainRegAttrs(ToReg, FromReg)) MRI.replaceRegWith(FromReg, ToReg); else - Builder.buildCopy(ToReg, FromReg); + 
Builder.buildCopy(FromReg, ToReg); Observer.finishedChangingAllUsesOfReg(); } @@ -229,8 +229,8 @@ bool CombinerHelper::matchCombineCopy(MachineInstr &MI) { void CombinerHelper::applyCombineCopy(MachineInstr &MI) { Register DstReg = MI.getOperand(0).getReg(); Register SrcReg = MI.getOperand(1).getReg(); - MI.eraseFromParent(); replaceRegWith(MRI, DstReg, SrcReg); + MI.eraseFromParent(); } bool CombinerHelper::matchFreezeOfSingleMaybePoisonOperand( @@ -379,8 +379,8 @@ void CombinerHelper::applyCombineConcatVectors(MachineInstr &MI, Builder.buildUndef(NewDstReg); else Builder.buildBuildVector(NewDstReg, Ops); - MI.eraseFromParent(); replaceRegWith(MRI, DstReg, NewDstReg); + MI.eraseFromParent(); } bool CombinerHelper::matchCombineShuffleConcat(MachineInstr &MI, @@ -559,8 +559,8 @@ void CombinerHelper::applyCombineShuffleVector(MachineInstr &MI, else Builder.buildMergeLikeInstr(NewDstReg, Ops); - MI.eraseFromParent(); replaceRegWith(MRI, DstReg, NewDstReg); + MI.eraseFromParent(); } bool CombinerHelper::matchShuffleToExtract(MachineInstr &MI) { @@ -2825,8 +2825,8 @@ void CombinerHelper::replaceSingleDefInstWithOperand(MachineInstr &MI, Register OldReg = MI.getOperand(0).getReg(); Register Replacement = MI.getOperand(OpIdx).getReg(); assert(canReplaceReg(OldReg, Replacement, MRI) && "Cannot replace register?"); - MI.eraseFromParent(); replaceRegWith(MRI, OldReg, Replacement); + MI.eraseFromParent(); } void CombinerHelper::replaceSingleDefInstWithReg(MachineInstr &MI, @@ -2834,8 +2834,8 @@ void CombinerHelper::replaceSingleDefInstWithReg(MachineInstr &MI, assert(MI.getNumExplicitDefs() == 1 && "Expected one explicit def?"); Register OldReg = MI.getOperand(0).getReg(); assert(canReplaceReg(OldReg, Replacement, MRI) && "Cannot replace register?"); - MI.eraseFromParent(); replaceRegWith(MRI, OldReg, Replacement); + MI.eraseFromParent(); } bool CombinerHelper::matchConstantLargerBitWidth(MachineInstr &MI, diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp index d0464670b292a4734e932048ea0ae50658594bbe..563a82644134528f717c2fa165cbb60080dd160c 100644 --- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp +++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp @@ -2378,7 +2378,7 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID, case Intrinsic::stackprotector: { LLT PtrTy = getLLTForType(*CI.getArgOperand(0)->getType(), *DL); Register GuardVal; - if (TLI->useLoadStackGuardNode()) { + if (TLI->useLoadStackGuardNode(*CI.getModule())) { GuardVal = MRI->createGenericVirtualRegister(PtrTy); getStackGuard(GuardVal, MIRBuilder); } else @@ -3869,7 +3869,7 @@ bool IRTranslator::emitSPDescriptorParent(StackProtectorDescriptor &SPD, // If useLoadStackGuardNode returns true, generate LOAD_STACK_GUARD. // Otherwise, emit a volatile load to retrieve the stack guard value. - if (TLI->useLoadStackGuardNode()) { + if (TLI->useLoadStackGuardNode(*ParentBB->getBasicBlock()->getModule())) { Guard = MRI->createGenericVirtualRegister(LLT::scalar(PtrTy.getSizeInBits())); getStackGuard(Guard, *CurBuilder); diff --git a/llvm/lib/CodeGen/HardwareLoops.cpp b/llvm/lib/CodeGen/HardwareLoops.cpp index c8a63304a3b63b9abbc5154b8da3ae8386a057e6..86fec239c3edb2cd0fa9022435f22457b1cc6652 100644 --- a/llvm/lib/CodeGen/HardwareLoops.cpp +++ b/llvm/lib/CodeGen/HardwareLoops.cpp @@ -512,8 +512,7 @@ Value* HardwareLoop::InsertIterationSetup(Value *LoopCountInit) { : Intrinsic::test_set_loop_iterations) : (UsePhi ? 
Intrinsic::start_loop_iterations : Intrinsic::set_loop_iterations); - Function *LoopIter = Intrinsic::getOrInsertDeclaration(M, ID, Ty); - Value *LoopSetup = Builder.CreateCall(LoopIter, LoopCountInit); + Value *LoopSetup = Builder.CreateIntrinsic(ID, Ty, LoopCountInit); // Use the return value of the intrinsic to control the entry of the loop. if (UseLoopGuard) { @@ -541,10 +540,9 @@ void HardwareLoop::InsertLoopDec() { Attribute::StrictFP)) CondBuilder.setIsFPConstrained(true); - Function *DecFunc = Intrinsic::getOrInsertDeclaration( - M, Intrinsic::loop_decrement, LoopDecrement->getType()); Value *Ops[] = { LoopDecrement }; - Value *NewCond = CondBuilder.CreateCall(DecFunc, Ops); + Value *NewCond = CondBuilder.CreateIntrinsic(Intrinsic::loop_decrement, + LoopDecrement->getType(), Ops); Value *OldCond = ExitBranch->getCondition(); ExitBranch->setCondition(NewCond); @@ -565,10 +563,9 @@ Instruction* HardwareLoop::InsertLoopRegDec(Value *EltsRem) { Attribute::StrictFP)) CondBuilder.setIsFPConstrained(true); - Function *DecFunc = Intrinsic::getOrInsertDeclaration( - M, Intrinsic::loop_decrement_reg, {EltsRem->getType()}); Value *Ops[] = { EltsRem, LoopDecrement }; - Value *Call = CondBuilder.CreateCall(DecFunc, Ops); + Value *Call = CondBuilder.CreateIntrinsic(Intrinsic::loop_decrement_reg, + {EltsRem->getType()}, Ops); LLVM_DEBUG(dbgs() << "HWLoops: Inserted loop dec: " << *Call << "\n"); return cast(Call); diff --git a/llvm/lib/CodeGen/MIRParser/MIRParser.cpp b/llvm/lib/CodeGen/MIRParser/MIRParser.cpp index 10d3cdcf0c1ce10b5bb31c03b6ecb11f23390888..c0c61b3fdd1677156094f75f29f3252a57f8fbae 100644 --- a/llvm/lib/CodeGen/MIRParser/MIRParser.cpp +++ b/llvm/lib/CodeGen/MIRParser/MIRParser.cpp @@ -703,7 +703,7 @@ bool MIRParserImpl::parseRegisterInfo(PerFunctionMIParsingState &PFS, return error(FlagStringValue.SourceRange.Start, Twine("use of undefined register flag '") + FlagStringValue.Value + "'"); - Info.Flags.push_back(FlagValue); + Info.Flags |= FlagValue; } RegInfo.noteNewVirtualRegister(Info.VReg); } diff --git a/llvm/lib/CodeGen/MachineCombiner.cpp b/llvm/lib/CodeGen/MachineCombiner.cpp index 1a19e053d30fe1df5584175786681c9c48711fa0..5bfc1d63ac3764030710dc3e4e1b07ecb3134174 100644 --- a/llvm/lib/CodeGen/MachineCombiner.cpp +++ b/llvm/lib/CodeGen/MachineCombiner.cpp @@ -133,7 +133,7 @@ char &llvm::MachineCombinerID = MachineCombiner::ID; INITIALIZE_PASS_BEGIN(MachineCombiner, DEBUG_TYPE, "Machine InstCombiner", false, false) INITIALIZE_PASS_DEPENDENCY(MachineLoopInfoWrapperPass) -INITIALIZE_PASS_DEPENDENCY(MachineTraceMetrics) +INITIALIZE_PASS_DEPENDENCY(MachineTraceMetricsWrapperPass) INITIALIZE_PASS_END(MachineCombiner, DEBUG_TYPE, "Machine InstCombiner", false, false) @@ -142,8 +142,8 @@ void MachineCombiner::getAnalysisUsage(AnalysisUsage &AU) const { AU.addPreserved(); AU.addRequired(); AU.addPreserved(); - AU.addRequired(); - AU.addPreserved(); + AU.addRequired(); + AU.addPreserved(); AU.addRequired(); AU.addRequired(); MachineFunctionPass::getAnalysisUsage(AU); @@ -727,7 +727,7 @@ bool MachineCombiner::runOnMachineFunction(MachineFunction &MF) { TSchedModel.init(STI); MRI = &MF.getRegInfo(); MLI = &getAnalysis().getLI(); - Traces = &getAnalysis(); + Traces = &getAnalysis().getMTM(); PSI = &getAnalysis().getPSI(); MBFI = (PSI && PSI->hasProfileSummary()) ? 
&getAnalysis().getBFI() : diff --git a/llvm/lib/CodeGen/MachineTraceMetrics.cpp b/llvm/lib/CodeGen/MachineTraceMetrics.cpp index bf3add010574b81de4b64bfd58316f4851f036f2..92df6b9ab48d7c0630a87a7e620f290f62a32493 100644 --- a/llvm/lib/CodeGen/MachineTraceMetrics.cpp +++ b/llvm/lib/CodeGen/MachineTraceMetrics.cpp @@ -14,7 +14,6 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/SparseSet.h" #include "llvm/CodeGen/MachineBasicBlock.h" -#include "llvm/CodeGen/MachineBranchProbabilityInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineLoopInfo.h" @@ -40,49 +39,66 @@ using namespace llvm; #define DEBUG_TYPE "machine-trace-metrics" -char MachineTraceMetrics::ID = 0; +AnalysisKey MachineTraceMetricsAnalysis::Key; -char &llvm::MachineTraceMetricsID = MachineTraceMetrics::ID; +MachineTraceMetricsAnalysis::Result +MachineTraceMetricsAnalysis::run(MachineFunction &MF, + MachineFunctionAnalysisManager &MFAM) { + return Result(MF, MFAM.getResult(MF)); +} + +PreservedAnalyses +MachineTraceMetricsVerifierPass::run(MachineFunction &MF, + MachineFunctionAnalysisManager &MFAM) { + MFAM.getResult(MF).verifyAnalysis(); + return PreservedAnalyses::all(); +} -INITIALIZE_PASS_BEGIN(MachineTraceMetrics, DEBUG_TYPE, +char MachineTraceMetricsWrapperPass::ID = 0; + +char &llvm::MachineTraceMetricsID = MachineTraceMetricsWrapperPass::ID; + +INITIALIZE_PASS_BEGIN(MachineTraceMetricsWrapperPass, DEBUG_TYPE, "Machine Trace Metrics", false, true) -INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(MachineLoopInfoWrapperPass) -INITIALIZE_PASS_END(MachineTraceMetrics, DEBUG_TYPE, +INITIALIZE_PASS_END(MachineTraceMetricsWrapperPass, DEBUG_TYPE, "Machine Trace Metrics", false, true) -MachineTraceMetrics::MachineTraceMetrics() : MachineFunctionPass(ID) { - std::fill(std::begin(Ensembles), std::end(Ensembles), nullptr); -} +MachineTraceMetricsWrapperPass::MachineTraceMetricsWrapperPass() + : MachineFunctionPass(ID) {} -void MachineTraceMetrics::getAnalysisUsage(AnalysisUsage &AU) const { +void MachineTraceMetricsWrapperPass::getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesAll(); - AU.addRequired(); AU.addRequired(); MachineFunctionPass::getAnalysisUsage(AU); } -bool MachineTraceMetrics::runOnMachineFunction(MachineFunction &Func) { +void MachineTraceMetrics::init(MachineFunction &Func, + const MachineLoopInfo &LI) { MF = &Func; const TargetSubtargetInfo &ST = MF->getSubtarget(); TII = ST.getInstrInfo(); TRI = ST.getRegisterInfo(); MRI = &MF->getRegInfo(); - Loops = &getAnalysis().getLI(); + Loops = &LI; SchedModel.init(&ST); BlockInfo.resize(MF->getNumBlockIDs()); ProcReleaseAtCycles.resize(MF->getNumBlockIDs() * SchedModel.getNumProcResourceKinds()); +} + +bool MachineTraceMetricsWrapperPass::runOnMachineFunction(MachineFunction &MF) { + MTM.init(MF, getAnalysis().getLI()); return false; } -void MachineTraceMetrics::releaseMemory() { +MachineTraceMetrics::~MachineTraceMetrics() { clear(); } + +void MachineTraceMetrics::clear() { MF = nullptr; BlockInfo.clear(); - for (Ensemble *&E : Ensembles) { - delete E; - E = nullptr; - } + for (auto &E : Ensembles) + E.reset(); } //===----------------------------------------------------------------------===// @@ -398,35 +414,50 @@ MachineTraceMetrics::Ensemble * MachineTraceMetrics::getEnsemble(MachineTraceStrategy strategy) { assert(strategy < MachineTraceStrategy::TS_NumStrategies && "Invalid trace strategy enum"); - Ensemble *&E = 
Ensembles[static_cast(strategy)]; + std::unique_ptr &E = + Ensembles[static_cast(strategy)]; if (E) - return E; + return E.get(); // Allocate new Ensemble on demand. switch (strategy) { case MachineTraceStrategy::TS_MinInstrCount: - return (E = new MinInstrCountEnsemble(this)); + E = std::make_unique(MinInstrCountEnsemble(this)); + break; case MachineTraceStrategy::TS_Local: - return (E = new LocalEnsemble(this)); + E = std::make_unique(LocalEnsemble(this)); + break; default: llvm_unreachable("Invalid trace strategy enum"); } + return E.get(); } void MachineTraceMetrics::invalidate(const MachineBasicBlock *MBB) { LLVM_DEBUG(dbgs() << "Invalidate traces through " << printMBBReference(*MBB) << '\n'); BlockInfo[MBB->getNumber()].invalidate(); - for (Ensemble *E : Ensembles) + for (auto &E : Ensembles) if (E) E->invalidate(MBB); } +bool MachineTraceMetrics::invalidate( + MachineFunction &, const PreservedAnalyses &PA, + MachineFunctionAnalysisManager::Invalidator &) { + // Check whether the analysis, all analyses on machine functions, or the + // machine function's CFG have been preserved. + auto PAC = PA.getChecker(); + return !PAC.preserved() && + !PAC.preservedSet>() && + !PAC.preservedSet(); +} + void MachineTraceMetrics::verifyAnalysis() const { if (!MF) return; #ifndef NDEBUG assert(BlockInfo.size() == MF->getNumBlockIDs() && "Outdated BlockInfo size"); - for (Ensemble *E : Ensembles) + for (auto &E : Ensembles) if (E) E->verify(); #endif diff --git a/llvm/lib/CodeGen/SafeStack.cpp b/llvm/lib/CodeGen/SafeStack.cpp index a50909af8bfcfb8a8157447db021b650ce004c18..ad2037a2c20b5529f1ded311654abed5d295e854 100644 --- a/llvm/lib/CodeGen/SafeStack.cpp +++ b/llvm/lib/CodeGen/SafeStack.cpp @@ -368,8 +368,7 @@ Value *SafeStack::getStackGuard(IRBuilder<> &IRB, Function &F) { if (!StackGuardVar) { TL.insertSSPDeclarations(*M); - return IRB.CreateCall( - Intrinsic::getOrInsertDeclaration(M, Intrinsic::stackguard)); + return IRB.CreateIntrinsic(Intrinsic::stackguard, {}, {}); } return IRB.CreateLoad(StackPtrTy, StackGuardVar, "StackGuard"); diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index a709b563601a909a4f8611bd9216d9b0e970e316..c8b466498cd10c2bf6d6625d38f0678a6067d002 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -9611,6 +9611,7 @@ SDValue DAGCombiner::visitXOR(SDNode *N) { } // fold (not (or x, y)) -> (and (not x), (not y)) iff x or y are setcc + // fold (not (and x, y)) -> (or (not x), (not y)) iff x or y are setcc if (isOneConstant(N1) && VT == MVT::i1 && N0.hasOneUse() && (N0Opcode == ISD::OR || N0Opcode == ISD::AND)) { SDValue N00 = N0.getOperand(0), N01 = N0.getOperand(1); @@ -14964,14 +14965,15 @@ SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) { EVT ExtVT = cast(N1)->getVT(); unsigned VTBits = VT.getScalarSizeInBits(); unsigned ExtVTBits = ExtVT.getScalarSizeInBits(); + SDLoc DL(N); // sext_vector_inreg(undef) = 0 because the top bit will all be the same. if (N0.isUndef()) - return DAG.getConstant(0, SDLoc(N), VT); + return DAG.getConstant(0, DL, VT); // fold (sext_in_reg c1) -> c1 if (DAG.isConstantIntBuildVectorOrConstantInt(N0)) - return DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(N), VT, N0, N1); + return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, N0, N1); // If the input is already sign extended, just drop the extension. 
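The invalidate() hook above follows the standard new-pass-manager recipe; distilled into a hedged template (`FooAnalysis` and `FooMetrics` are illustrative names):

    // A cached result survives a pass unless nothing relevant was preserved:
    // the analysis itself, the all-analyses set for machine functions, or the
    // CFG-preserving set.
    bool FooMetrics::invalidate(MachineFunction &MF,
                                const PreservedAnalyses &PA,
                                MachineFunctionAnalysisManager::Invalidator &) {
      auto PAC = PA.getChecker<FooAnalysis>();
      return !PAC.preserved() &&
             !PAC.preservedSet<AllAnalysesOn<MachineFunction>>() &&
             !PAC.preservedSet<CFGAnalyses>();
    }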
if (ExtVTBits >= DAG.ComputeMaxSignificantBits(N0)) @@ -14980,8 +14982,7 @@ SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) { // fold (sext_in_reg (sext_in_reg x, VT2), VT1) -> (sext_in_reg x, minVT) pt2 if (N0.getOpcode() == ISD::SIGN_EXTEND_INREG && ExtVT.bitsLT(cast(N0.getOperand(1))->getVT())) - return DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(N), VT, N0.getOperand(0), - N1); + return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, N0.getOperand(0), N1); // fold (sext_in_reg (sext x)) -> (sext x) // fold (sext_in_reg (aext x)) -> (sext x) @@ -14993,7 +14994,7 @@ SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) { if ((N00Bits <= ExtVTBits || DAG.ComputeMaxSignificantBits(N00) <= ExtVTBits) && (!LegalOperations || TLI.isOperationLegal(ISD::SIGN_EXTEND, VT))) - return DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), VT, N00); + return DAG.getNode(ISD::SIGN_EXTEND, DL, VT, N00); } // fold (sext_in_reg (*_extend_vector_inreg x)) -> (sext_vector_inreg x) @@ -15011,7 +15012,7 @@ SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) { DAG.ComputeMaxSignificantBits(N00) <= ExtVTBits))) && (!LegalOperations || TLI.isOperationLegal(ISD::SIGN_EXTEND_VECTOR_INREG, VT))) - return DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, SDLoc(N), VT, N00); + return DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, DL, VT, N00); } // fold (sext_in_reg (zext x)) -> (sext x) @@ -15020,12 +15021,12 @@ SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) { SDValue N00 = N0.getOperand(0); if (N00.getScalarValueSizeInBits() == ExtVTBits && (!LegalOperations || TLI.isOperationLegal(ISD::SIGN_EXTEND, VT))) - return DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), VT, N00); + return DAG.getNode(ISD::SIGN_EXTEND, DL, VT, N00); } // fold (sext_in_reg x) -> (zext_in_reg x) if the sign bit is known zero. if (DAG.MaskedValueIsZero(N0, APInt::getOneBitSet(VTBits, ExtVTBits - 1))) - return DAG.getZeroExtendInReg(N0, SDLoc(N), ExtVT); + return DAG.getZeroExtendInReg(N0, DL, ExtVT); // fold operands of sext_in_reg based on knowledge that the top bits are not // demanded. @@ -15047,7 +15048,7 @@ SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) { // extended enough. unsigned InSignBits = DAG.ComputeNumSignBits(N0.getOperand(0)); if (((VTBits - ExtVTBits) - ShAmt->getZExtValue()) < InSignBits) - return DAG.getNode(ISD::SRA, SDLoc(N), VT, N0.getOperand(0), + return DAG.getNode(ISD::SRA, DL, VT, N0.getOperand(0), N0.getOperand(1)); } } @@ -15056,37 +15057,33 @@ SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) { // If sextload is not supported by target, we can only do the combine when // load has one use. Doing otherwise can block folding the extload with other // extends that the target does support. - if (ISD::isEXTLoad(N0.getNode()) && - ISD::isUNINDEXEDLoad(N0.getNode()) && + if (ISD::isEXTLoad(N0.getNode()) && ISD::isUNINDEXEDLoad(N0.getNode()) && ExtVT == cast(N0)->getMemoryVT() && ((!LegalOperations && cast(N0)->isSimple() && N0.hasOneUse()) || TLI.isLoadExtLegal(ISD::SEXTLOAD, VT, ExtVT))) { - LoadSDNode *LN0 = cast(N0); - SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, SDLoc(N), VT, - LN0->getChain(), - LN0->getBasePtr(), ExtVT, - LN0->getMemOperand()); + auto *LN0 = cast(N0); + SDValue ExtLoad = + DAG.getExtLoad(ISD::SEXTLOAD, DL, VT, LN0->getChain(), + LN0->getBasePtr(), ExtVT, LN0->getMemOperand()); CombineTo(N, ExtLoad); CombineTo(N0.getNode(), ExtLoad, ExtLoad.getValue(1)); AddToWorklist(ExtLoad.getNode()); - return SDValue(N, 0); // Return N so it doesn't get rechecked! 
+ return SDValue(N, 0); // Return N so it doesn't get rechecked! } // fold (sext_inreg (zextload x)) -> (sextload x) iff load has one use if (ISD::isZEXTLoad(N0.getNode()) && ISD::isUNINDEXEDLoad(N0.getNode()) && - N0.hasOneUse() && - ExtVT == cast(N0)->getMemoryVT() && + N0.hasOneUse() && ExtVT == cast(N0)->getMemoryVT() && ((!LegalOperations && cast(N0)->isSimple()) && TLI.isLoadExtLegal(ISD::SEXTLOAD, VT, ExtVT))) { - LoadSDNode *LN0 = cast(N0); - SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, SDLoc(N), VT, - LN0->getChain(), - LN0->getBasePtr(), ExtVT, - LN0->getMemOperand()); + auto *LN0 = cast(N0); + SDValue ExtLoad = + DAG.getExtLoad(ISD::SEXTLOAD, DL, VT, LN0->getChain(), + LN0->getBasePtr(), ExtVT, LN0->getMemOperand()); CombineTo(N, ExtLoad); CombineTo(N0.getNode(), ExtLoad, ExtLoad.getValue(1)); - return SDValue(N, 0); // Return N so it doesn't get rechecked! + return SDValue(N, 0); // Return N so it doesn't get rechecked! } // fold (sext_inreg (masked_load x)) -> (sext_masked_load x) @@ -15096,7 +15093,7 @@ SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) { Ld->getExtensionType() != ISD::LoadExtType::NON_EXTLOAD && TLI.isLoadExtLegal(ISD::SEXTLOAD, VT, ExtVT)) { SDValue ExtMaskedLoad = DAG.getMaskedLoad( - VT, SDLoc(N), Ld->getChain(), Ld->getBasePtr(), Ld->getOffset(), + VT, DL, Ld->getChain(), Ld->getBasePtr(), Ld->getOffset(), Ld->getMask(), Ld->getPassThru(), ExtVT, Ld->getMemOperand(), Ld->getAddressingMode(), ISD::SEXTLOAD, Ld->isExpandingLoad()); CombineTo(N, ExtMaskedLoad); @@ -15107,15 +15104,14 @@ SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) { // fold (sext_inreg (masked_gather x)) -> (sext_masked_gather x) if (auto *GN0 = dyn_cast(N0)) { - if (SDValue(GN0, 0).hasOneUse() && - ExtVT == GN0->getMemoryVT() && + if (SDValue(GN0, 0).hasOneUse() && ExtVT == GN0->getMemoryVT() && TLI.isVectorLoadExtDesirable(SDValue(SDValue(GN0, 0)))) { SDValue Ops[] = {GN0->getChain(), GN0->getPassThru(), GN0->getMask(), GN0->getBasePtr(), GN0->getIndex(), GN0->getScale()}; SDValue ExtLoad = DAG.getMaskedGather( - DAG.getVTList(VT, MVT::Other), ExtVT, SDLoc(N), Ops, - GN0->getMemOperand(), GN0->getIndexType(), ISD::SEXTLOAD); + DAG.getVTList(VT, MVT::Other), ExtVT, DL, Ops, GN0->getMemOperand(), + GN0->getIndexType(), ISD::SEXTLOAD); CombineTo(N, ExtLoad); CombineTo(N0.getNode(), ExtLoad, ExtLoad.getValue(1)); @@ -15128,7 +15124,7 @@ SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) { if (ExtVTBits <= 16 && N0.getOpcode() == ISD::OR) { if (SDValue BSwap = MatchBSwapHWordLow(N0.getNode(), N0.getOperand(0), N0.getOperand(1), false)) - return DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(N), VT, BSwap, N1); + return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, BSwap, N1); } // Fold (iM_signext_inreg @@ -15145,8 +15141,8 @@ SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) { (!LegalOperations || TLI.isOperationLegal(ISD::SIGN_EXTEND, InnerExtVT))) { SDValue SignExtExtendee = - DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), InnerExtVT, Extendee); - return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), VT, SignExtExtendee, + DAG.getNode(ISD::SIGN_EXTEND, DL, InnerExtVT, Extendee); + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SignExtExtendee, N0.getOperand(1)); } } @@ -18116,7 +18112,7 @@ SDValue DAGCombiner::visitFPOW(SDNode *N) { return SDValue(); } -static SDValue foldFPToIntToFP(SDNode *N, SelectionDAG &DAG, +static SDValue foldFPToIntToFP(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const TargetLowering &TLI) { // We only do this if the target has legal ftrunc. 
Otherwise, we'd likely be // replacing casts with a libcall. We also must be allowed to ignore -0.0 @@ -18134,11 +18130,11 @@ static SDValue foldFPToIntToFP(SDNode *N, SelectionDAG &DAG, SDValue N0 = N->getOperand(0); if (N->getOpcode() == ISD::SINT_TO_FP && N0.getOpcode() == ISD::FP_TO_SINT && N0.getOperand(0).getValueType() == VT) - return DAG.getNode(ISD::FTRUNC, SDLoc(N), VT, N0.getOperand(0)); + return DAG.getNode(ISD::FTRUNC, DL, VT, N0.getOperand(0)); if (N->getOpcode() == ISD::UINT_TO_FP && N0.getOpcode() == ISD::FP_TO_UINT && N0.getOperand(0).getValueType() == VT) - return DAG.getNode(ISD::FTRUNC, SDLoc(N), VT, N0.getOperand(0)); + return DAG.getNode(ISD::FTRUNC, DL, VT, N0.getOperand(0)); return SDValue(); } @@ -18147,17 +18143,17 @@ SDValue DAGCombiner::visitSINT_TO_FP(SDNode *N) { SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); EVT OpVT = N0.getValueType(); + SDLoc DL(N); // [us]itofp(undef) = 0, because the result value is bounded. if (N0.isUndef()) - return DAG.getConstantFP(0.0, SDLoc(N), VT); + return DAG.getConstantFP(0.0, DL, VT); // fold (sint_to_fp c1) -> c1fp - if (DAG.isConstantIntBuildVectorOrConstantInt(N0) && - // ...but only if the target supports immediate floating-point values - (!LegalOperations || - TLI.isOperationLegalOrCustom(ISD::ConstantFP, VT))) - return DAG.getNode(ISD::SINT_TO_FP, SDLoc(N), VT, N0); + // ...but only if the target supports immediate floating-point values + if ((!LegalOperations || TLI.isOperationLegalOrCustom(ISD::ConstantFP, VT))) + if (SDValue C = DAG.FoldConstantArithmetic(ISD::SINT_TO_FP, DL, VT, {N0})) + return C; // If the input is a legal type, and SINT_TO_FP is not legal on this target, // but UINT_TO_FP is legal on this target, try to convert. @@ -18165,31 +18161,27 @@ SDValue DAGCombiner::visitSINT_TO_FP(SDNode *N) { hasOperation(ISD::UINT_TO_FP, OpVT)) { // If the sign bit is known to be zero, we can change this to UINT_TO_FP. if (DAG.SignBitIsZero(N0)) - return DAG.getNode(ISD::UINT_TO_FP, SDLoc(N), VT, N0); + return DAG.getNode(ISD::UINT_TO_FP, DL, VT, N0); } // The next optimizations are desirable only if SELECT_CC can be lowered. // fold (sint_to_fp (setcc x, y, cc)) -> (select (setcc x, y, cc), -1.0, 0.0) if (N0.getOpcode() == ISD::SETCC && N0.getValueType() == MVT::i1 && !VT.isVector() && - (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::ConstantFP, VT))) { - SDLoc DL(N); + (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::ConstantFP, VT))) return DAG.getSelect(DL, VT, N0, DAG.getConstantFP(-1.0, DL, VT), DAG.getConstantFP(0.0, DL, VT)); - } // fold (sint_to_fp (zext (setcc x, y, cc))) -> // (select (setcc x, y, cc), 1.0, 0.0) if (N0.getOpcode() == ISD::ZERO_EXTEND && N0.getOperand(0).getOpcode() == ISD::SETCC && !VT.isVector() && - (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::ConstantFP, VT))) { - SDLoc DL(N); + (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::ConstantFP, VT))) return DAG.getSelect(DL, VT, N0.getOperand(0), DAG.getConstantFP(1.0, DL, VT), DAG.getConstantFP(0.0, DL, VT)); - } - if (SDValue FTrunc = foldFPToIntToFP(N, DAG, TLI)) + if (SDValue FTrunc = foldFPToIntToFP(N, DL, DAG, TLI)) return FTrunc; return SDValue(); @@ -18199,17 +18191,17 @@ SDValue DAGCombiner::visitUINT_TO_FP(SDNode *N) { SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); EVT OpVT = N0.getValueType(); + SDLoc DL(N); // [us]itofp(undef) = 0, because the result value is bounded. 
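  // A worked example of the bound argument (illustrative only, not part of
  // this patch): every result of uint_to_fp lies in a known range, so an
  // undef input may be folded to any in-range constant, and 0.0 is always
  // exactly representable:
  //   %f = uitofp i32 undef to float   -->   float 0.0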
if (N0.isUndef()) - return DAG.getConstantFP(0.0, SDLoc(N), VT); + return DAG.getConstantFP(0.0, DL, VT); // fold (uint_to_fp c1) -> c1fp - if (DAG.isConstantIntBuildVectorOrConstantInt(N0) && - // ...but only if the target supports immediate floating-point values - (!LegalOperations || - TLI.isOperationLegalOrCustom(ISD::ConstantFP, VT))) - return DAG.getNode(ISD::UINT_TO_FP, SDLoc(N), VT, N0); + // ...but only if the target supports immediate floating-point values + if ((!LegalOperations || TLI.isOperationLegalOrCustom(ISD::ConstantFP, VT))) + if (SDValue C = DAG.FoldConstantArithmetic(ISD::UINT_TO_FP, DL, VT, {N0})) + return C; // If the input is a legal type, and UINT_TO_FP is not legal on this target, // but SINT_TO_FP is legal on this target, try to convert. @@ -18217,25 +18209,23 @@ SDValue DAGCombiner::visitUINT_TO_FP(SDNode *N) { hasOperation(ISD::SINT_TO_FP, OpVT)) { // If the sign bit is known to be zero, we can change this to SINT_TO_FP. if (DAG.SignBitIsZero(N0)) - return DAG.getNode(ISD::SINT_TO_FP, SDLoc(N), VT, N0); + return DAG.getNode(ISD::SINT_TO_FP, DL, VT, N0); } // fold (uint_to_fp (setcc x, y, cc)) -> (select (setcc x, y, cc), 1.0, 0.0) if (N0.getOpcode() == ISD::SETCC && !VT.isVector() && - (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::ConstantFP, VT))) { - SDLoc DL(N); + (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::ConstantFP, VT))) return DAG.getSelect(DL, VT, N0, DAG.getConstantFP(1.0, DL, VT), DAG.getConstantFP(0.0, DL, VT)); - } - if (SDValue FTrunc = foldFPToIntToFP(N, DAG, TLI)) + if (SDValue FTrunc = foldFPToIntToFP(N, DL, DAG, TLI)) return FTrunc; return SDValue(); } // Fold (fp_to_{s/u}int ({s/u}int_to_fpx)) -> zext x, sext x, trunc x, or x -static SDValue FoldIntToFPToInt(SDNode *N, SelectionDAG &DAG) { +static SDValue FoldIntToFPToInt(SDNode *N, const SDLoc &DL, SelectionDAG &DAG) { SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); @@ -18265,12 +18255,12 @@ static SDValue FoldIntToFPToInt(SDNode *N, SelectionDAG &DAG) { // represented exactly in the float range. if (APFloat::semanticsPrecision(Sem) >= ActualSize) { if (VT.getScalarSizeInBits() > SrcVT.getScalarSizeInBits()) { - unsigned ExtOp = IsInputSigned && IsOutputSigned ? ISD::SIGN_EXTEND - : ISD::ZERO_EXTEND; - return DAG.getNode(ExtOp, SDLoc(N), VT, Src); + unsigned ExtOp = + IsInputSigned && IsOutputSigned ? 
ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; + return DAG.getNode(ExtOp, DL, VT, Src); } if (VT.getScalarSizeInBits() < SrcVT.getScalarSizeInBits()) - return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Src); + return DAG.getNode(ISD::TRUNCATE, DL, VT, Src); return DAG.getBitcast(VT, Src); } return SDValue(); @@ -18279,31 +18269,33 @@ static SDValue FoldIntToFPToInt(SDNode *N, SelectionDAG &DAG) { SDValue DAGCombiner::visitFP_TO_SINT(SDNode *N) { SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); + SDLoc DL(N); // fold (fp_to_sint undef) -> undef if (N0.isUndef()) return DAG.getUNDEF(VT); // fold (fp_to_sint c1fp) -> c1 - if (DAG.isConstantFPBuildVectorOrConstantFP(N0)) - return DAG.getNode(ISD::FP_TO_SINT, SDLoc(N), VT, N0); + if (SDValue C = DAG.FoldConstantArithmetic(ISD::FP_TO_SINT, DL, VT, {N0})) + return C; - return FoldIntToFPToInt(N, DAG); + return FoldIntToFPToInt(N, DL, DAG); } SDValue DAGCombiner::visitFP_TO_UINT(SDNode *N) { SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); + SDLoc DL(N); // fold (fp_to_uint undef) -> undef if (N0.isUndef()) return DAG.getUNDEF(VT); // fold (fp_to_uint c1fp) -> c1 - if (DAG.isConstantFPBuildVectorOrConstantFP(N0)) - return DAG.getNode(ISD::FP_TO_UINT, SDLoc(N), VT, N0); + if (SDValue C = DAG.FoldConstantArithmetic(ISD::FP_TO_UINT, DL, VT, {N0})) + return C; - return FoldIntToFPToInt(N, DAG); + return FoldIntToFPToInt(N, DL, DAG); } SDValue DAGCombiner::visitXROUND(SDNode *N) { @@ -18452,8 +18444,8 @@ SDValue DAGCombiner::visitFCEIL(SDNode *N) { EVT VT = N->getValueType(0); // fold (fceil c1) -> fceil(c1) - if (DAG.isConstantFPBuildVectorOrConstantFP(N0)) - return DAG.getNode(ISD::FCEIL, SDLoc(N), VT, N0); + if (SDValue C = DAG.FoldConstantArithmetic(ISD::FCEIL, SDLoc(N), VT, {N0})) + return C; return SDValue(); } @@ -18463,8 +18455,8 @@ SDValue DAGCombiner::visitFTRUNC(SDNode *N) { EVT VT = N->getValueType(0); // fold (ftrunc c1) -> ftrunc(c1) - if (DAG.isConstantFPBuildVectorOrConstantFP(N0)) - return DAG.getNode(ISD::FTRUNC, SDLoc(N), VT, N0); + if (SDValue C = DAG.FoldConstantArithmetic(ISD::FTRUNC, SDLoc(N), VT, {N0})) + return C; // fold ftrunc (known rounded int x) -> x // ftrunc is a part of fptosi/fptoui expansion on some targets, so this is @@ -18497,8 +18489,8 @@ SDValue DAGCombiner::visitFFLOOR(SDNode *N) { EVT VT = N->getValueType(0); // fold (ffloor c1) -> ffloor(c1) - if (DAG.isConstantFPBuildVectorOrConstantFP(N0)) - return DAG.getNode(ISD::FFLOOR, SDLoc(N), VT, N0); + if (SDValue C = DAG.FoldConstantArithmetic(ISD::FFLOOR, SDLoc(N), VT, {N0})) + return C; return SDValue(); } @@ -18509,8 +18501,8 @@ SDValue DAGCombiner::visitFNEG(SDNode *N) { SelectionDAG::FlagInserter FlagsInserter(DAG, N); // Constant fold FNEG. - if (DAG.isConstantFPBuildVectorOrConstantFP(N0)) - return DAG.getNode(ISD::FNEG, SDLoc(N), VT, N0); + if (SDValue C = DAG.FoldConstantArithmetic(ISD::FNEG, SDLoc(N), VT, {N0})) + return C; if (SDValue NegN0 = TLI.getNegatedExpression(N0, DAG, LegalOperations, ForCodeSize)) @@ -22724,7 +22716,7 @@ SDValue DAGCombiner::scalarizeExtractedVectorLoad(SDNode *EVE, EVT InVecVT, return SDValue(); ISD::LoadExtType ExtTy = - ResultVT.bitsGT(VecEltVT) ? ISD::NON_EXTLOAD : ISD::EXTLOAD; + ResultVT.bitsGT(VecEltVT) ? 
ISD::EXTLOAD : ISD::NON_EXTLOAD; if (!TLI.isOperationLegalOrCustom(ISD::LOAD, VecEltVT) || !TLI.shouldReduceLoadWidth(OriginalLoad, ExtTy, VecEltVT)) return SDValue(); diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp index ea22b4670d6f1f7f0e76c47f61d12bd4cc24a061..e0a03383358b76a4bbead72fa037862e107aff43 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -4600,6 +4600,11 @@ void SelectionDAGLegalize::ConvertNodeToLibcall(SDNode *Node) { ExpandFPLibCall(Node, RTLIB::ATAN_F32, RTLIB::ATAN_F64, RTLIB::ATAN_F80, RTLIB::ATAN_F128, RTLIB::ATAN_PPCF128, Results); break; + case ISD::FATAN2: + case ISD::STRICT_FATAN2: + ExpandFPLibCall(Node, RTLIB::ATAN2_F32, RTLIB::ATAN2_F64, RTLIB::ATAN2_F80, + RTLIB::ATAN2_F128, RTLIB::ATAN2_PPCF128, Results); + break; case ISD::FSINH: case ISD::STRICT_FSINH: ExpandFPLibCall(Node, RTLIB::SINH_F32, RTLIB::SINH_F64, RTLIB::SINH_F80, @@ -5486,6 +5491,7 @@ void SelectionDAGLegalize::PromoteNode(SDNode *Node) { case ISD::FMINIMUMNUM: case ISD::FMAXIMUMNUM: case ISD::FPOW: + case ISD::FATAN2: Tmp1 = DAG.getNode(ISD::FP_EXTEND, dl, NVT, Node->getOperand(0)); Tmp2 = DAG.getNode(ISD::FP_EXTEND, dl, NVT, Node->getOperand(1)); Tmp3 = DAG.getNode(Node->getOpcode(), dl, NVT, Tmp1, Tmp2, @@ -5502,6 +5508,7 @@ void SelectionDAGLegalize::PromoteNode(SDNode *Node) { case ISD::STRICT_FMAXNUM: case ISD::STRICT_FREM: case ISD::STRICT_FPOW: + case ISD::STRICT_FATAN2: Tmp1 = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {NVT, MVT::Other}, {Node->getOperand(0), Node->getOperand(1)}); Tmp2 = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {NVT, MVT::Other}, diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp index 2c81c829e75cbbb69d5ac7d4c4b81f7446c5192b..73c258f0f6f18c297a3a185613831e58e10f56bc 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp @@ -84,6 +84,8 @@ void DAGTypeLegalizer::SoftenFloatResult(SDNode *N, unsigned ResNo) { case ISD::FASIN: R = SoftenFloatRes_FASIN(N); break; case ISD::STRICT_FATAN: case ISD::FATAN: R = SoftenFloatRes_FATAN(N); break; + case ISD::STRICT_FATAN2: + case ISD::FATAN2: R = SoftenFloatRes_FATAN2(N); break; case ISD::FCBRT: R = SoftenFloatRes_FCBRT(N); break; case ISD::STRICT_FCEIL: case ISD::FCEIL: R = SoftenFloatRes_FCEIL(N); break; @@ -366,6 +368,13 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FATAN(SDNode *N) { RTLIB::ATAN_F80, RTLIB::ATAN_F128, RTLIB::ATAN_PPCF128)); } +SDValue DAGTypeLegalizer::SoftenFloatRes_FATAN2(SDNode *N) { + return SoftenFloatRes_Binary( + N, + GetFPLibCall(N->getValueType(0), RTLIB::ATAN2_F32, RTLIB::ATAN2_F64, + RTLIB::ATAN2_F80, RTLIB::ATAN2_F128, RTLIB::ATAN2_PPCF128)); +} + SDValue DAGTypeLegalizer::SoftenFloatRes_FCBRT(SDNode *N) { return SoftenFloatRes_Unary(N, GetFPLibCall(N->getValueType(0), RTLIB::CBRT_F32, @@ -1430,6 +1439,8 @@ void DAGTypeLegalizer::ExpandFloatResult(SDNode *N, unsigned ResNo) { case ISD::FASIN: ExpandFloatRes_FASIN(N, Lo, Hi); break; case ISD::STRICT_FATAN: case ISD::FATAN: ExpandFloatRes_FATAN(N, Lo, Hi); break; + case ISD::STRICT_FATAN2: + case ISD::FATAN2: ExpandFloatRes_FATAN2(N, Lo, Hi); break; case ISD::FCBRT: ExpandFloatRes_FCBRT(N, Lo, Hi); break; case ISD::STRICT_FCEIL: case ISD::FCEIL: ExpandFloatRes_FCEIL(N, Lo, Hi); break; @@ -1631,6 +1642,15 @@ void DAGTypeLegalizer::ExpandFloatRes_FATAN(SDNode *N, SDValue &Lo, Lo, Hi); } +void 
DAGTypeLegalizer::ExpandFloatRes_FATAN2(SDNode *N, SDValue &Lo, + SDValue &Hi) { + ExpandFloatRes_Binary(N, + GetFPLibCall(N->getValueType(0), RTLIB::ATAN2_F32, + RTLIB::ATAN2_F64, RTLIB::ATAN2_F80, + RTLIB::ATAN2_F128, RTLIB::ATAN2_PPCF128), + Lo, Hi); +} + void DAGTypeLegalizer::ExpandFloatRes_FCBRT(SDNode *N, SDValue &Lo, SDValue &Hi) { ExpandFloatRes_Unary(N, GetFPLibCall(N->getValueType(0), RTLIB::CBRT_F32, @@ -2673,6 +2693,7 @@ void DAGTypeLegalizer::PromoteFloatResult(SDNode *N, unsigned ResNo) { case ISD::FMINNUM_IEEE: case ISD::FMUL: case ISD::FPOW: + case ISD::FATAN2: case ISD::FREM: case ISD::FSUB: R = PromoteFloatRes_BinOp(N); break; @@ -3115,6 +3136,7 @@ void DAGTypeLegalizer::SoftPromoteHalfResult(SDNode *N, unsigned ResNo) { case ISD::FMINNUM: case ISD::FMUL: case ISD::FPOW: + case ISD::FATAN2: case ISD::FREM: case ISD::FSUB: R = SoftPromoteHalfRes_BinOp(N); break; diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h index d14516ef3e2fbb6c102764586a232d11dc5637d0..868da25ca8cb474e5d0255b6546e6fbacd840546 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h @@ -567,6 +567,7 @@ private: SDValue SoftenFloatRes_FACOS(SDNode *N); SDValue SoftenFloatRes_FASIN(SDNode *N); SDValue SoftenFloatRes_FATAN(SDNode *N); + SDValue SoftenFloatRes_FATAN2(SDNode *N); SDValue SoftenFloatRes_FMINNUM(SDNode *N); SDValue SoftenFloatRes_FMAXNUM(SDNode *N); SDValue SoftenFloatRes_FMINIMUMNUM(SDNode *N); @@ -661,6 +662,7 @@ private: void ExpandFloatRes_FACOS (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandFloatRes_FASIN (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandFloatRes_FATAN (SDNode *N, SDValue &Lo, SDValue &Hi); + void ExpandFloatRes_FATAN2 (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandFloatRes_FMINNUM (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandFloatRes_FMAXNUM (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandFloatRes_FMINIMUMNUM(SDNode *N, SDValue &Lo, SDValue &Hi); diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp index ffecca78a2252cc6bad4251694180692d295468b..a8042fc3e7a69a204e49a070919eb539a52f9bdc 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp @@ -410,6 +410,7 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) { case ISD::FASIN: case ISD::FACOS: case ISD::FATAN: + case ISD::FATAN2: case ISD::FSINH: case ISD::FCOSH: case ISD::FTANH: diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index 3cb6f3f56ec9ae6a7da9b1a5f8f1c800d0b8f663..b2e784aba56b28a4e10534fbb7432dfa6a636a80 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -164,6 +164,7 @@ void DAGTypeLegalizer::ScalarizeVectorResult(SDNode *N, unsigned ResNo) { case ISD::USHLSAT: case ISD::FPOW: + case ISD::FATAN2: case ISD::FREM: case ISD::FSUB: case ISD::MUL: @@ -1293,6 +1294,7 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) { case ISD::UDIV: case ISD::VP_UDIV: case ISD::FDIV: case ISD::VP_FDIV: case ISD::FPOW: + case ISD::FATAN2: case ISD::AND: case ISD::VP_AND: case ISD::OR: case ISD::VP_OR: case ISD::XOR: case ISD::VP_XOR: @@ -4582,6 +4584,7 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) { break; case ISD::FPOW: + case ISD::FATAN2: case ISD::FREM: if (unrollExpandedOp()) 
break; diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index d342a14b9775fe385cb34bf62359393d154209d9..3888825bd4499d1664042e46989b7d806f58e2d0 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -1644,7 +1644,11 @@ SDValue SelectionDAG::getConstant(uint64_t Val, const SDLoc &DL, EVT VT, assert((EltVT.getSizeInBits() >= 64 || (uint64_t)((int64_t)Val >> EltVT.getSizeInBits()) + 1 < 2) && "getConstant with a uint64_t value that doesn't fit in the type!"); - return getConstant(APInt(EltVT.getSizeInBits(), Val), DL, VT, isT, isO); + // TODO: Avoid implicit trunc? + // See https://github.com/llvm/llvm-project/issues/112510. + return getConstant(APInt(EltVT.getSizeInBits(), Val, /*isSigned=*/false, + /*implicitTrunc=*/true), + DL, VT, isT, isO); } SDValue SelectionDAG::getConstant(const APInt &Val, const SDLoc &DL, EVT VT, @@ -5474,6 +5478,7 @@ bool SelectionDAG::isKnownNeverNaN(SDValue Op, bool SNaN, unsigned Depth) const case ISD::FASIN: case ISD::FACOS: case ISD::FATAN: + case ISD::FATAN2: case ISD::FSINH: case ISD::FCOSH: case ISD::FTANH: diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 625974e782e37c62f0a8591c35ff9f540a2afa58..61f1bd8d5caac429764f0659a40ae3bc4f11e286 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -3139,7 +3139,7 @@ void SelectionDAGBuilder::visitSPDescriptorParent(StackProtectorDescriptor &SPD, // If useLoadStackGuardNode returns true, generate LOAD_STACK_GUARD. // Otherwise, emit a volatile load to retrieve the stack guard value. SDValue Chain = DAG.getEntryNode(); - if (TLI.useLoadStackGuardNode()) { + if (TLI.useLoadStackGuardNode(M)) { Guard = getLoadStackGuard(DAG, dl, Chain); } else { const Value *IRGuard = TLI.getSDagStackGuard(M); @@ -4376,7 +4376,8 @@ void SelectionDAGBuilder::visitGetElementPtr(const User &I) { GTI.getSequentialElementStride(DAG.getDataLayout()); // We intentionally mask away the high bits here; ElementSize may not // fit in IdxTy. - APInt ElementMul(IdxSize, ElementSize.getKnownMinValue()); + APInt ElementMul(IdxSize, ElementSize.getKnownMinValue(), + /*isSigned=*/false, /*implicitTrunc=*/true); bool ElementScalable = ElementSize.isScalable(); // If this is a scalar constant or a splat vector of constants, @@ -6966,6 +6967,12 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, getValue(I.getArgOperand(0)), Flags)); return; } + case Intrinsic::atan2: + setValue(&I, DAG.getNode(ISD::FATAN2, sdl, + getValue(I.getArgOperand(0)).getValueType(), + getValue(I.getArgOperand(0)), + getValue(I.getArgOperand(1)), Flags)); + return; case Intrinsic::lround: case Intrinsic::llround: case Intrinsic::lrint: @@ -7447,7 +7454,7 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, const Module &M = *MF.getFunction().getParent(); EVT PtrTy = TLI.getValueType(DAG.getDataLayout(), I.getType()); SDValue Chain = getRoot(); - if (TLI.useLoadStackGuardNode()) { + if (TLI.useLoadStackGuardNode(M)) { Res = getLoadStackGuard(DAG, sdl, Chain); Res = DAG.getPtrExtOrTrunc(Res, sdl, PtrTy); } else { @@ -7467,9 +7474,10 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, // Emit code into the DAG to store the stack guard onto the stack. 
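  // A rough sketch of the two guard paths this comment refers to (simplified
  // for illustration; names as used in this function):
  //   if (TLI.useLoadStackGuardNode(M))
  //     Src = getLoadStackGuard(DAG, sdl, Chain); // LOAD_STACK_GUARD pseudo
  //   else
  //     Src = getValue(I.getArgOperand(0));       // guard value from the IR
  // The new Module parameter presumably lets targets consult module-level
  // state when choosing between the two forms.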
MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo &MFI = MF.getFrameInfo(); + const Module &M = *MF.getFunction().getParent(); SDValue Src, Chain = getRoot(); - if (TLI.useLoadStackGuardNode()) + if (TLI.useLoadStackGuardNode(M)) Src = getLoadStackGuard(DAG, sdl, Chain); else Src = getValue(I.getArgOperand(0)); // The guard's value. @@ -9546,6 +9554,12 @@ void SelectionDAGBuilder::visitCall(const CallInst &I) { if (visitUnaryFloatCall(I, ISD::FATAN)) return; break; + case LibFunc_atan2: + case LibFunc_atan2f: + case LibFunc_atan2l: + if (visitBinaryFloatCall(I, ISD::FATAN2)) + return; + break; case LibFunc_sinh: case LibFunc_sinhf: case LibFunc_sinhl: diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp index 56fc538172f9fc87770e593db595494e07686e79..703efb70089742f09ec15ac4fa04884e093c805f 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp @@ -227,6 +227,8 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const { case ISD::STRICT_FACOS: return "strict_facos"; case ISD::FATAN: return "fatan"; case ISD::STRICT_FATAN: return "strict_fatan"; + case ISD::FATAN2: return "fatan2"; + case ISD::STRICT_FATAN2: return "strict_fatan2"; case ISD::FSINH: return "fsinh"; case ISD::STRICT_FSINH: return "strict_fsinh"; case ISD::FCOSH: return "fcosh"; diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp index 2a97580942df36c4b4956f7fdafc4192666527dd..9bd894dd952bda0d1107e3c0e892226d24ab78b6 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp @@ -2200,7 +2200,10 @@ ScheduleDAGSDNodes *SelectionDAGISel::CreateScheduler() { bool SelectionDAGISel::CheckAndMask(SDValue LHS, ConstantSDNode *RHS, int64_t DesiredMaskS) const { const APInt &ActualMask = RHS->getAPIntValue(); - const APInt &DesiredMask = APInt(LHS.getValueSizeInBits(), DesiredMaskS); + // TODO: Avoid implicit trunc? + // See https://github.com/llvm/llvm-project/issues/112510. + const APInt &DesiredMask = APInt(LHS.getValueSizeInBits(), DesiredMaskS, + /*isSigned=*/false, /*implicitTrunc=*/true); // If the actual mask exactly matches, success! if (ActualMask == DesiredMask) @@ -2229,7 +2232,10 @@ bool SelectionDAGISel::CheckAndMask(SDValue LHS, ConstantSDNode *RHS, bool SelectionDAGISel::CheckOrMask(SDValue LHS, ConstantSDNode *RHS, int64_t DesiredMaskS) const { const APInt &ActualMask = RHS->getAPIntValue(); - const APInt &DesiredMask = APInt(LHS.getValueSizeInBits(), DesiredMaskS); + // TODO: Avoid implicit trunc? + // See https://github.com/llvm/llvm-project/issues/112510. + const APInt &DesiredMask = APInt(LHS.getValueSizeInBits(), DesiredMaskS, + /*isSigned=*/false, /*implicitTrunc=*/true); // If the actual mask exactly matches, success! 
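  // Worked example (hypothetical values): with DesiredMask = 0xFF and an
  // actual AND mask of 0x7F, the masks differ, but if the DAG proves bit 7
  // of the LHS is already zero, (and x, 0x7F) tests the same bits as
  // (and x, 0xFF), so the code below still reports a match.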
if (ActualMask == DesiredMask) diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index a05202e9d3cc43d6bd7c0512ae38816998dd3fb1..3cbe9aa63a0324501f15205be8125b1aa2c03d88 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -6813,7 +6813,9 @@ TargetLowering::prepareUREMEqFold(EVT SETCCVT, SDValue REMNode, PAmts.push_back(DAG.getConstant(P, DL, SVT)); KAmts.push_back( - DAG.getConstant(APInt(ShSVT.getSizeInBits(), K), DL, ShSVT)); + DAG.getConstant(APInt(ShSVT.getSizeInBits(), K, /*isSigned=*/false, + /*implicitTrunc=*/true), + DL, ShSVT)); QAmts.push_back(DAG.getConstant(Q, DL, SVT)); return true; }; @@ -7084,7 +7086,9 @@ TargetLowering::prepareSREMEqFold(EVT SETCCVT, SDValue REMNode, PAmts.push_back(DAG.getConstant(P, DL, SVT)); AAmts.push_back(DAG.getConstant(A, DL, SVT)); KAmts.push_back( - DAG.getConstant(APInt(ShSVT.getSizeInBits(), K), DL, ShSVT)); + DAG.getConstant(APInt(ShSVT.getSizeInBits(), K, /*isSigned=*/false, + /*implicitTrunc=*/true), + DL, ShSVT)); QAmts.push_back(DAG.getConstant(Q, DL, SVT)); return true; }; @@ -8813,6 +8817,31 @@ SDValue TargetLowering::expandIS_FPCLASS(EVT ResultVT, SDValue Op, IsOrdered ? OrderedOp : UnorderedOp); } } + + if (FPTestMask == fcNormal) { + // TODO: Handle unordered + ISD::CondCode IsFiniteOp = IsInvertedFP ? ISD::SETUGE : ISD::SETOLT; + ISD::CondCode IsNormalOp = IsInvertedFP ? ISD::SETOLT : ISD::SETUGE; + + if (isCondCodeLegalOrCustom(IsFiniteOp, + OperandVT.getScalarType().getSimpleVT()) && + isCondCodeLegalOrCustom(IsNormalOp, + OperandVT.getScalarType().getSimpleVT()) && + isFAbsFree(OperandVT)) { + // isnormal(x) --> fabs(x) < infinity && !(fabs(x) < smallest_normal) + SDValue Inf = + DAG.getConstantFP(APFloat::getInf(Semantics), DL, OperandVT); + SDValue SmallestNormal = DAG.getConstantFP( + APFloat::getSmallestNormalized(Semantics), DL, OperandVT); + + SDValue Abs = DAG.getNode(ISD::FABS, DL, OperandVT, Op); + SDValue IsFinite = DAG.getSetCC(DL, ResultVT, Abs, Inf, IsFiniteOp); + SDValue IsNormal = + DAG.getSetCC(DL, ResultVT, Abs, SmallestNormal, IsNormalOp); + unsigned LogicOp = IsInvertedFP ? 
ISD::OR : ISD::AND; + return DAG.getNode(LogicOp, DL, ResultVT, IsFinite, IsNormal); + } + } } // Some checks may be represented as inversion of simpler check, for example diff --git a/llvm/lib/CodeGen/StackProtector.cpp b/llvm/lib/CodeGen/StackProtector.cpp index a192161bbd9481dc207e1ab9e548886e3958aac1..0ce305c4410d271d8b1197d641a8a71f7bcdd994 100644 --- a/llvm/lib/CodeGen/StackProtector.cpp +++ b/llvm/lib/CodeGen/StackProtector.cpp @@ -519,8 +519,7 @@ static Value *getStackGuard(const TargetLoweringBase *TLI, Module *M, if (SupportsSelectionDAGSP) *SupportsSelectionDAGSP = true; TLI->insertSSPDeclarations(*M); - return B.CreateCall( - Intrinsic::getOrInsertDeclaration(M, Intrinsic::stackguard)); + return B.CreateIntrinsic(Intrinsic::stackguard, {}, {}); } /// Insert code into the entry block that stores the stack guard @@ -541,8 +540,7 @@ static bool CreatePrologue(Function *F, Module *M, Instruction *CheckLoc, AI = B.CreateAlloca(PtrTy, nullptr, "StackGuardSlot"); Value *GuardSlot = getStackGuard(TLI, M, B, &SupportsSelectionDAGSP); - B.CreateCall(Intrinsic::getOrInsertDeclaration(M, Intrinsic::stackprotector), - {GuardSlot, AI}); + B.CreateIntrinsic(Intrinsic::stackprotector, {}, {GuardSlot, AI}); return SupportsSelectionDAGSP; } diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp index 1f49d60c97059310c06f5bee09569bad9dc6f1b8..7a28f7892cbf310abf4208bf82d6c7924e4f1cbe 100644 --- a/llvm/lib/CodeGen/TargetLoweringBase.cpp +++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp @@ -783,7 +783,7 @@ void TargetLoweringBase::initActions() { ISD::SIGN_EXTEND_VECTOR_INREG, ISD::ZERO_EXTEND_VECTOR_INREG, ISD::SPLAT_VECTOR, ISD::LRINT, ISD::LLRINT, ISD::LROUND, ISD::LLROUND, ISD::FTAN, ISD::FACOS, ISD::FASIN, ISD::FATAN, - ISD::FCOSH, ISD::FSINH, ISD::FTANH}, + ISD::FCOSH, ISD::FSINH, ISD::FTANH, ISD::FATAN2}, VT, Expand); // Constrained floating-point operations default to expand. @@ -842,7 +842,8 @@ void TargetLoweringBase::initActions() { ISD::FEXP, ISD::FEXP2, ISD::FEXP10, ISD::FFLOOR, ISD::FNEARBYINT, ISD::FCEIL, ISD::FRINT, ISD::FTRUNC, ISD::FROUNDEVEN, ISD::FTAN, ISD::FACOS, ISD::FASIN, - ISD::FATAN, ISD::FCOSH, ISD::FSINH, ISD::FTANH}, + ISD::FATAN, ISD::FCOSH, ISD::FSINH, ISD::FTANH, + ISD::FATAN2}, {MVT::f32, MVT::f64, MVT::f128}, Expand); // FIXME: Query RuntimeLibCalls to make the decision. @@ -850,7 +851,7 @@ void TargetLoweringBase::initActions() { {MVT::f32, MVT::f64, MVT::f128}, LibCall); setOperationAction({ISD::FTAN, ISD::FACOS, ISD::FASIN, ISD::FATAN, ISD::FCOSH, - ISD::FSINH, ISD::FTANH}, + ISD::FSINH, ISD::FTANH, ISD::FATAN2}, MVT::f16, Promote); // Default ISD::TRAP to expand (which turns it into abort). setOperationAction(ISD::TRAP, MVT::Other, Expand); diff --git a/llvm/lib/CodeGen/TargetOptionsImpl.cpp b/llvm/lib/CodeGen/TargetOptionsImpl.cpp index 5bf1d265092f6f855f4cf1cc9974fcc75ceb2e26..01ffaed585ea163cb87f528abf2ba812eabaaa02 100644 --- a/llvm/lib/CodeGen/TargetOptionsImpl.cpp +++ b/llvm/lib/CodeGen/TargetOptionsImpl.cpp @@ -22,10 +22,6 @@ using namespace llvm; /// DisableFramePointerElim - This returns true if frame pointer elimination /// optimization should be disabled for the given machine function. bool TargetOptions::DisableFramePointerElim(const MachineFunction &MF) const { - // Check to see if the target want to forcibly keep frame pointer. 
- if (MF.getSubtarget().getFrameLowering()->keepFramePointer(MF)) - return true; - const Function &F = MF.getFunction(); if (!F.hasFnAttribute("frame-pointer")) @@ -41,10 +37,6 @@ bool TargetOptions::DisableFramePointerElim(const MachineFunction &MF) const { } bool TargetOptions::FramePointerIsReserved(const MachineFunction &MF) const { - // Check to see if the target want to forcibly keep frame pointer. - if (MF.getSubtarget().getFrameLowering()->keepFramePointer(MF)) - return true; - const Function &F = MF.getFunction(); if (!F.hasFnAttribute("frame-pointer")) diff --git a/llvm/lib/CodeGen/TargetPassConfig.cpp b/llvm/lib/CodeGen/TargetPassConfig.cpp index cf9d63df2515ccc52e9a847889c833f69f4435b7..02c3a852697580d0a6b311fb1794c5d9a33e2256 100644 --- a/llvm/lib/CodeGen/TargetPassConfig.cpp +++ b/llvm/lib/CodeGen/TargetPassConfig.cpp @@ -305,7 +305,7 @@ static IdentifyingPassPtr overridePass(AnalysisID StandardID, if (StandardID == &DeadMachineInstructionElimID) return applyDisable(TargetID, DisableMachineDCE); - if (StandardID == &EarlyIfConverterID) + if (StandardID == &EarlyIfConverterLegacyID) return applyDisable(TargetID, DisableEarlyIfConversion); if (StandardID == &EarlyMachineLICMID) @@ -521,7 +521,7 @@ void llvm::registerCodeGenCallback(PassInstrumentationCallbacks &PIC, DISABLE_PASS(DisableBlockPlacement, MachineBlockPlacementPass) DISABLE_PASS(DisableBranchFold, BranchFolderPass) DISABLE_PASS(DisableCopyProp, MachineCopyPropagationPass) - DISABLE_PASS(DisableEarlyIfConversion, EarlyIfConverterPass) + DISABLE_PASS(DisableEarlyIfConversion, EarlyIfConverterLegacyPass) DISABLE_PASS(DisableEarlyTailDup, EarlyTailDuplicatePass) DISABLE_PASS(DisableMachineCSE, MachineCSELegacyPass) DISABLE_PASS(DisableMachineDCE, DeadMachineInstructionElimPass) diff --git a/llvm/lib/DebugInfo/GSYM/DwarfTransformer.cpp b/llvm/lib/DebugInfo/GSYM/DwarfTransformer.cpp index 894abf5777f1614eec6d05c2b059ae9ad2574fc3..3f5604e6aa4b06204432e26dacb077e8ff0cf048 100644 --- a/llvm/lib/DebugInfo/GSYM/DwarfTransformer.cpp +++ b/llvm/lib/DebugInfo/GSYM/DwarfTransformer.cpp @@ -699,7 +699,6 @@ llvm::Error DwarfTransformer::verify(StringRef GsymPath, Log << " [" << Idx << "]: " << gii.Name << " @ " << gii.Dir << '/' << gii.Base << ':' << gii.Line << '\n'; } - DwarfInlineInfos = DICtx.getInliningInfoForAddress(SectAddr, DLIS); Gsym->dump(Log, *FI); } continue; diff --git a/llvm/lib/DebugInfo/LogicalView/Readers/LVDWARFReader.cpp b/llvm/lib/DebugInfo/LogicalView/Readers/LVDWARFReader.cpp index e85356b5eab0f4eda279a4582e004ca45c9973b2..1c523c0131497718b664dea9eee72076c956a278 100644 --- a/llvm/lib/DebugInfo/LogicalView/Readers/LVDWARFReader.cpp +++ b/llvm/lib/DebugInfo/LogicalView/Readers/LVDWARFReader.cpp @@ -1139,9 +1139,8 @@ void LVDWARFReader::updateReference(dwarf::Attribute Attr, // Get an element given the DIE offset. LVElement *LVDWARFReader::getElementForOffset(LVOffset Offset, LVElement *Element, bool IsType) { - auto Iter = ElementTable.try_emplace(Offset).first; // Update the element and all the references pointing to this element. 
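  // Design note: both forms default-construct the entry on first access; the
  // map operator[] simply avoids the unused iterator. A minimal sketch of
  // the equivalence (usage illustrative):
  //   LVElementEntry &A = ElementTable.try_emplace(Offset).first->second;
  //   LVElementEntry &B = ElementTable[Offset]; // same slot, less ceremony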
- LVElementEntry &Entry = Iter->second; + LVElementEntry &Entry = ElementTable[Offset]; if (!Entry.Element) { if (IsType) Entry.Types.insert(Element); diff --git a/llvm/lib/ExecutionEngine/JITLink/ELF_i386.cpp b/llvm/lib/ExecutionEngine/JITLink/ELF_i386.cpp index 860165365a7e4f2a92fc4b1b9be6925e0be0ea08..2d5f28cad1cc6d24844c7d1b636f95af65adf73e 100644 --- a/llvm/lib/ExecutionEngine/JITLink/ELF_i386.cpp +++ b/llvm/lib/ExecutionEngine/JITLink/ELF_i386.cpp @@ -186,15 +186,29 @@ private: int64_t Addend = 0; switch (*Kind) { - case i386::EdgeKind_i386::Delta32: { + case i386::EdgeKind_i386::None: + break; + case i386::EdgeKind_i386::Pointer32: + case i386::EdgeKind_i386::PCRel32: + case i386::EdgeKind_i386::RequestGOTAndTransformToDelta32FromGOT: + case i386::EdgeKind_i386::Delta32: + case i386::EdgeKind_i386::Delta32FromGOT: + case i386::EdgeKind_i386::BranchPCRel32: + case i386::EdgeKind_i386::BranchPCRel32ToPtrJumpStub: + case i386::EdgeKind_i386::BranchPCRel32ToPtrJumpStubBypassable: { const char *FixupContent = BlockToFix.getContent().data() + (FixupAddress - BlockToFix.getAddress()); - Addend = *(const support::ulittle32_t *)FixupContent; + Addend = *(const support::little32_t *)FixupContent; break; } - default: + case i386::EdgeKind_i386::Pointer16: + case i386::EdgeKind_i386::PCRel16: { + const char *FixupContent = BlockToFix.getContent().data() + + (FixupAddress - BlockToFix.getAddress()); + Addend = *(const support::little16_t *)FixupContent; break; } + } Edge::OffsetT Offset = FixupAddress - BlockToFix.getAddress(); Edge GE(*Kind, Offset, *GraphSymbol, Addend); diff --git a/llvm/lib/ExecutionEngine/JITLink/ELF_x86_64.cpp b/llvm/lib/ExecutionEngine/JITLink/ELF_x86_64.cpp index 6a32ccc377651047f4aef9074670c87718dd2619..44122726fb5c08dd16ddc65031c0c16d4dbade3b 100644 --- a/llvm/lib/ExecutionEngine/JITLink/ELF_x86_64.cpp +++ b/llvm/lib/ExecutionEngine/JITLink/ELF_x86_64.cpp @@ -182,6 +182,12 @@ private: case ELF::R_X86_64_64: Kind = x86_64::Pointer64; break; + case ELF::R_X86_64_SIZE32: + Kind = x86_64::Size32; + break; + case ELF::R_X86_64_SIZE64: + Kind = x86_64::Size64; + break; case ELF::R_X86_64_GOTPCREL: Kind = x86_64::RequestGOTAndTransformToDelta32; break; diff --git a/llvm/lib/ExecutionEngine/JITLink/x86_64.cpp b/llvm/lib/ExecutionEngine/JITLink/x86_64.cpp index cca4358a377660643d74211b60bea9661fe760a5..e5b48d2c3fab0e4888f7b0569aba7aa6f8f34ae7 100644 --- a/llvm/lib/ExecutionEngine/JITLink/x86_64.cpp +++ b/llvm/lib/ExecutionEngine/JITLink/x86_64.cpp @@ -42,6 +42,10 @@ const char *getEdgeKindName(Edge::Kind K) { return "NegDelta64"; case NegDelta32: return "NegDelta32"; + case Size64: + return "Size64"; + case Size32: + return "Size32"; case Delta64FromGOT: return "Delta64FromGOT"; case PCRel32: diff --git a/llvm/lib/ExecutionEngine/MCJIT/MCJIT.cpp b/llvm/lib/ExecutionEngine/MCJIT/MCJIT.cpp index 4cce4a77b343f0e70ab42a395f2782ac3c1f6713..e3b7db2380bb00c13efed1cdaeeb63b21b250f29 100644 --- a/llvm/lib/ExecutionEngine/MCJIT/MCJIT.cpp +++ b/llvm/lib/ExecutionEngine/MCJIT/MCJIT.cpp @@ -588,7 +588,7 @@ GenericValue MCJIT::runFunction(Function *F, ArrayRef ArgValues) { return rv; } case Type::VoidTyID: - rv.IntVal = APInt(32, ((int(*)())(intptr_t)FPtr)()); + rv.IntVal = APInt(32, ((int (*)())(intptr_t)FPtr)(), true); return rv; case Type::FloatTyID: rv.FloatVal = ((float(*)())(intptr_t)FPtr)(); diff --git a/llvm/lib/ExecutionEngine/Orc/COFFPlatform.cpp b/llvm/lib/ExecutionEngine/Orc/COFFPlatform.cpp index cdfcae86f79c282419d8f75ca4edea1c9bc64197..f46cb906bb755683791aea935f63a34239667b92 
100644
--- a/llvm/lib/ExecutionEngine/Orc/COFFPlatform.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/COFFPlatform.cpp
@@ -159,11 +159,14 @@ private:
 namespace llvm {
 namespace orc {
 
-Expected<std::unique_ptr<COFFPlatform>> COFFPlatform::Create(
-    ExecutionSession &ES, ObjectLinkingLayer &ObjLinkingLayer,
-    JITDylib &PlatformJD, std::unique_ptr<MemoryBuffer> OrcRuntimeArchiveBuffer,
-    LoadDynamicLibrary LoadDynLibrary, bool StaticVCRuntime,
-    const char *VCRuntimePath, std::optional<SymbolAliasMap> RuntimeAliases) {
+Expected<std::unique_ptr<COFFPlatform>>
+COFFPlatform::Create(ObjectLinkingLayer &ObjLinkingLayer, JITDylib &PlatformJD,
+                     std::unique_ptr<MemoryBuffer> OrcRuntimeArchiveBuffer,
+                     LoadDynamicLibrary LoadDynLibrary, bool StaticVCRuntime,
+                     const char *VCRuntimePath,
+                     std::optional<SymbolAliasMap> RuntimeAliases) {
+
+  auto &ES = ObjLinkingLayer.getExecutionSession();
 
   // If the target is not supported then bail out immediately.
   if (!supportedTarget(ES.getTargetTriple()))
@@ -214,7 +217,7 @@ Expected<std::unique_ptr<COFFPlatform>> COFFPlatform::Create(
   // Create the instance.
   Error Err = Error::success();
   auto P = std::unique_ptr<COFFPlatform>(new COFFPlatform(
-      ES, ObjLinkingLayer, PlatformJD, std::move(*OrcRuntimeArchiveGenerator),
+      ObjLinkingLayer, PlatformJD, std::move(*OrcRuntimeArchiveGenerator),
       std::move(OrcRuntimeArchiveBuffer), std::move(RuntimeArchive),
       std::move(LoadDynLibrary), StaticVCRuntime, VCRuntimePath, Err));
   if (Err)
@@ -223,8 +226,8 @@ }
 
 Expected<std::unique_ptr<COFFPlatform>>
-COFFPlatform::Create(ExecutionSession &ES, ObjectLinkingLayer &ObjLinkingLayer,
-                     JITDylib &PlatformJD, const char *OrcRuntimePath,
+COFFPlatform::Create(ObjectLinkingLayer &ObjLinkingLayer, JITDylib &PlatformJD,
+                     const char *OrcRuntimePath,
                      LoadDynamicLibrary LoadDynLibrary, bool StaticVCRuntime,
                      const char *VCRuntimePath,
                      std::optional<SymbolAliasMap> RuntimeAliases) {
@@ -233,7 +236,7 @@ COFFPlatform::Create(ExecutionSession &ES, ObjectLinkingLayer &ObjLinkingLayer,
   if (!ArchiveBuffer)
     return createFileError(OrcRuntimePath, ArchiveBuffer.getError());
 
-  return Create(ES, ObjLinkingLayer, PlatformJD, std::move(*ArchiveBuffer),
+  return Create(ObjLinkingLayer, PlatformJD, std::move(*ArchiveBuffer),
                 std::move(LoadDynLibrary), StaticVCRuntime, VCRuntimePath,
                 std::move(RuntimeAliases));
 }
@@ -382,14 +385,14 @@ bool COFFPlatform::supportedTarget(const Triple &TT) {
 }
 
 COFFPlatform::COFFPlatform(
-    ExecutionSession &ES, ObjectLinkingLayer &ObjLinkingLayer,
-    JITDylib &PlatformJD,
+    ObjectLinkingLayer &ObjLinkingLayer, JITDylib &PlatformJD,
     std::unique_ptr<StaticLibraryDefinitionGenerator> OrcRuntimeGenerator,
     std::unique_ptr<MemoryBuffer> OrcRuntimeArchiveBuffer,
     std::unique_ptr<object::Archive> OrcRuntimeArchive,
     LoadDynamicLibrary LoadDynLibrary, bool StaticVCRuntime,
     const char *VCRuntimePath, Error &Err)
-    : ES(ES), ObjLinkingLayer(ObjLinkingLayer),
+    : ES(ObjLinkingLayer.getExecutionSession()),
+      ObjLinkingLayer(ObjLinkingLayer),
      LoadDynLibrary(std::move(LoadDynLibrary)),
      OrcRuntimeArchiveBuffer(std::move(OrcRuntimeArchiveBuffer)),
      OrcRuntimeArchive(std::move(OrcRuntimeArchive)),
diff --git a/llvm/lib/ExecutionEngine/Orc/ExecutionUtils.cpp b/llvm/lib/ExecutionEngine/Orc/ExecutionUtils.cpp
index c4a65ebbe5a3b89e83bc91159f2c33639c7bde89..1dcf91443d55db6276ca0789feeb8332e79ea297 100644
--- a/llvm/lib/ExecutionEngine/Orc/ExecutionUtils.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/ExecutionUtils.cpp
@@ -273,9 +273,24 @@ Error DynamicLibrarySearchGenerator::tryToGenerate(
   return JD.define(absoluteSymbols(std::move(NewSymbols)));
 }
 
+StaticLibraryDefinitionGenerator::VisitMembersFunction
+StaticLibraryDefinitionGenerator::loadAllObjectFileMembers(ObjectLayer &L,
+                                                           JITDylib &JD) {
+  return [&](MemoryBufferRef Buf) -> Error {
+    switch (identify_magic(Buf.getBuffer())) {
+    case file_magic::elf_relocatable:
+    case file_magic::macho_object:
+    case file_magic::coff_object:
+      return L.add(JD, MemoryBuffer::getMemBuffer(Buf));
+    default:
+      return Error::success();
+    }
+  };
+}
+
 Expected<std::unique_ptr<StaticLibraryDefinitionGenerator>>
 StaticLibraryDefinitionGenerator::Load(
-    ObjectLayer &L, const char *FileName,
+    ObjectLayer &L, const char *FileName, VisitMembersFunction VisitMembers,
     GetObjectFileInterface GetObjFileInterface) {
 
   const auto &TT = L.getExecutionSession().getTargetTriple();
@@ -283,17 +298,33 @@ StaticLibraryDefinitionGenerator::Load(
   if (!Linkable)
     return Linkable.takeError();
 
-  return Create(L, std::move(Linkable->first), std::move(GetObjFileInterface));
+  return Create(L, std::move(Linkable->first), std::move(VisitMembers),
+                std::move(GetObjFileInterface));
 }
 
 Expected<std::unique_ptr<StaticLibraryDefinitionGenerator>>
 StaticLibraryDefinitionGenerator::Create(
     ObjectLayer &L, std::unique_ptr<MemoryBuffer> ArchiveBuffer,
-    std::unique_ptr<object::Archive> Archive,
+    std::unique_ptr<object::Archive> Archive, VisitMembersFunction VisitMembers,
     GetObjectFileInterface GetObjFileInterface) {
 
   Error Err = Error::success();
 
+  if (VisitMembers) {
+    for (auto Child : Archive->children(Err)) {
+      if (auto ChildBuf = Child.getMemoryBufferRef()) {
+        if (auto Err2 = VisitMembers(*ChildBuf))
+          return std::move(Err2);
+      } else {
+        // We silently allow non-object archive members. This matches the
+        // behavior of ld.
+        consumeError(ChildBuf.takeError());
+      }
+    }
+    if (Err)
+      return std::move(Err);
+  }
+
   std::unique_ptr<StaticLibraryDefinitionGenerator> ADG(
       new StaticLibraryDefinitionGenerator(
           L, std::move(ArchiveBuffer), std::move(Archive),
@@ -308,6 +339,7 @@
 Expected<std::unique_ptr<StaticLibraryDefinitionGenerator>>
 StaticLibraryDefinitionGenerator::Create(
     ObjectLayer &L, std::unique_ptr<MemoryBuffer> ArchiveBuffer,
+    VisitMembersFunction VisitMembers,
     GetObjectFileInterface GetObjFileInterface) {
 
   auto B = object::createBinary(ArchiveBuffer->getMemBufferRef());
@@ -319,7 +351,7 @@ StaticLibraryDefinitionGenerator::Create(
     return Create(L, std::move(ArchiveBuffer),
                   std::unique_ptr<object::Archive>(
                       static_cast<object::Archive *>(B->release())),
-                  std::move(GetObjFileInterface));
+                  std::move(VisitMembers), std::move(GetObjFileInterface));
 
   // If this is a universal binary then search for a slice matching the given
   // Triple.
@@ -341,7 +373,7 @@ StaticLibraryDefinitionGenerator::Create( return Archive.takeError(); return Create(L, std::move(ArchiveBuffer), std::move(*Archive), - std::move(GetObjFileInterface)); + std::move(VisitMembers), std::move(GetObjFileInterface)); } return make_error(Twine("Unrecognized file type for ") + diff --git a/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp b/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp index d3db89a2c3e9f1062c100b9c35aa5e10b7d92224..401ed525fd5cfe0ee320ed8e1c733487c2922503 100644 --- a/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp +++ b/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp @@ -7,6 +7,8 @@ //===----------------------------------------------------------------------===// #include "llvm/ExecutionEngine/Orc/LLJIT.h" + +#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Config/llvm-config.h" // for LLVM_ENABLE_THREADS #include "llvm/ExecutionEngine/JITLink/EHFrameSupport.h" #include "llvm/ExecutionEngine/JITLink/JITLinkMemoryManager.h" @@ -195,10 +197,14 @@ public: auto *IntTy = Type::getIntNTy(*Ctx, sizeof(int) * CHAR_BIT); auto *AtExitCallbackTy = FunctionType::get(VoidTy, {}, false); auto *AtExitCallbackPtrTy = PointerType::getUnqual(AtExitCallbackTy); - addHelperAndWrapper(*M, "atexit", - FunctionType::get(IntTy, {AtExitCallbackPtrTy}, false), - GlobalValue::HiddenVisibility, "__lljit.atexit_helper", - {PlatformInstanceDecl, DSOHandle}); + auto *AtExit = addHelperAndWrapper( + *M, "atexit", FunctionType::get(IntTy, {AtExitCallbackPtrTy}, false), + GlobalValue::HiddenVisibility, "__lljit.atexit_helper", + {PlatformInstanceDecl, DSOHandle}); + Attribute::AttrKind AtExitExtAttr = + TargetLibraryInfo::getExtAttrForI32Return(J.getTargetTriple()); + if (AtExitExtAttr != Attribute::None) + AtExit->addRetAttr(AtExitExtAttr); return J.addIRModule(JD, ThreadSafeModule(std::move(M), std::move(Ctx))); } @@ -602,7 +608,7 @@ Error ORCPlatformSupport::initialize(orc::JITDylib &JD) { using llvm::orc::shared::SPSExecutorAddr; using llvm::orc::shared::SPSString; using SPSDLOpenSig = SPSExecutorAddr(SPSString, int32_t); - using SPSDLUpdateSig = int32_t(SPSExecutorAddr, int32_t); + using SPSDLUpdateSig = int32_t(SPSExecutorAddr); enum dlopen_mode : int32_t { ORC_RT_RTLD_LAZY = 0x1, ORC_RT_RTLD_NOW = 0x2, @@ -628,8 +634,7 @@ Error ORCPlatformSupport::initialize(orc::JITDylib &JD) { if (dlupdate) { int32_t result; auto E = ES.callSPSWrapper(WrapperAddr->getAddress(), - result, DSOHandles[&JD], - int32_t(ORC_RT_RTLD_LAZY)); + result, DSOHandles[&JD]); if (E) return E; else if (result) @@ -1172,7 +1177,7 @@ Expected ExecutorNativePlatform::operator()(LLJIT &J) { StaticVCRuntime = VCRuntime->second; } if (auto P = COFFPlatform::Create( - ES, *ObjLinkingLayer, PlatformJD, std::move(RuntimeArchiveBuffer), + *ObjLinkingLayer, PlatformJD, std::move(RuntimeArchiveBuffer), LoadAndLinkDynLibrary(J), StaticVCRuntime, VCRuntimePath)) J.getExecutionSession().setPlatform(std::move(*P)); else diff --git a/llvm/lib/ExecutionEngine/Orc/MachO.cpp b/llvm/lib/ExecutionEngine/Orc/MachO.cpp index dc6a55758d257941dad0f7524bea7bc803299c76..784c3487d64fc4567bdd6c4d28e76bd0f814e70c 100644 --- a/llvm/lib/ExecutionEngine/Orc/MachO.cpp +++ b/llvm/lib/ExecutionEngine/Orc/MachO.cpp @@ -10,6 +10,7 @@ #include "llvm/ADT/ScopeExit.h" #include "llvm/BinaryFormat/MachO.h" +#include "llvm/ExecutionEngine/Orc/Layer.h" #include "llvm/Object/MachOUniversal.h" #include "llvm/Support/FileSystem.h" @@ -228,5 +229,39 @@ getMachOSliceRangeForTriple(MemoryBufferRef UBBuf, const Triple &TT) { return getMachOSliceRangeForTriple(**UB, TT); } 
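// Hypothetical usage sketch for the functor added below (constructor and
// argument order assumed from its members; not taken from this patch). It is
// intended to be passed as the VisitMembersFunction when attaching a static
// archive, approximating ld64's -ObjC / -force_load member selection:
//   auto G = StaticLibraryDefinitionGenerator::Load(
//       ObjLayer, "/path/to/libFoo.a",
//       ForceLoadMachOArchiveMembers(ObjLayer, JD, /*ObjCOnly=*/true));
//   if (G)
//     JD.addGenerator(std::move(*G));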
+Error ForceLoadMachOArchiveMembers::operator()(MemoryBufferRef MemberBuf) { + if (!ObjCOnly) + return L.add(JD, MemoryBuffer::getMemBuffer(MemberBuf)); + + // We need to check whether this archive member contains any Objective-C + // or Swift metadata. + + auto Obj = object::ObjectFile::createObjectFile(MemberBuf); + if (!Obj) { + // We silently ignore invalid files. + consumeError(Obj.takeError()); + return Error::success(); + } + + if (auto *MachOObj = dyn_cast(&**Obj)) { + // Load the object if any recognized special section is present. + for (auto Sec : MachOObj->sections()) { + auto SegName = + MachOObj->getSectionFinalSegmentName(Sec.getRawDataRefImpl()); + if (auto SecName = Sec.getName()) { + if (*SecName == "__objc_classlist" || *SecName == "__objc_protolist" || + *SecName == "__objc_clsrolist" || *SecName == "__objc_catlist" || + *SecName == "__objc_catlist2" || *SecName == "__objc_nlcatlist" || + (SegName == "__TEXT" && (*SecName).starts_with("__swift") && + *SecName != "__swift_modhash")) + return L.add(JD, MemoryBuffer::getMemBuffer(MemberBuf)); + } else + return SecName.takeError(); + } + } + + return Error::success(); +} + } // End namespace orc. } // End namespace llvm. diff --git a/llvm/lib/IR/Attributes.cpp b/llvm/lib/IR/Attributes.cpp index c2fba49692c77431c3da0acd04b8e4a5b05f9ee3..e9daa01b899e8f3c0cc40ff00f543a5f9a6a03e5 100644 --- a/llvm/lib/IR/Attributes.cpp +++ b/llvm/lib/IR/Attributes.cpp @@ -1931,6 +1931,14 @@ AttributeList::getParamDereferenceableOrNullBytes(unsigned Index) const { return getParamAttrs(Index).getDereferenceableOrNullBytes(); } +std::optional +AttributeList::getParamRange(unsigned ArgNo) const { + auto RangeAttr = getParamAttrs(ArgNo).getAttribute(Attribute::Range); + if (RangeAttr.isValid()) + return RangeAttr.getRange(); + return std::nullopt; +} + FPClassTest AttributeList::getRetNoFPClass() const { return getRetAttrs().getNoFPClass(); } @@ -2277,6 +2285,13 @@ Attribute AttrBuilder::getAttribute(StringRef A) const { return {}; } +std::optional AttrBuilder::getRange() const { + const Attribute RangeAttr = getAttribute(Attribute::Range); + if (RangeAttr.isValid()) + return RangeAttr.getRange(); + return std::nullopt; +} + bool AttrBuilder::contains(Attribute::AttrKind A) const { return getAttribute(A).isValid(); } @@ -2300,7 +2315,7 @@ bool AttributeFuncs::isNoFPClassCompatibleType(Type *Ty) { } /// Which attributes cannot be applied to a type. -AttributeMask AttributeFuncs::typeIncompatible(Type *Ty, +AttributeMask AttributeFuncs::typeIncompatible(Type *Ty, AttributeSet AS, AttributeSafetyKind ASK) { AttributeMask Incompatible; @@ -2316,6 +2331,11 @@ AttributeMask AttributeFuncs::typeIncompatible(Type *Ty, // Attributes that only apply to integers or vector of integers. 
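  // Illustrative IR for the new else-branch below (hypothetical function):
  // a range attribute whose bit width no longer matches the value type is
  // treated as incompatible and dropped, e.g.
  //   define void @f(i64 range(i32 0, 10) %x)  ; 32-bit range on an i64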
if (ASK & ASK_SAFE_TO_DROP) Incompatible.addAttribute(Attribute::Range); + } else { + Attribute RangeAttr = AS.getAttribute(Attribute::Range); + if (RangeAttr.isValid() && + RangeAttr.getRange().getBitWidth() != Ty->getScalarSizeInBits()) + Incompatible.addAttribute(Attribute::Range); } if (!Ty->isPointerTy()) { diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp index 477b77a6dd5335a618275876e14f323f880e44f9..bb03c9290e4cf4135c1289628c053893753c2565 100644 --- a/llvm/lib/IR/AutoUpgrade.cpp +++ b/llvm/lib/IR/AutoUpgrade.cpp @@ -1745,8 +1745,7 @@ static Value *upgradeX86VPERMT2Intrinsics(IRBuilder<> &Builder, CallBase &CI, if (!IndexForm) std::swap(Args[0], Args[1]); - Value *V = Builder.CreateCall( - Intrinsic::getOrInsertDeclaration(CI.getModule(), IID), Args); + Value *V = Builder.CreateIntrinsic(IID, {}, Args); Value *PassThru = ZeroMask ? ConstantAggregateZero::get(Ty) : Builder.CreateBitCast(CI.getArgOperand(1), Ty); @@ -1758,8 +1757,7 @@ static Value *upgradeX86BinaryIntrinsics(IRBuilder<> &Builder, CallBase &CI, Type *Ty = CI.getType(); Value *Op0 = CI.getOperand(0); Value *Op1 = CI.getOperand(1); - Function *Intrin = Intrinsic::getOrInsertDeclaration(CI.getModule(), IID, Ty); - Value *Res = Builder.CreateCall(Intrin, {Op0, Op1}); + Value *Res = Builder.CreateIntrinsic(IID, Ty, {Op0, Op1}); if (CI.arg_size() == 4) { // For masked intrinsics. Value *VecSrc = CI.getOperand(2); @@ -1785,8 +1783,7 @@ static Value *upgradeX86Rotate(IRBuilder<> &Builder, CallBase &CI, } Intrinsic::ID IID = IsRotateRight ? Intrinsic::fshr : Intrinsic::fshl; - Function *Intrin = Intrinsic::getOrInsertDeclaration(CI.getModule(), IID, Ty); - Value *Res = Builder.CreateCall(Intrin, {Src, Src, Amt}); + Value *Res = Builder.CreateIntrinsic(IID, Ty, {Src, Src, Amt}); if (CI.arg_size() == 4) { // For masked intrinsics. Value *VecSrc = CI.getOperand(2); @@ -1855,8 +1852,7 @@ static Value *upgradeX86ConcatShift(IRBuilder<> &Builder, CallBase &CI, } Intrinsic::ID IID = IsShiftRight ? Intrinsic::fshr : Intrinsic::fshl; - Function *Intrin = Intrinsic::getOrInsertDeclaration(CI.getModule(), IID, Ty); - Value *Res = Builder.CreateCall(Intrin, {Op0, Op1, Amt}); + Value *Res = Builder.CreateIntrinsic(IID, Ty, {Op0, Op1, Amt}); unsigned NumArgs = CI.arg_size(); if (NumArgs >= 4) { // For masked intrinsics. @@ -1916,9 +1912,8 @@ static Value *upgradeMaskedLoad(IRBuilder<> &Builder, Value *Ptr, static Value *upgradeAbs(IRBuilder<> &Builder, CallBase &CI) { Type *Ty = CI.getType(); Value *Op0 = CI.getArgOperand(0); - Function *F = - Intrinsic::getOrInsertDeclaration(CI.getModule(), Intrinsic::abs, Ty); - Value *Res = Builder.CreateCall(F, {Op0, Builder.getInt1(false)}); + Value *Res = Builder.CreateIntrinsic(Intrinsic::abs, Ty, + {Op0, Builder.getInt1(false)}); if (CI.arg_size() == 3) Res = emitX86Select(Builder, CI.getArgOperand(2), Res, CI.getArgOperand(1)); return Res; @@ -2010,9 +2005,8 @@ static Value *upgradeMaskedCompare(IRBuilder<> &Builder, CallBase &CI, // Replace a masked intrinsic with an older unmasked intrinsic. 
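// Shape of the rewrite (intrinsic and operand names illustrative): the
// unmasked intrinsic is emitted first, then emitX86Select blends it with the
// passthru operand under the mask, roughly:
//   %r   = call <4 x i32> @llvm.x86.example(<4 x i32> %a, <4 x i32> %b)
//   %res = select <4 x i1> %mask, <4 x i32> %r, <4 x i32> %passthru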
static Value *upgradeX86MaskedShift(IRBuilder<> &Builder, CallBase &CI, Intrinsic::ID IID) { - Function *Intrin = Intrinsic::getOrInsertDeclaration(CI.getModule(), IID); - Value *Rep = Builder.CreateCall(Intrin, - { CI.getArgOperand(0), CI.getArgOperand(1) }); + Value *Rep = Builder.CreateIntrinsic( + IID, {}, {CI.getArgOperand(0), CI.getArgOperand(1)}); return emitX86Select(Builder, CI.getArgOperand(3), Rep, CI.getArgOperand(2)); } @@ -2269,8 +2263,7 @@ static bool upgradeAVX512MaskToSelect(StringRef Name, IRBuilder<> &Builder, SmallVector Args(CI.args()); Args.pop_back(); Args.pop_back(); - Rep = Builder.CreateCall( - Intrinsic::getOrInsertDeclaration(CI.getModule(), IID), Args); + Rep = Builder.CreateIntrinsic(IID, {}, Args); unsigned NumArgs = CI.arg_size(); Rep = emitX86Select(Builder, CI.getArgOperand(NumArgs - 1), Rep, CI.getArgOperand(NumArgs - 2)); @@ -2325,25 +2318,21 @@ static Value *upgradeNVVMIntrinsicCall(StringRef Name, CallBase *CI, } else if (Name == "clz.ll") { // llvm.nvvm.clz.ll returns an i32, but llvm.ctlz.i64 returns an i64. Value *Arg = CI->getArgOperand(0); - Value *Ctlz = Builder.CreateCall( - Intrinsic::getOrInsertDeclaration(F->getParent(), Intrinsic::ctlz, - {Arg->getType()}), - {Arg, Builder.getFalse()}, "ctlz"); + Value *Ctlz = Builder.CreateIntrinsic(Intrinsic::ctlz, {Arg->getType()}, + {Arg, Builder.getFalse()}, + /*FMFSource=*/nullptr, "ctlz"); Rep = Builder.CreateTrunc(Ctlz, Builder.getInt32Ty(), "ctlz.trunc"); } else if (Name == "popc.ll") { // llvm.nvvm.popc.ll returns an i32, but llvm.ctpop.i64 returns an // i64. Value *Arg = CI->getArgOperand(0); - Value *Popc = Builder.CreateCall( - Intrinsic::getOrInsertDeclaration(F->getParent(), Intrinsic::ctpop, - {Arg->getType()}), - Arg, "ctpop"); + Value *Popc = Builder.CreateIntrinsic(Intrinsic::ctpop, {Arg->getType()}, + Arg, /*FMFSource=*/nullptr, "ctpop"); Rep = Builder.CreateTrunc(Popc, Builder.getInt32Ty(), "ctpop.trunc"); } else if (Name == "h2f") { - Rep = Builder.CreateCall(Intrinsic::getOrInsertDeclaration( - F->getParent(), Intrinsic::convert_from_fp16, - {Builder.getFloatTy()}), - CI->getArgOperand(0), "h2f"); + Rep = Builder.CreateIntrinsic(Intrinsic::convert_from_fp16, + {Builder.getFloatTy()}, CI->getArgOperand(0), + /*FMFSource=*/nullptr, "h2f"); } else if (Name.consume_front("bitcast.") && (Name == "f2i" || Name == "i2f" || Name == "ll2d" || Name == "d2ll")) { @@ -2486,17 +2475,13 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, } else if (Name == "sse.sqrt.ss" || Name == "sse2.sqrt.sd") { Value *Vec = CI->getArgOperand(0); Value *Elt0 = Builder.CreateExtractElement(Vec, (uint64_t)0); - Function *Intr = Intrinsic::getOrInsertDeclaration( - F->getParent(), Intrinsic::sqrt, Elt0->getType()); - Elt0 = Builder.CreateCall(Intr, Elt0); + Elt0 = Builder.CreateIntrinsic(Intrinsic::sqrt, Elt0->getType(), Elt0); Rep = Builder.CreateInsertElement(Vec, Elt0, (uint64_t)0); } else if (Name.starts_with("avx.sqrt.p") || Name.starts_with("sse2.sqrt.p") || Name.starts_with("sse.sqrt.p")) { - Rep = - Builder.CreateCall(Intrinsic::getOrInsertDeclaration( - F->getParent(), Intrinsic::sqrt, CI->getType()), - {CI->getArgOperand(0)}); + Rep = Builder.CreateIntrinsic(Intrinsic::sqrt, CI->getType(), + {CI->getArgOperand(0)}); } else if (Name.starts_with("avx512.mask.sqrt.p")) { if (CI->arg_size() == 4 && (!isa(CI->getArgOperand(3)) || @@ -2505,13 +2490,10 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, : Intrinsic::x86_avx512_sqrt_pd_512; Value 
*Args[] = {CI->getArgOperand(0), CI->getArgOperand(3)}; - Rep = Builder.CreateCall( - Intrinsic::getOrInsertDeclaration(CI->getModule(), IID), Args); + Rep = Builder.CreateIntrinsic(IID, {}, Args); } else { - Rep = Builder.CreateCall( - Intrinsic::getOrInsertDeclaration(F->getParent(), Intrinsic::sqrt, - CI->getType()), - {CI->getArgOperand(0)}); + Rep = Builder.CreateIntrinsic(Intrinsic::sqrt, CI->getType(), + {CI->getArgOperand(0)}); } Rep = emitX86Select(Builder, CI->getArgOperand(2), Rep, CI->getArgOperand(1)); @@ -2635,9 +2617,8 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, break; } - Rep = Builder.CreateCall( - Intrinsic::getOrInsertDeclaration(F->getParent(), IID), - {CI->getOperand(0), CI->getArgOperand(1)}); + Rep = Builder.CreateIntrinsic(IID, {}, + {CI->getOperand(0), CI->getArgOperand(1)}); Rep = applyX86MaskOn1BitsVec(Builder, Rep, CI->getArgOperand(2)); } else if (Name.starts_with("avx512.mask.fpclass.p")) { Type *OpTy = CI->getArgOperand(0)->getType(); @@ -2659,9 +2640,8 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, else llvm_unreachable("Unexpected intrinsic"); - Rep = Builder.CreateCall( - Intrinsic::getOrInsertDeclaration(F->getParent(), IID), - {CI->getOperand(0), CI->getArgOperand(1)}); + Rep = Builder.CreateIntrinsic(IID, {}, + {CI->getOperand(0), CI->getArgOperand(1)}); Rep = applyX86MaskOn1BitsVec(Builder, Rep, CI->getArgOperand(2)); } else if (Name.starts_with("avx512.cmp.p")) { SmallVector Args(CI->args()); @@ -2689,8 +2669,7 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, std::swap(Mask, Args.back()); Args.push_back(Mask); - Rep = Builder.CreateCall( - Intrinsic::getOrInsertDeclaration(F->getParent(), IID), Args); + Rep = Builder.CreateIntrinsic(IID, {}, Args); } else if (Name.starts_with("avx512.mask.cmp.")) { // Integer compare intrinsics. unsigned Imm = cast(CI->getArgOperand(2))->getZExtValue(); @@ -2784,9 +2763,8 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, cast(CI->getArgOperand(3))->getZExtValue() != 4)) { Intrinsic::ID IID = IsUnsigned ? Intrinsic::x86_avx512_uitofp_round : Intrinsic::x86_avx512_sitofp_round; - Function *F = Intrinsic::getOrInsertDeclaration(CI->getModule(), IID, - {DstTy, SrcTy}); - Rep = Builder.CreateCall(F, {Rep, CI->getArgOperand(3)}); + Rep = Builder.CreateIntrinsic(IID, {DstTy, SrcTy}, + {Rep, CI->getArgOperand(3)}); } else { Rep = IsUnsigned ? 
Builder.CreateUIToFP(Rep, DstTy, "cvt") : Builder.CreateSIToFP(Rep, DstTy, "cvt"); @@ -2827,9 +2805,8 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, Value *MaskVec = getX86MaskVec(Builder, CI->getArgOperand(2), ResultTy->getNumElements()); - Function *ELd = Intrinsic::getOrInsertDeclaration( - F->getParent(), Intrinsic::masked_expandload, ResultTy); - Rep = Builder.CreateCall(ELd, {Ptr, MaskVec, CI->getOperand(1)}); + Rep = Builder.CreateIntrinsic(Intrinsic::masked_expandload, ResultTy, + {Ptr, MaskVec, CI->getOperand(1)}); } else if (Name.starts_with("avx512.mask.compress.store.")) { auto *ResultTy = cast(CI->getArgOperand(1)->getType()); Type *PtrTy = ResultTy->getElementType(); @@ -2842,9 +2819,8 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, getX86MaskVec(Builder, CI->getArgOperand(2), cast(ResultTy)->getNumElements()); - Function *CSt = Intrinsic::getOrInsertDeclaration( - F->getParent(), Intrinsic::masked_compressstore, ResultTy); - Rep = Builder.CreateCall(CSt, {CI->getArgOperand(1), Ptr, MaskVec}); + Rep = Builder.CreateIntrinsic(Intrinsic::masked_compressstore, ResultTy, + {CI->getArgOperand(1), Ptr, MaskVec}); } else if (Name.starts_with("avx512.mask.compress.") || Name.starts_with("avx512.mask.expand.")) { auto *ResultTy = cast(CI->getType()); @@ -2855,10 +2831,8 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, bool IsCompress = Name[12] == 'c'; Intrinsic::ID IID = IsCompress ? Intrinsic::x86_avx512_mask_compress : Intrinsic::x86_avx512_mask_expand; - Function *Intr = - Intrinsic::getOrInsertDeclaration(F->getParent(), IID, ResultTy); - Rep = Builder.CreateCall(Intr, - {CI->getOperand(0), CI->getOperand(1), MaskVec}); + Rep = Builder.CreateIntrinsic( + IID, ResultTy, {CI->getOperand(0), CI->getOperand(1), MaskVec}); } else if (Name.starts_with("xop.vpcom")) { bool IsSigned; if (Name.ends_with("ub") || Name.ends_with("uw") || Name.ends_with("ud") || @@ -2919,11 +2893,10 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, bool ZeroMask = Name[11] == 'z'; Rep = upgradeX86ConcatShift(Builder, *CI, true, ZeroMask); } else if (Name == "sse42.crc32.64.8") { - Function *CRC32 = Intrinsic::getOrInsertDeclaration( - F->getParent(), Intrinsic::x86_sse42_crc32_32_8); Value *Trunc0 = Builder.CreateTrunc(CI->getArgOperand(0), Type::getInt32Ty(C)); - Rep = Builder.CreateCall(CRC32, {Trunc0, CI->getArgOperand(1)}); + Rep = Builder.CreateIntrinsic(Intrinsic::x86_sse42_crc32_32_8, {}, + {Trunc0, CI->getArgOperand(1)}); Rep = Builder.CreateZExt(Rep, CI->getType(), ""); } else if (Name.starts_with("avx.vbroadcast.s") || Name.starts_with("avx512.vbroadcast.s")) { @@ -3413,8 +3386,8 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, else IID = Intrinsic::x86_avx512_add_pd_512; - Rep = Builder.CreateCall( - Intrinsic::getOrInsertDeclaration(F->getParent(), IID), + Rep = Builder.CreateIntrinsic( + IID, {}, {CI->getArgOperand(0), CI->getArgOperand(1), CI->getArgOperand(4)}); } else { Rep = Builder.CreateFAdd(CI->getArgOperand(0), CI->getArgOperand(1)); @@ -3429,8 +3402,8 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, else IID = Intrinsic::x86_avx512_div_pd_512; - Rep = Builder.CreateCall( - Intrinsic::getOrInsertDeclaration(F->getParent(), IID), + Rep = Builder.CreateIntrinsic( + IID, {}, {CI->getArgOperand(0), CI->getArgOperand(1), CI->getArgOperand(4)}); } else { Rep = 
Builder.CreateFDiv(CI->getArgOperand(0), CI->getArgOperand(1)); @@ -3445,8 +3418,8 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, else IID = Intrinsic::x86_avx512_mul_pd_512; - Rep = Builder.CreateCall( - Intrinsic::getOrInsertDeclaration(F->getParent(), IID), + Rep = Builder.CreateIntrinsic( + IID, {}, {CI->getArgOperand(0), CI->getArgOperand(1), CI->getArgOperand(4)}); } else { Rep = Builder.CreateFMul(CI->getArgOperand(0), CI->getArgOperand(1)); @@ -3461,8 +3434,8 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, else IID = Intrinsic::x86_avx512_sub_pd_512; - Rep = Builder.CreateCall( - Intrinsic::getOrInsertDeclaration(F->getParent(), IID), + Rep = Builder.CreateIntrinsic( + IID, {}, {CI->getArgOperand(0), CI->getArgOperand(1), CI->getArgOperand(4)}); } else { Rep = Builder.CreateFSub(CI->getArgOperand(0), CI->getArgOperand(1)); @@ -3479,16 +3452,15 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, {Intrinsic::x86_avx512_min_ps_512, Intrinsic::x86_avx512_min_pd_512}}; Intrinsic::ID IID = MinMaxTbl[IsMin][IsDouble]; - Rep = Builder.CreateCall( - Intrinsic::getOrInsertDeclaration(F->getParent(), IID), + Rep = Builder.CreateIntrinsic( + IID, {}, {CI->getArgOperand(0), CI->getArgOperand(1), CI->getArgOperand(4)}); Rep = emitX86Select(Builder, CI->getArgOperand(3), Rep, CI->getArgOperand(2)); } else if (Name.starts_with("avx512.mask.lzcnt.")) { Rep = - Builder.CreateCall(Intrinsic::getOrInsertDeclaration( - F->getParent(), Intrinsic::ctlz, CI->getType()), - {CI->getArgOperand(0), Builder.getInt1(false)}); + Builder.CreateIntrinsic(Intrinsic::ctlz, CI->getType(), + {CI->getArgOperand(0), Builder.getInt1(false)}); Rep = emitX86Select(Builder, CI->getArgOperand(2), Rep, CI->getArgOperand(1)); } else if (Name.starts_with("avx512.mask.psll")) { @@ -3732,10 +3704,7 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, if (NegAcc) Ops[2] = Builder.CreateFNeg(Ops[2]); - Rep = Builder.CreateCall( - Intrinsic::getOrInsertDeclaration(CI->getModule(), Intrinsic::fma, - Ops[0]->getType()), - Ops); + Rep = Builder.CreateIntrinsic(Intrinsic::fma, Ops[0]->getType(), Ops); if (IsScalar) Rep = Builder.CreateInsertElement(CI->getArgOperand(0), Rep, (uint64_t)0); @@ -3747,10 +3716,7 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, Ops[1] = Builder.CreateExtractElement(Ops[1], (uint64_t)0); Ops[2] = Builder.CreateExtractElement(Ops[2], (uint64_t)0); - Rep = Builder.CreateCall( - Intrinsic::getOrInsertDeclaration(CI->getModule(), Intrinsic::fma, - Ops[0]->getType()), - Ops); + Rep = Builder.CreateIntrinsic(Intrinsic::fma, Ops[0]->getType(), Ops); Rep = Builder.CreateInsertElement(Constant::getNullValue(CI->getType()), Rep, (uint64_t)0); @@ -3790,12 +3756,9 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, IID = Intrinsic::x86_avx512_vfmadd_f64; else IID = Intrinsic::x86_avx512_vfmadd_f32; - Function *FMA = Intrinsic::getOrInsertDeclaration(CI->getModule(), IID); - Rep = Builder.CreateCall(FMA, Ops); + Rep = Builder.CreateIntrinsic(IID, {}, Ops); } else { - Function *FMA = Intrinsic::getOrInsertDeclaration( - CI->getModule(), Intrinsic::fma, A->getType()); - Rep = Builder.CreateCall(FMA, {A, B, C}); + Rep = Builder.CreateIntrinsic(Intrinsic::fma, A->getType(), {A, B, C}); } Value *PassThru = IsMaskZ ? 
Constant::getNullValue(Rep->getType()) @@ -3846,13 +3809,9 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, else IID = Intrinsic::x86_avx512_vfmadd_pd_512; - Rep = Builder.CreateCall( - Intrinsic::getOrInsertDeclaration(F->getParent(), IID), - {A, B, C, CI->getArgOperand(4)}); + Rep = Builder.CreateIntrinsic(IID, {}, {A, B, C, CI->getArgOperand(4)}); } else { - Function *FMA = Intrinsic::getOrInsertDeclaration( - CI->getModule(), Intrinsic::fma, A->getType()); - Rep = Builder.CreateCall(FMA, {A, B, C}); + Rep = Builder.CreateIntrinsic(Intrinsic::fma, A->getType(), {A, B, C}); } Value *PassThru = IsMaskZ ? llvm::Constant::getNullValue(CI->getType()) @@ -3878,8 +3837,7 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, Value *Ops[] = {CI->getArgOperand(0), CI->getArgOperand(1), CI->getArgOperand(2)}; Ops[2] = Builder.CreateFNeg(Ops[2]); - Rep = Builder.CreateCall( - Intrinsic::getOrInsertDeclaration(F->getParent(), IID), Ops); + Rep = Builder.CreateIntrinsic(IID, {}, Ops); } else if (Name.starts_with("avx512.mask.vfmaddsub.p") || Name.starts_with("avx512.mask3.vfmaddsub.p") || Name.starts_with("avx512.maskz.vfmaddsub.p") || @@ -3902,8 +3860,7 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, if (IsSubAdd) Ops[2] = Builder.CreateFNeg(Ops[2]); - Rep = Builder.CreateCall( - Intrinsic::getOrInsertDeclaration(F->getParent(), IID), Ops); + Rep = Builder.CreateIntrinsic(IID, {}, Ops); } else { int NumElts = cast(CI->getType())->getNumElements(); @@ -3954,8 +3911,7 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, Value *Args[] = {CI->getArgOperand(0), CI->getArgOperand(1), CI->getArgOperand(2), CI->getArgOperand(3)}; - Rep = Builder.CreateCall( - Intrinsic::getOrInsertDeclaration(CI->getModule(), IID), Args); + Rep = Builder.CreateIntrinsic(IID, {}, Args); Value *PassThru = ZeroMask ? ConstantAggregateZero::get(CI->getType()) : CI->getArgOperand(0); Rep = emitX86Select(Builder, CI->getArgOperand(4), Rep, PassThru); @@ -3982,8 +3938,7 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, Value *Args[] = {CI->getArgOperand(0), CI->getArgOperand(1), CI->getArgOperand(2)}; - Rep = Builder.CreateCall( - Intrinsic::getOrInsertDeclaration(CI->getModule(), IID), Args); + Rep = Builder.CreateIntrinsic(IID, {}, Args); Value *PassThru = ZeroMask ? ConstantAggregateZero::get(CI->getType()) : CI->getArgOperand(0); Rep = emitX86Select(Builder, CI->getArgOperand(3), Rep, PassThru); @@ -4018,8 +3973,7 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, Value *Args[] = {CI->getArgOperand(0), CI->getArgOperand(1), CI->getArgOperand(2)}; - Rep = Builder.CreateCall( - Intrinsic::getOrInsertDeclaration(CI->getModule(), IID), Args); + Rep = Builder.CreateIntrinsic(IID, {}, Args); Value *PassThru = ZeroMask ? ConstantAggregateZero::get(CI->getType()) : CI->getArgOperand(0); Rep = emitX86Select(Builder, CI->getArgOperand(3), Rep, PassThru); @@ -4048,8 +4002,7 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, Value *Args[] = {CI->getArgOperand(0), CI->getArgOperand(1), CI->getArgOperand(2)}; - Rep = Builder.CreateCall( - Intrinsic::getOrInsertDeclaration(CI->getModule(), IID), Args); + Rep = Builder.CreateIntrinsic(IID, {}, Args); Value *PassThru = ZeroMask ? 
ConstantAggregateZero::get(CI->getType()) : CI->getArgOperand(0); Rep = emitX86Select(Builder, CI->getArgOperand(3), Rep, PassThru); @@ -4071,8 +4024,7 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, // Make a call with 3 operands. Value *Args[] = {CI->getArgOperand(0), CI->getArgOperand(1), CI->getArgOperand(2)}; - Value *NewCall = Builder.CreateCall( - Intrinsic::getOrInsertDeclaration(CI->getModule(), IID), Args); + Value *NewCall = Builder.CreateIntrinsic(IID, {}, Args); // Extract the second result and store it. Value *Data = Builder.CreateExtractValue(NewCall, 1); @@ -4118,8 +4070,8 @@ static Value *upgradeAArch64IntrinsicCall(StringRef Name, CallBase *CI, Args[1] = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_convert_from_svbool, GoodPredTy, Args[1]); - Function *NewF = Intrinsic::getOrInsertDeclaration(CI->getModule(), NewID); - return Builder.CreateCall(NewF, Args, CI->getName()); + return Builder.CreateIntrinsic(NewID, {}, Args, /*FMFSource=*/nullptr, + CI->getName()); } static Value *upgradeARMIntrinsicCall(StringRef Name, CallBase *CI, Function *F, @@ -4127,20 +4079,15 @@ static Value *upgradeARMIntrinsicCall(StringRef Name, CallBase *CI, Function *F, if (Name == "mve.vctp64.old") { // Replace the old v4i1 vctp64 with a v2i1 vctp and predicate-casts to the // correct type. - Value *VCTP = - Builder.CreateCall(Intrinsic::getOrInsertDeclaration( - F->getParent(), Intrinsic::arm_mve_vctp64), - CI->getArgOperand(0), CI->getName()); - Value *C1 = Builder.CreateCall( - Intrinsic::getOrInsertDeclaration( - F->getParent(), Intrinsic::arm_mve_pred_v2i, - {VectorType::get(Builder.getInt1Ty(), 2, false)}), - VCTP); - return Builder.CreateCall( - Intrinsic::getOrInsertDeclaration( - F->getParent(), Intrinsic::arm_mve_pred_i2v, - {VectorType::get(Builder.getInt1Ty(), 4, false)}), - C1); + Value *VCTP = Builder.CreateIntrinsic(Intrinsic::arm_mve_vctp64, {}, + CI->getArgOperand(0), + /*FMFSource=*/nullptr, CI->getName()); + Value *C1 = Builder.CreateIntrinsic( + Intrinsic::arm_mve_pred_v2i, + {VectorType::get(Builder.getInt1Ty(), 2, false)}, VCTP); + return Builder.CreateIntrinsic( + Intrinsic::arm_mve_pred_i2v, + {VectorType::get(Builder.getInt1Ty(), 4, false)}, C1); } else if (Name == "mve.mull.int.predicated.v2i64.v4i32.v4i1" || Name == "mve.vqdmull.predicated.v2i64.v4i32.v4i1" || Name == "mve.vldr.gather.base.predicated.v2i64.v2i64.v4i1" || @@ -4198,21 +4145,16 @@ static Value *upgradeARMIntrinsicCall(StringRef Name, CallBase *CI, Function *F, for (Value *Op : CI->args()) { Type *Ty = Op->getType(); if (Ty->getScalarSizeInBits() == 1) { - Value *C1 = Builder.CreateCall( - Intrinsic::getOrInsertDeclaration( - F->getParent(), Intrinsic::arm_mve_pred_v2i, - {VectorType::get(Builder.getInt1Ty(), 4, false)}), - Op); - Op = Builder.CreateCall( - Intrinsic::getOrInsertDeclaration( - F->getParent(), Intrinsic::arm_mve_pred_i2v, {V2I1Ty}), - C1); + Value *C1 = Builder.CreateIntrinsic( + Intrinsic::arm_mve_pred_v2i, + {VectorType::get(Builder.getInt1Ty(), 4, false)}, Op); + Op = Builder.CreateIntrinsic(Intrinsic::arm_mve_pred_i2v, {V2I1Ty}, C1); } Ops.push_back(Op); } - Function *Fn = Intrinsic::getOrInsertDeclaration(F->getParent(), ID, Tys); - return Builder.CreateCall(Fn, Ops, CI->getName()); + return Builder.CreateIntrinsic(ID, Tys, Ops, /*FMFSource=*/nullptr, + CI->getName()); } llvm_unreachable("Unknown function for ARM CallBase upgrade."); } @@ -5436,9 +5378,11 @@ void llvm::UpgradeFunctionAttributes(Function &F) { } // Remove all incompatibile 
attributes from function. - F.removeRetAttrs(AttributeFuncs::typeIncompatible(F.getReturnType())); + F.removeRetAttrs(AttributeFuncs::typeIncompatible( + F.getReturnType(), F.getAttributes().getRetAttrs())); for (auto &Arg : F.args()) - Arg.removeAttrs(AttributeFuncs::typeIncompatible(Arg.getType())); + Arg.removeAttrs( + AttributeFuncs::typeIncompatible(Arg.getType(), Arg.getAttributes())); // Older versions of LLVM treated an "implicit-section-name" attribute // similarly to directly setting the section on a Function. @@ -5577,11 +5521,24 @@ std::string llvm::UpgradeDataLayoutString(StringRef DL, StringRef TT) { return Res; } + auto AddPtr32Ptr64AddrSpaces = [&DL, &Res]() { + // If the datalayout matches the expected format, add pointer size address + // spaces to the datalayout. + StringRef AddrSpaces{"-p270:32:32-p271:32:32-p272:64:64"}; + if (!DL.contains(AddrSpaces)) { + SmallVector Groups; + Regex R("^([Ee]-m:[a-z](-p:32:32)?)(-.*)$"); + if (R.match(Res, &Groups)) + Res = (Groups[1] + AddrSpaces + Groups[3]).str(); + } + }; + // AArch64 data layout upgrades. if (T.isAArch64()) { // Add "-Fn32" if (!DL.empty() && !DL.contains("-Fn32")) Res.append("-Fn32"); + AddPtr32Ptr64AddrSpaces(); return Res; } @@ -5600,15 +5557,7 @@ std::string llvm::UpgradeDataLayoutString(StringRef DL, StringRef TT) { if (!T.isX86()) return Res; - // If the datalayout matches the expected format, add pointer size address - // spaces to the datalayout. - std::string AddrSpaces = "-p270:32:32-p271:32:32-p272:64:64"; - if (StringRef Ref = Res; !Ref.contains(AddrSpaces)) { - SmallVector Groups; - Regex R("(e-m:[a-z](-p:32:32)?)(-[if]64:.*$)"); - if (R.match(Res, &Groups)) - Res = (Groups[1] + AddrSpaces + Groups[3]).str(); - } + AddPtr32Ptr64AddrSpaces(); // i128 values need to be 16-byte-aligned. LLVM already called into libgcc // for i128 operations prior to this being reflected in the data layout, and diff --git a/llvm/lib/IR/Constants.cpp b/llvm/lib/IR/Constants.cpp index aabbd04fd0c26ca838f82717c8260f4c5b9db544..33bf3d8c9231fb09e413b31b101a7830239645b4 100644 --- a/llvm/lib/IR/Constants.cpp +++ b/llvm/lib/IR/Constants.cpp @@ -933,7 +933,10 @@ Constant *ConstantInt::get(Type *Ty, uint64_t V, bool isSigned) { } ConstantInt *ConstantInt::get(IntegerType *Ty, uint64_t V, bool isSigned) { - return get(Ty->getContext(), APInt(Ty->getBitWidth(), V, isSigned)); + // TODO: Avoid implicit trunc? + // See https://github.com/llvm/llvm-project/issues/112510. 
+ return get(Ty->getContext(), + APInt(Ty->getBitWidth(), V, isSigned, /*implicitTrunc=*/true)); } Constant *ConstantInt::get(Type *Ty, const APInt& V) { diff --git a/llvm/lib/IR/Function.cpp b/llvm/lib/IR/Function.cpp index 09b90713b9c7931e1269f666964255b11856dcc0..889295956dbf7c7a72658f61082ad063c0eb5344 100644 --- a/llvm/lib/IR/Function.cpp +++ b/llvm/lib/IR/Function.cpp @@ -359,6 +359,10 @@ Attribute Argument::getAttribute(Attribute::AttrKind Kind) const { return getParent()->getParamAttribute(getArgNo(), Kind); } +AttributeSet Argument::getAttributes() const { + return getParent()->getAttributes().getParamAttrs(getArgNo()); +} + //===----------------------------------------------------------------------===// // Helper Methods in Function //===----------------------------------------------------------------------===// diff --git a/llvm/lib/IR/IRBuilder.cpp b/llvm/lib/IR/IRBuilder.cpp index 9e52689f6e0e2f7ca3a5ef135a92deb7d6d1473d..d0364419632f7cdac0b564aa5741fedc4d432946 100644 --- a/llvm/lib/IR/IRBuilder.cpp +++ b/llvm/lib/IR/IRBuilder.cpp @@ -90,10 +90,8 @@ Value *IRBuilderBase::CreateVScale(Constant *Scaling, const Twine &Name) { assert(isa<ConstantInt>(Scaling) && "Expected constant integer"); if (cast<ConstantInt>(Scaling)->isZero()) return Scaling; - Module *M = GetInsertBlock()->getParent()->getParent(); - Function *TheFn = Intrinsic::getOrInsertDeclaration(M, Intrinsic::vscale, - {Scaling->getType()}); - CallInst *CI = CreateCall(TheFn, {}, {}, Name); + CallInst *CI = + CreateIntrinsic(Intrinsic::vscale, {Scaling->getType()}, {}, {}, Name); return cast<ConstantInt>(Scaling)->isOne() ? CI : CreateMul(CI, Scaling); } @@ -140,12 +138,9 @@ CallInst *IRBuilderBase::CreateMemSet(Value *Ptr, Value *Val, Value *Size, MDNode *TBAATag, MDNode *ScopeTag, MDNode *NoAliasTag) { Value *Ops[] = {Ptr, Val, Size, getInt1(isVolatile)}; - Type *Tys[] = { Ptr->getType(), Size->getType() }; - Module *M = BB->getParent()->getParent(); - Function *TheFn = - Intrinsic::getOrInsertDeclaration(M, Intrinsic::memset, Tys); + Type *Tys[] = {Ptr->getType(), Size->getType()}; - CallInst *CI = CreateCall(TheFn, Ops); + CallInst *CI = CreateIntrinsic(Intrinsic::memset, Tys, Ops); if (Align) cast<MemSetInst>(CI)->setDestAlignment(*Align); @@ -170,11 +165,8 @@ CallInst *IRBuilderBase::CreateMemSetInline(Value *Dst, MaybeAlign DstAlign, MDNode *NoAliasTag) { Value *Ops[] = {Dst, Val, Size, getInt1(IsVolatile)}; Type *Tys[] = {Dst->getType(), Size->getType()}; - Module *M = BB->getParent()->getParent(); - Function *TheFn = - Intrinsic::getOrInsertDeclaration(M, Intrinsic::memset_inline, Tys); - CallInst *CI = CreateCall(TheFn, Ops); + CallInst *CI = CreateIntrinsic(Intrinsic::memset_inline, Tys, Ops); if (DstAlign) cast<MemSetInlineInst>(CI)->setDestAlignment(*DstAlign); @@ -198,11 +190,9 @@ CallInst *IRBuilderBase::CreateElementUnorderedAtomicMemSet( Value *Ops[] = {Ptr, Val, Size, getInt32(ElementSize)}; Type *Tys[] = {Ptr->getType(), Size->getType()}; - Module *M = BB->getParent()->getParent(); - Function *TheFn = Intrinsic::getOrInsertDeclaration( - M, Intrinsic::memset_element_unordered_atomic, Tys); - CallInst *CI = CreateCall(TheFn, Ops); + CallInst *CI = + CreateIntrinsic(Intrinsic::memset_element_unordered_atomic, Tys, Ops); cast<AtomicMemSetInst>(CI)->setDestAlignment(Alignment); @@ -227,11 +217,9 @@ CallInst *IRBuilderBase::CreateMemTransferInst( IntrID == Intrinsic::memmove) && "Unexpected intrinsic ID"); Value *Ops[] = {Dst, Src, Size, getInt1(isVolatile)}; - Type *Tys[] = { Dst->getType(), Src->getType(), Size->getType() }; - Module *M = BB->getParent()->getParent(); - Function *TheFn 
= Intrinsic::getOrInsertDeclaration(M, IntrID, Tys); + Type *Tys[] = {Dst->getType(), Src->getType(), Size->getType()}; - CallInst *CI = CreateCall(TheFn, Ops); + CallInst *CI = CreateIntrinsic(IntrID, Tys, Ops); auto* MCI = cast(CI); if (DstAlign) @@ -266,11 +254,9 @@ CallInst *IRBuilderBase::CreateElementUnorderedAtomicMemCpy( "Pointer alignment must be at least element size"); Value *Ops[] = {Dst, Src, Size, getInt32(ElementSize)}; Type *Tys[] = {Dst->getType(), Src->getType(), Size->getType()}; - Module *M = BB->getParent()->getParent(); - Function *TheFn = Intrinsic::getOrInsertDeclaration( - M, Intrinsic::memcpy_element_unordered_atomic, Tys); - CallInst *CI = CreateCall(TheFn, Ops); + CallInst *CI = + CreateIntrinsic(Intrinsic::memcpy_element_unordered_atomic, Tys, Ops); // Set the alignment of the pointer args. auto *AMCI = cast(CI); @@ -382,11 +368,9 @@ CallInst *IRBuilderBase::CreateElementUnorderedAtomicMemMove( "Pointer alignment must be at least element size"); Value *Ops[] = {Dst, Src, Size, getInt32(ElementSize)}; Type *Tys[] = {Dst->getType(), Src->getType(), Size->getType()}; - Module *M = BB->getParent()->getParent(); - Function *TheFn = Intrinsic::getOrInsertDeclaration( - M, Intrinsic::memmove_element_unordered_atomic, Tys); - CallInst *CI = CreateCall(TheFn, Ops); + CallInst *CI = + CreateIntrinsic(Intrinsic::memmove_element_unordered_atomic, Tys, Ops); // Set the alignment of the pointer args. CI->addParamAttr(0, Attribute::getWithAlignment(CI->getContext(), DstAlign)); @@ -410,27 +394,19 @@ CallInst *IRBuilderBase::CreateElementUnorderedAtomicMemMove( } CallInst *IRBuilderBase::getReductionIntrinsic(Intrinsic::ID ID, Value *Src) { - Module *M = GetInsertBlock()->getParent()->getParent(); Value *Ops[] = {Src}; Type *Tys[] = { Src->getType() }; - auto Decl = Intrinsic::getOrInsertDeclaration(M, ID, Tys); - return CreateCall(Decl, Ops); + return CreateIntrinsic(ID, Tys, Ops); } CallInst *IRBuilderBase::CreateFAddReduce(Value *Acc, Value *Src) { - Module *M = GetInsertBlock()->getParent()->getParent(); Value *Ops[] = {Acc, Src}; - auto Decl = Intrinsic::getOrInsertDeclaration( - M, Intrinsic::vector_reduce_fadd, {Src->getType()}); - return CreateCall(Decl, Ops); + return CreateIntrinsic(Intrinsic::vector_reduce_fadd, {Src->getType()}, Ops); } CallInst *IRBuilderBase::CreateFMulReduce(Value *Acc, Value *Src) { - Module *M = GetInsertBlock()->getParent()->getParent(); Value *Ops[] = {Acc, Src}; - auto Decl = Intrinsic::getOrInsertDeclaration( - M, Intrinsic::vector_reduce_fmul, {Src->getType()}); - return CreateCall(Decl, Ops); + return CreateIntrinsic(Intrinsic::vector_reduce_fmul, {Src->getType()}, Ops); } CallInst *IRBuilderBase::CreateAddReduce(Value *Src) { @@ -490,10 +466,7 @@ CallInst *IRBuilderBase::CreateLifetimeStart(Value *Ptr, ConstantInt *Size) { assert(Size->getType() == getInt64Ty() && "lifetime.start requires the size to be an i64"); Value *Ops[] = { Size, Ptr }; - Module *M = BB->getParent()->getParent(); - Function *TheFn = Intrinsic::getOrInsertDeclaration( - M, Intrinsic::lifetime_start, {Ptr->getType()}); - return CreateCall(TheFn, Ops); + return CreateIntrinsic(Intrinsic::lifetime_start, {Ptr->getType()}, Ops); } CallInst *IRBuilderBase::CreateLifetimeEnd(Value *Ptr, ConstantInt *Size) { @@ -505,10 +478,7 @@ CallInst *IRBuilderBase::CreateLifetimeEnd(Value *Ptr, ConstantInt *Size) { assert(Size->getType() == getInt64Ty() && "lifetime.end requires the size to be an i64"); Value *Ops[] = { Size, Ptr }; - Module *M = BB->getParent()->getParent(); - 
Function *TheFn = Intrinsic::getOrInsertDeclaration( - M, Intrinsic::lifetime_end, {Ptr->getType()}); - return CreateCall(TheFn, Ops); + return CreateIntrinsic(Intrinsic::lifetime_end, {Ptr->getType()}, Ops); } CallInst *IRBuilderBase::CreateInvariantStart(Value *Ptr, ConstantInt *Size) { @@ -524,10 +494,7 @@ CallInst *IRBuilderBase::CreateInvariantStart(Value *Ptr, ConstantInt *Size) { Value *Ops[] = {Size, Ptr}; // Fill in the single overloaded type: memory object type. Type *ObjectPtr[1] = {Ptr->getType()}; - Module *M = BB->getParent()->getParent(); - Function *TheFn = Intrinsic::getOrInsertDeclaration( - M, Intrinsic::invariant_start, ObjectPtr); - return CreateCall(TheFn, Ops); + return CreateIntrinsic(Intrinsic::invariant_start, ObjectPtr, Ops); } static MaybeAlign getAlign(Value *Ptr) { @@ -563,10 +530,8 @@ IRBuilderBase::CreateAssumption(Value *Cond, } Instruction *IRBuilderBase::CreateNoAliasScopeDeclaration(Value *Scope) { - Module *M = BB->getModule(); - auto *FnIntrinsic = Intrinsic::getOrInsertDeclaration( - M, Intrinsic::experimental_noalias_scope_decl, {}); - return CreateCall(FnIntrinsic, {Scope}); + return CreateIntrinsic(Intrinsic::experimental_noalias_scope_decl, {}, + {Scope}); } /// Create a call to a Masked Load intrinsic. @@ -616,9 +581,7 @@ CallInst *IRBuilderBase::CreateMaskedIntrinsic(Intrinsic::ID Id, ArrayRef Ops, ArrayRef OverloadedTypes, const Twine &Name) { - Module *M = BB->getParent()->getParent(); - Function *TheFn = Intrinsic::getOrInsertDeclaration(M, Id, OverloadedTypes); - return CreateCall(TheFn, Ops, {}, Name); + return CreateIntrinsic(Id, OverloadedTypes, Ops, {}, Name); } /// Create a call to a Masked Gather intrinsic. @@ -883,42 +846,34 @@ InvokeInst *IRBuilderBase::CreateGCStatepointInvoke( CallInst *IRBuilderBase::CreateGCResult(Instruction *Statepoint, Type *ResultType, const Twine &Name) { Intrinsic::ID ID = Intrinsic::experimental_gc_result; - Module *M = BB->getParent()->getParent(); Type *Types[] = {ResultType}; - Function *FnGCResult = Intrinsic::getOrInsertDeclaration(M, ID, Types); Value *Args[] = {Statepoint}; - return CreateCall(FnGCResult, Args, {}, Name); + return CreateIntrinsic(ID, Types, Args, {}, Name); } CallInst *IRBuilderBase::CreateGCRelocate(Instruction *Statepoint, int BaseOffset, int DerivedOffset, Type *ResultType, const Twine &Name) { - Module *M = BB->getParent()->getParent(); Type *Types[] = {ResultType}; - Function *FnGCRelocate = Intrinsic::getOrInsertDeclaration( - M, Intrinsic::experimental_gc_relocate, Types); Value *Args[] = {Statepoint, getInt32(BaseOffset), getInt32(DerivedOffset)}; - return CreateCall(FnGCRelocate, Args, {}, Name); + return CreateIntrinsic(Intrinsic::experimental_gc_relocate, Types, Args, {}, + Name); } CallInst *IRBuilderBase::CreateGCGetPointerBase(Value *DerivedPtr, const Twine &Name) { - Module *M = BB->getParent()->getParent(); Type *PtrTy = DerivedPtr->getType(); - Function *FnGCFindBase = Intrinsic::getOrInsertDeclaration( - M, Intrinsic::experimental_gc_get_pointer_base, {PtrTy, PtrTy}); - return CreateCall(FnGCFindBase, {DerivedPtr}, {}, Name); + return CreateIntrinsic(Intrinsic::experimental_gc_get_pointer_base, + {PtrTy, PtrTy}, {DerivedPtr}, {}, Name); } CallInst *IRBuilderBase::CreateGCGetPointerOffset(Value *DerivedPtr, const Twine &Name) { - Module *M = BB->getParent()->getParent(); Type *PtrTy = DerivedPtr->getType(); - Function *FnGCGetOffset = Intrinsic::getOrInsertDeclaration( - M, Intrinsic::experimental_gc_get_pointer_offset, {PtrTy}); - return CreateCall(FnGCGetOffset, 
{DerivedPtr}, {}, Name); + return CreateIntrinsic(Intrinsic::experimental_gc_get_pointer_offset, {PtrTy}, + {DerivedPtr}, {}, Name); } CallInst *IRBuilderBase::CreateUnaryIntrinsic(Intrinsic::ID ID, Value *V, @@ -1236,13 +1191,10 @@ Value *IRBuilderBase::CreatePreserveArrayAccessIndex( Type *ResultType = GetElementPtrInst::getGEPReturnType(Base, IdxList); - Module *M = BB->getParent()->getParent(); - Function *FnPreserveArrayAccessIndex = Intrinsic::getOrInsertDeclaration( - M, Intrinsic::preserve_array_access_index, {ResultType, BaseType}); - Value *DimV = getInt32(Dimension); CallInst *Fn = - CreateCall(FnPreserveArrayAccessIndex, {Base, DimV, LastIndexV}); + CreateIntrinsic(Intrinsic::preserve_array_access_index, + {ResultType, BaseType}, {Base, DimV, LastIndexV}); Fn->addParamAttr( 0, Attribute::get(Fn->getContext(), Attribute::ElementType, ElTy)); if (DbgInfo) @@ -1257,13 +1209,9 @@ Value *IRBuilderBase::CreatePreserveUnionAccessIndex( "Invalid Base ptr type for preserve.union.access.index."); auto *BaseType = Base->getType(); - Module *M = BB->getParent()->getParent(); - Function *FnPreserveUnionAccessIndex = Intrinsic::getOrInsertDeclaration( - M, Intrinsic::preserve_union_access_index, {BaseType, BaseType}); - Value *DIIndex = getInt32(FieldIndex); - CallInst *Fn = - CreateCall(FnPreserveUnionAccessIndex, {Base, DIIndex}); + CallInst *Fn = CreateIntrinsic(Intrinsic::preserve_union_access_index, + {BaseType, BaseType}, {Base, DIIndex}); if (DbgInfo) Fn->setMetadata(LLVMContext::MD_preserve_access_index, DbgInfo); @@ -1282,13 +1230,10 @@ Value *IRBuilderBase::CreatePreserveStructAccessIndex( Type *ResultType = GetElementPtrInst::getGEPReturnType(Base, {Zero, GEPIndex}); - Module *M = BB->getParent()->getParent(); - Function *FnPreserveStructAccessIndex = Intrinsic::getOrInsertDeclaration( - M, Intrinsic::preserve_struct_access_index, {ResultType, BaseType}); - Value *DIIndex = getInt32(FieldIndex); - CallInst *Fn = CreateCall(FnPreserveStructAccessIndex, - {Base, GEPIndex, DIIndex}); + CallInst *Fn = + CreateIntrinsic(Intrinsic::preserve_struct_access_index, + {ResultType, BaseType}, {Base, GEPIndex, DIIndex}); Fn->addParamAttr( 0, Attribute::get(Fn->getContext(), Attribute::ElementType, ElTy)); if (DbgInfo) @@ -1299,10 +1244,8 @@ Value *IRBuilderBase::CreatePreserveStructAccessIndex( Value *IRBuilderBase::createIsFPClass(Value *FPNum, unsigned Test) { ConstantInt *TestV = getInt32(Test); - Module *M = BB->getParent()->getParent(); - Function *FnIsFPClass = Intrinsic::getOrInsertDeclaration( - M, Intrinsic::is_fpclass, {FPNum->getType()}); - return CreateCall(FnIsFPClass, {FPNum, TestV}); + return CreateIntrinsic(Intrinsic::is_fpclass, {FPNum->getType()}, + {FPNum, TestV}); } CallInst *IRBuilderBase::CreateAlignmentAssumptionHelper(const DataLayout &DL, diff --git a/llvm/lib/IR/Intrinsics.cpp b/llvm/lib/IR/Intrinsics.cpp index ff8b4b7a020c2ffd5d4798fe504095bf9cfcfa3d..1b92daf15b463edb36a2191a3d2a0e36631b8fc9 100644 --- a/llvm/lib/IR/Intrinsics.cpp +++ b/llvm/lib/IR/Intrinsics.cpp @@ -724,6 +724,16 @@ Function *Intrinsic::getOrInsertDeclaration(Module *M, ID id, .getCallee()); } +Function *Intrinsic::getDeclarationIfExists(const Module *M, ID id) { + return M->getFunction(getName(id)); +} + +Function *Intrinsic::getDeclarationIfExists(Module *M, ID id, + ArrayRef Tys, + FunctionType *FT) { + return M->getFunction(getName(id, Tys, M, FT)); +} + // This defines the "Intrinsic::getIntrinsicForClangBuiltin()" method. 
#define GET_LLVM_INTRINSIC_FOR_CLANG_BUILTIN #include "llvm/IR/IntrinsicImpl.inc" diff --git a/llvm/lib/IR/LegacyPassManager.cpp b/llvm/lib/IR/LegacyPassManager.cpp index 96e2f1d7908ba65859012be40d663498419602d5..ce6f6c733f4bcc8b8e0dabd7bb6f83f5495c5a9c 100644 --- a/llvm/lib/IR/LegacyPassManager.cpp +++ b/llvm/lib/IR/LegacyPassManager.cpp @@ -104,15 +104,13 @@ void PMDataManager::emitInstrCountChangedRemark( [&FunctionToInstrCount](Function &MaybeChangedFn) { // Update the total module count. unsigned FnSize = MaybeChangedFn.getInstructionCount(); - auto It = FunctionToInstrCount.find(MaybeChangedFn.getName()); // If we created a new function, then we need to add it to the map and // say that it changed from 0 instructions to FnSize. - if (It == FunctionToInstrCount.end()) { - FunctionToInstrCount[MaybeChangedFn.getName()] = - std::pair(0, FnSize); + auto [It, Inserted] = FunctionToInstrCount.try_emplace( + MaybeChangedFn.getName(), 0, FnSize); + if (Inserted) return; - } // Insert the new function size into the second member of the pair. This // tells us whether or not this function changed in size. It->second.second = FnSize; diff --git a/llvm/lib/IR/RuntimeLibcalls.cpp b/llvm/lib/IR/RuntimeLibcalls.cpp index d806f8093459eec250bf2bbce929fa6de94b7148..06167559a77697d98621c31045cf5a0b02a4c0c2 100644 --- a/llvm/lib/IR/RuntimeLibcalls.cpp +++ b/llvm/lib/IR/RuntimeLibcalls.cpp @@ -49,6 +49,7 @@ void RuntimeLibcallsInfo::initLibcalls(const Triple &TT) { setLibcallName(RTLIB::ASIN_F128, "asinf128"); setLibcallName(RTLIB::ACOS_F128, "acosf128"); setLibcallName(RTLIB::ATAN_F128, "atanf128"); + setLibcallName(RTLIB::ATAN2_F128, "atan2f128"); setLibcallName(RTLIB::SINH_F128, "sinhf128"); setLibcallName(RTLIB::COSH_F128, "coshf128"); setLibcallName(RTLIB::TANH_F128, "tanhf128"); diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp index 2c7c3555716fddd03165b322b02e91ad5848b49d..f55bcd380c5d8e2c061b8f60948561c3a4bc8865 100644 --- a/llvm/lib/IR/Verifier.cpp +++ b/llvm/lib/IR/Verifier.cpp @@ -2012,7 +2012,7 @@ void Verifier::verifyParameterAttrs(AttributeSet Attrs, Type *Ty, Attrs.hasAttribute(Attribute::ReadOnly)), "Attributes writable and readonly are incompatible!", V); - AttributeMask IncompatibleAttrs = AttributeFuncs::typeIncompatible(Ty); + AttributeMask IncompatibleAttrs = AttributeFuncs::typeIncompatible(Ty, Attrs); for (Attribute Attr : Attrs) { if (!Attr.isStringAttribute() && IncompatibleAttrs.contains(Attr.getKindAsEnum())) { diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp index 90c4e2c3cd131c994fda5b5f6c03828c6574a0f5..0f53c6085121719e964e4f3aee305f0278609f5a 100644 --- a/llvm/lib/LTO/LTO.cpp +++ b/llvm/lib/LTO/LTO.cpp @@ -1120,13 +1120,13 @@ Error LTO::checkPartiallySplit() { if (!ThinLTO.CombinedIndex.partiallySplitLTOUnits()) return Error::success(); - Function *TypeTestFunc = RegularLTO.CombinedModule->getFunction( - Intrinsic::getName(Intrinsic::type_test)); - Function *TypeCheckedLoadFunc = RegularLTO.CombinedModule->getFunction( - Intrinsic::getName(Intrinsic::type_checked_load)); - Function *TypeCheckedLoadRelativeFunc = - RegularLTO.CombinedModule->getFunction( - Intrinsic::getName(Intrinsic::type_checked_load_relative)); + const Module *Combined = RegularLTO.CombinedModule.get(); + Function *TypeTestFunc = + Intrinsic::getDeclarationIfExists(Combined, Intrinsic::type_test); + Function *TypeCheckedLoadFunc = + Intrinsic::getDeclarationIfExists(Combined, Intrinsic::type_checked_load); + Function *TypeCheckedLoadRelativeFunc = Intrinsic::getDeclarationIfExists( 
+ Combined, Intrinsic::type_checked_load_relative); // First check if there are type tests / type checked loads in the // merged regular LTO module IR. diff --git a/llvm/lib/LTO/LTOModule.cpp b/llvm/lib/LTO/LTOModule.cpp index eac78069f4d2bcf66147e16103fa2b1152b68cf7..00eb8adb4e10357b473f589dcdc181d5ed2ef0df 100644 --- a/llvm/lib/LTO/LTOModule.cpp +++ b/llvm/lib/LTO/LTOModule.cpp @@ -406,11 +406,16 @@ void LTOModule::addDefinedFunctionSymbol(ModuleSymbolTable::Symbol Sym) { Buffer.c_str(); } - const Function *F = cast(cast(Sym)); - addDefinedFunctionSymbol(Buffer, F); + auto *GV = cast(Sym); + assert((isa(GV) || + (isa(GV) && + isa(cast(GV)->getAliasee()))) && + "Not function or function alias"); + + addDefinedFunctionSymbol(Buffer, GV); } -void LTOModule::addDefinedFunctionSymbol(StringRef Name, const Function *F) { +void LTOModule::addDefinedFunctionSymbol(StringRef Name, const GlobalValue *F) { // add to list of defined symbols addDefinedSymbol(Name, F, true); } @@ -611,7 +616,11 @@ void LTOModule::parseSymbols() { } assert(isa(GV)); - addDefinedDataSymbol(Sym); + + if (isa(cast(GV)->getAliasee())) + addDefinedFunctionSymbol(Sym); + else + addDefinedDataSymbol(Sym); } // make symbols for all undefines diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp index c9957e493264d6892d02ace173b0048b246c5735..05e997308e92a5cf300c9d2b80e735cbf2b1c6b5 100644 --- a/llvm/lib/Passes/PassBuilder.cpp +++ b/llvm/lib/Passes/PassBuilder.cpp @@ -82,6 +82,7 @@ #include "llvm/CodeGen/CodeGenPrepare.h" #include "llvm/CodeGen/DeadMachineInstructionElim.h" #include "llvm/CodeGen/DwarfEHPrepare.h" +#include "llvm/CodeGen/EarlyIfConversion.h" #include "llvm/CodeGen/ExpandLargeDivRem.h" #include "llvm/CodeGen/ExpandLargeFpConvert.h" #include "llvm/CodeGen/ExpandMemCmp.h" @@ -109,6 +110,7 @@ #include "llvm/CodeGen/MachinePassManager.h" #include "llvm/CodeGen/MachinePostDominators.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/MachineTraceMetrics.h" #include "llvm/CodeGen/MachineVerifier.h" #include "llvm/CodeGen/PHIElimination.h" #include "llvm/CodeGen/PreISelIntrinsicLowering.h" diff --git a/llvm/lib/ProfileData/Coverage/CoverageMappingReader.cpp b/llvm/lib/ProfileData/Coverage/CoverageMappingReader.cpp index 461fc43d32f8df223ad1ca4165754636ff3de945..8881bffe41c57c6e4f96fd9262e91a2fd658393e 100644 --- a/llvm/lib/ProfileData/Coverage/CoverageMappingReader.cpp +++ b/llvm/lib/ProfileData/Coverage/CoverageMappingReader.cpp @@ -18,14 +18,12 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringRef.h" -#include "llvm/BinaryFormat/Wasm.h" #include "llvm/Object/Archive.h" #include "llvm/Object/Binary.h" #include "llvm/Object/COFF.h" #include "llvm/Object/Error.h" #include "llvm/Object/MachOUniversal.h" #include "llvm/Object/ObjectFile.h" -#include "llvm/Object/Wasm.h" #include "llvm/ProfileData/InstrProf.h" #include "llvm/Support/Casting.h" #include "llvm/Support/Compression.h" @@ -1079,53 +1077,6 @@ lookupSections(ObjectFile &OF, InstrProfSectKind IPSK) { return Sections; } -/// Find a section that matches \p Name and is allocatable at runtime. -/// -/// Returns the contents of the section and its start offset in the object file. -static Expected> -lookupAllocatableSection(ObjectFile &OF, InstrProfSectKind IPSK) { - // On Wasm, allocatable sections can live only in data segments. 
- if (auto *WOF = dyn_cast(&OF)) { - std::vector Segments; - auto ObjFormat = OF.getTripleObjectFormat(); - auto Name = - getInstrProfSectionName(IPSK, ObjFormat, /*AddSegmentInfo=*/false); - for (const auto &DebugName : WOF->debugNames()) { - if (DebugName.Type != wasm::NameType::DATA_SEGMENT || - DebugName.Name != Name) - continue; - if (DebugName.Index >= WOF->dataSegments().size()) - return make_error(coveragemap_error::malformed); - auto &Segment = WOF->dataSegments()[DebugName.Index]; - Segments.push_back(&Segment); - } - if (Segments.empty()) - return make_error(coveragemap_error::no_data_found); - if (Segments.size() != 1) - return make_error(coveragemap_error::malformed); - - const auto &Segment = *Segments.front(); - auto &Data = Segment.Data; - StringRef Content(reinterpret_cast(Data.Content.data()), - Data.Content.size()); - return std::make_pair(Content, Segment.SectionOffset); - } - - // On other object file types, delegate to lookupSections to find the section. - auto Sections = lookupSections(OF, IPSK); - if (!Sections) - return Sections.takeError(); - if (Sections->size() != 1) - return make_error( - coveragemap_error::malformed, - "the size of coverage mapping section is not one"); - auto &Section = Sections->front(); - auto ContentsOrErr = Section.getContents(); - if (!ContentsOrErr) - return ContentsOrErr.takeError(); - return std::make_pair(*ContentsOrErr, Section.getAddress()); -} - static Expected> loadBinaryFormat(std::unique_ptr Bin, StringRef Arch, StringRef CompilationDir = "", @@ -1156,20 +1107,23 @@ loadBinaryFormat(std::unique_ptr Bin, StringRef Arch, // Look for the sections that we are interested in. auto ProfileNames = std::make_unique(); + std::vector NamesSectionRefs; // If IPSK_name is not found, fallback to search for IPK_covname, which is // used when binary correlation is enabled. - auto NamesSection = lookupAllocatableSection(*OF, IPSK_name); + auto NamesSection = lookupSections(*OF, IPSK_name); if (auto E = NamesSection.takeError()) { consumeError(std::move(E)); - NamesSection = lookupAllocatableSection(*OF, IPSK_covname); + NamesSection = lookupSections(*OF, IPSK_covname); if (auto E = NamesSection.takeError()) return std::move(E); } + NamesSectionRefs = *NamesSection; - uint64_t NamesAddress; - StringRef NamesContent; - std::tie(NamesContent, NamesAddress) = *NamesSection; - if (Error E = ProfileNames->create(NamesContent, NamesAddress)) + if (NamesSectionRefs.size() != 1) + return make_error( + coveragemap_error::malformed, + "the size of coverage mapping section is not one"); + if (Error E = ProfileNames->create(NamesSectionRefs.back())) return std::move(E); auto CoverageSection = lookupSections(*OF, IPSK_covmap); diff --git a/llvm/lib/Support/APInt.cpp b/llvm/lib/Support/APInt.cpp index 23e365f16d8fea7a53116ff1599a22ee694245f8..ea8295f95c751a878ddf436c2f81685a56384e6e 100644 --- a/llvm/lib/Support/APInt.cpp +++ b/llvm/lib/Support/APInt.cpp @@ -2227,7 +2227,7 @@ void APInt::toString(SmallVectorImpl &Str, unsigned Radix, bool Signed, while (*Prefix) { Str.push_back(*Prefix); ++Prefix; - }; + } // We insert the digits backward, then reverse them to get the right order. 
unsigned StartDig = Str.size(); diff --git a/llvm/lib/Support/FormatVariadic.cpp b/llvm/lib/Support/FormatVariadic.cpp index 7eb10887940513d65f1eda3e85ed41d4f0e87adf..f3e8d0a7fe6f3311741f06679463654e712ed848 100644 --- a/llvm/lib/Support/FormatVariadic.cpp +++ b/llvm/lib/Support/FormatVariadic.cpp @@ -64,11 +64,10 @@ static std::optional<ReplacementItem> parseReplacementItem(StringRef Spec) { AlignStyle Where = AlignStyle::Right; StringRef Options; unsigned Index = ~0U; - RepString = RepString.trim(); + RepString = RepString.ltrim(); // If index is not specified, keep it ~0U to indicate unresolved index. RepString.consumeInteger(0, Index); - RepString = RepString.trim(); if (RepString.consume_front(",")) { if (!consumeFieldLayout(RepString, Where, Align, Pad)) { @@ -76,9 +75,9 @@ static std::optional<ReplacementItem> parseReplacementItem(StringRef Spec) { return std::nullopt; } } - RepString = RepString.trim(); + RepString = RepString.ltrim(); if (RepString.consume_front(":")) { - Options = RepString.trim(); + Options = RepString; RepString = StringRef(); } RepString = RepString.trim(); diff --git a/llvm/lib/Support/rpmalloc/CACHE.md b/llvm/lib/Support/rpmalloc/CACHE.md index 052320baf532757e3856b08e0cd2bee29f3e86df..645093026debf17a312802ab367f4b12e167200f 100644 --- a/llvm/lib/Support/rpmalloc/CACHE.md +++ b/llvm/lib/Support/rpmalloc/CACHE.md @@ -1,19 +1,19 @@ -# Thread caches -rpmalloc has a thread cache of free memory blocks which can be used in allocations without interfering with other threads or going to system to map more memory, as well as a global cache shared by all threads to let spans of memory pages flow between threads. Configuring the size of these caches can be crucial to obtaining good performance while minimizing memory overhead blowup. Below is a simple case study using the benchmark tool to compare different thread cache configurations for rpmalloc. - -The rpmalloc thread cache is configured to be unlimited, performance oriented as meaning default values, size oriented where both thread cache and global cache is reduced significantly, or disabled where both thread and global caches are disabled and completely free pages are directly unmapped. - -The benchmark is configured to run threads allocating 150000 blocks distributed in the `[16, 16000]` bytes range with a linear falloff probability. It runs 1000 loops, and every iteration 75000 blocks (50%) are freed and allocated in a scattered pattern. There are no cross thread allocations/deallocations. Parameters: `benchmark n 0 0 0 1000 150000 75000 16 16000`. The benchmarks are run on an Ubuntu 16.10 machine with 8 cores (4 physical, HT) and 12GiB RAM. - -The benchmark also includes results for the standard library malloc implementation as a reference for comparison with the nocache setting. - -![Ubuntu 16.10 random [16, 16000] bytes, 8 cores](https://docs.google.com/spreadsheets/d/1NWNuar1z0uPCB5iVS_Cs6hSo2xPkTmZf0KsgWS_Fb_4/pubchart?oid=387883204&format=image) -![Ubuntu 16.10 random [16, 16000] bytes, 8 cores](https://docs.google.com/spreadsheets/d/1NWNuar1z0uPCB5iVS_Cs6hSo2xPkTmZf0KsgWS_Fb_4/pubchart?oid=1644710241&format=image) - -For single threaded case the unlimited cache and performance oriented cache settings have identical performance and memory overhead, indicating that the memory pages fit in the combined thread and global cache. 
As number of threads increase to 2-4 threads, the performance settings have slightly higher performance which can seem odd at first, but can be explained by low contention on the global cache where some memory pages can flow between threads without stalling, reducing the overall number of calls to map new memory pages (also indicated by the slightly lower memory overhead). - -As threads increase even more to 5-10 threads, the increased contention and eventual limit of global cache cause the unlimited setting to gain a slight advantage in performance. As expected the memory overhead remains constant for unlimited caches, while going down for performance setting when number of threads increases. - -The size oriented setting maintain good performance compared to the standard library while reducing the memory overhead compared to the performance setting with a decent amount. - -The nocache setting still outperforms the reference standard library allocator for workloads up to 6 threads while maintaining a near zero memory overhead, which is even slightly lower than the standard library. For use case scenarios where number of allocation of each size class is lower the overhead in rpmalloc from the 64KiB span size will of course increase. +# Thread caches +rpmalloc has a thread cache of free memory blocks which can be used in allocations without interfering with other threads or going to the system to map more memory, as well as a global cache shared by all threads to let spans of memory pages flow between threads. Configuring the size of these caches can be crucial to obtaining good performance while minimizing memory overhead. Below is a simple case study using the benchmark tool to compare different thread cache configurations for rpmalloc. + +The rpmalloc thread cache is configured to be unlimited, performance oriented (the default values), size oriented (both the thread cache and the global cache are reduced significantly), or disabled (both thread and global caches are disabled and completely free pages are directly unmapped). + +The benchmark is configured to run threads allocating 150000 blocks distributed in the `[16, 16000]` bytes range with a linear falloff probability. It runs 1000 loops, and every iteration 75000 blocks (50%) are freed and allocated in a scattered pattern. There are no cross thread allocations/deallocations. Parameters: `benchmark n 0 0 0 1000 150000 75000 16 16000`. The benchmarks are run on an Ubuntu 16.10 machine with 8 cores (4 physical, HT) and 12GiB RAM. + +The benchmark also includes results for the standard library malloc implementation as a reference for comparison with the nocache setting. + +![Ubuntu 16.10 random [16, 16000] bytes, 8 cores](https://docs.google.com/spreadsheets/d/1NWNuar1z0uPCB5iVS_Cs6hSo2xPkTmZf0KsgWS_Fb_4/pubchart?oid=387883204&format=image) +![Ubuntu 16.10 random [16, 16000] bytes, 8 cores](https://docs.google.com/spreadsheets/d/1NWNuar1z0uPCB5iVS_Cs6hSo2xPkTmZf0KsgWS_Fb_4/pubchart?oid=1644710241&format=image) + +For the single threaded case, the unlimited cache and performance oriented cache settings have identical performance and memory overhead, indicating that the memory pages fit in the combined thread and global cache. 
As the number of threads increases to 2-4, the performance oriented setting has slightly higher performance, which can seem odd at first but is explained by low contention on the global cache, where some memory pages can flow between threads without stalling, reducing the overall number of calls to map new memory pages (also indicated by the slightly lower memory overhead). + +As the number of threads increases further, to 5-10, the increased contention and the eventual limit of the global cache cause the unlimited setting to gain a slight advantage in performance. As expected, the memory overhead remains constant for unlimited caches, while it goes down for the performance oriented setting as the number of threads increases. + +The size oriented setting maintains good performance compared to the standard library while reducing the memory overhead compared to the performance oriented setting by a decent amount. + +The nocache setting still outperforms the reference standard library allocator for workloads up to 6 threads while maintaining a near zero memory overhead, which is even slightly lower than the standard library. For use cases where the number of allocations in each size class is lower, the overhead in rpmalloc from the 64KiB span size will of course increase. diff --git a/llvm/lib/Support/rpmalloc/README.md b/llvm/lib/Support/rpmalloc/README.md index 916bca0118d868695c154de526944cb6353deee8..2233df9da42d524c90728038b3961f6e7ebaeb26 100644 --- a/llvm/lib/Support/rpmalloc/README.md +++ b/llvm/lib/Support/rpmalloc/README.md @@ -1,220 +1,220 @@ -# rpmalloc - General Purpose Memory Allocator -This library provides a cross platform lock free thread caching 16-byte aligned memory allocator implemented in C. -This is a fork of rpmalloc 1.4.5. - -Platforms currently supported: - -- Windows -- MacOS -- iOS -- Linux -- Android -- Haiku - -The code should be easily portable to any platform with atomic operations and an mmap-style virtual memory management API. The API used to map/unmap memory pages can be configured in runtime to a custom implementation and mapping granularity/size. - -This library is put in the public domain; you can redistribute it and/or modify it without any restrictions. Or, if you choose, you can use it under the MIT license. - -# Performance -We believe rpmalloc is faster than most popular memory allocators like tcmalloc, hoard, ptmalloc3 and others without causing extra allocated memory overhead in the thread caches compared to these allocators. We also believe the implementation to be easier to read and modify compared to these allocators, as it is a single source file of ~3000 lines of C code. All allocations have a natural 16-byte alignment. - -Contained in a parallel repository is a benchmark utility that performs interleaved unaligned allocations and deallocations (both in-thread and cross-thread) in multiple threads. It measures number of memory operations performed per CPU second, as well as memory overhead by comparing the virtual memory mapped with the number of bytes requested in allocation calls. The setup of number of thread, cross-thread deallocation rate and allocation size limits is configured by command line arguments. - -https://github.com/mjansson/rpmalloc-benchmark - -Below is an example performance comparison chart of rpmalloc and other popular allocator implementations, with default configurations used. 
- -![Ubuntu 16.10, random [16, 8000] bytes, 8 cores](https://docs.google.com/spreadsheets/d/1NWNuar1z0uPCB5iVS_Cs6hSo2xPkTmZf0KsgWS_Fb_4/pubchart?oid=301017877&format=image) - -The benchmark producing these numbers were run on an Ubuntu 16.10 machine with 8 logical cores (4 physical, HT). The actual numbers are not to be interpreted as absolute performance figures, but rather as relative comparisons between the different allocators. For additional benchmark results, see the [BENCHMARKS](BENCHMARKS.md) file. - -Configuration of the thread and global caches can be important depending on your use pattern. See [CACHE](CACHE.md) for a case study and some comments/guidelines. - -# Required functions - -Before calling any other function in the API, you __MUST__ call the initialization function, either __rpmalloc_initialize__ or __rpmalloc_initialize_config__, or you will get undefined behaviour when calling other rpmalloc entry point. - -Before terminating your use of the allocator, you __SHOULD__ call __rpmalloc_finalize__ in order to release caches and unmap virtual memory, as well as prepare the allocator for global scope cleanup at process exit or dynamic library unload depending on your use case. - -# Using -The easiest way to use the library is simply adding __rpmalloc.[h|c]__ to your project and compile them along with your sources. This contains only the rpmalloc specific entry points and does not provide internal hooks to process and/or thread creation at the moment. You are required to call these functions from your own code in order to initialize and finalize the allocator in your process and threads: - -__rpmalloc_initialize__ : Call at process start to initialize the allocator - -__rpmalloc_initialize_config__ : Optional entry point to call at process start to initialize the allocator with a custom memory mapping backend, memory page size and mapping granularity. - -__rpmalloc_finalize__: Call at process exit to finalize the allocator - -__rpmalloc_thread_initialize__: Call at each thread start to initialize the thread local data for the allocator - -__rpmalloc_thread_finalize__: Call at each thread exit to finalize and release thread cache back to global cache - -__rpmalloc_config__: Get the current runtime configuration of the allocator - -Then simply use the __rpmalloc__/__rpfree__ and the other malloc style replacement functions. Remember all allocations are 16-byte aligned, so no need to call the explicit rpmemalign/rpaligned_alloc/rpposix_memalign functions unless you need greater alignment, they are simply wrappers to make it easier to replace in existing code. - -If you wish to override the standard library malloc family of functions and have automatic initialization/finalization of process and threads, define __ENABLE_OVERRIDE__ to non-zero which will include the `malloc.c` file in compilation of __rpmalloc.c__, and then rebuild the library or your project where you added the rpmalloc source. If you compile rpmalloc as a separate library you must make the linker use the override symbols from the library by referencing at least one symbol. The easiest way is to simply include `rpmalloc.h` in at least one source file and call `rpmalloc_linker_reference` somewhere - it's a dummy empty function. On Windows platforms and C++ overrides you have to `#include ` in at least one source file and also manually handle the initialize/finalize of the process and all threads. 
The list of libc entry points replaced may not be complete, use libc/stdc++ replacement only as a convenience for testing the library on an existing code base, not a final solution. - -For explicit first class heaps, see the __rpmalloc_heap_*__ API under [first class heaps](#first-class-heaps) section, requiring __RPMALLOC_FIRST_CLASS_HEAPS__ tp be defined to 1. - -# Building -To compile as a static library run the configure python script which generates a Ninja build script, then build using ninja. The ninja build produces two static libraries, one named `rpmalloc` and one named `rpmallocwrap`, where the latter includes the libc entry point overrides. - -The configure + ninja build also produces two shared object/dynamic libraries. The `rpmallocwrap` shared library can be used with LD_PRELOAD/DYLD_INSERT_LIBRARIES to inject in a preexisting binary, replacing any malloc/free family of function calls. This is only implemented for Linux and macOS targets. The list of libc entry points replaced may not be complete, use preloading as a convenience for testing the library on an existing binary, not a final solution. The dynamic library also provides automatic init/fini of process and threads for all platforms. - -The latest stable release is available in the master branch. For latest development code, use the develop branch. - -# Cache configuration options -Free memory pages are cached both per thread and in a global cache for all threads. The size of the thread caches is determined by an adaptive scheme where each cache is limited by a percentage of the maximum allocation count of the corresponding size class. The size of the global caches is determined by a multiple of the maximum of all thread caches. The factors controlling the cache sizes can be set by editing the individual defines in the `rpmalloc.c` source file for fine tuned control. - -__ENABLE_UNLIMITED_CACHE__: By default defined to 0, set to 1 to make all caches infinite, i.e never release spans to global cache unless thread finishes and never unmap memory pages back to the OS. Highest performance but largest memory overhead. - -__ENABLE_UNLIMITED_GLOBAL_CACHE__: By default defined to 0, set to 1 to make global caches infinite, i.e never unmap memory pages back to the OS. - -__ENABLE_UNLIMITED_THREAD_CACHE__: By default defined to 0, set to 1 to make thread caches infinite, i.e never release spans to global cache unless thread finishes. - -__ENABLE_GLOBAL_CACHE__: By default defined to 1, enables the global cache shared between all threads. Set to 0 to disable the global cache and directly unmap pages evicted from the thread cache. - -__ENABLE_THREAD_CACHE__: By default defined to 1, enables the per-thread cache. Set to 0 to disable the thread cache and directly unmap pages no longer in use (also disables the global cache). - -__ENABLE_ADAPTIVE_THREAD_CACHE__: Introduces a simple heuristics in the thread cache size, keeping 25% of the high water mark for each span count class. - -# Other configuration options -Detailed statistics are available if __ENABLE_STATISTICS__ is defined to 1 (default is 0, or disabled), either on compile command line or by setting the value in `rpmalloc.c`. This will cause a slight overhead in runtime to collect statistics for each memory operation, and will also add 4 bytes overhead per allocation to track sizes. - -Integer safety checks on all calls are enabled if __ENABLE_VALIDATE_ARGS__ is defined to 1 (default is 0, or disabled), either on compile command line or by setting the value in `rpmalloc.c`. 
If enabled, size arguments to the global entry points are verified not to cause integer overflows in calculations. - -Asserts are enabled if __ENABLE_ASSERTS__ is defined to 1 (default is 0, or disabled), either on compile command line or by setting the value in `rpmalloc.c`. - -To include __malloc.c__ in compilation and provide overrides of standard library malloc entry points define __ENABLE_OVERRIDE__ to 1. To enable automatic initialization of finalization of process and threads in order to preload the library into executables using standard library malloc, define __ENABLE_PRELOAD__ to 1. - -To enable the runtime configurable memory page and span sizes, define __RPMALLOC_CONFIGURABLE__ to 1. By default, memory page size is determined by system APIs and memory span size is set to 64KiB. - -To enable support for first class heaps, define __RPMALLOC_FIRST_CLASS_HEAPS__ to 1. By default, the first class heap API is disabled. - -# Huge pages -The allocator has support for huge/large pages on Windows, Linux and MacOS. To enable it, pass a non-zero value in the config value `enable_huge_pages` when initializing the allocator with `rpmalloc_initialize_config`. If the system does not support huge pages it will be automatically disabled. You can query the status by looking at `enable_huge_pages` in the config returned from a call to `rpmalloc_config` after initialization is done. - -# Quick overview -The allocator is similar in spirit to tcmalloc from the [Google Performance Toolkit](https://github.com/gperftools/gperftools). It uses separate heaps for each thread and partitions memory blocks according to a preconfigured set of size classes, up to 2MiB. Larger blocks are mapped and unmapped directly. Allocations for different size classes will be served from different set of memory pages, each "span" of pages is dedicated to one size class. Spans of pages can flow between threads when the thread cache overflows and are released to a global cache, or when the thread ends. Unlike tcmalloc, single blocks do not flow between threads, only entire spans of pages. - -# Implementation details -The allocator is based on a fixed but configurable page alignment (defaults to 64KiB) and 16 byte block alignment, where all runs of memory pages (spans) are mapped to this alignment boundary. On Windows this is automatically guaranteed up to 64KiB by the VirtualAlloc granularity, and on mmap systems it is achieved by oversizing the mapping and aligning the returned virtual memory address to the required boundaries. By aligning to a fixed size the free operation can locate the header of the memory span without having to do a table lookup (as tcmalloc does) by simply masking out the low bits of the address (for 64KiB this would be the low 16 bits). - -Memory blocks are divided into three categories. For 64KiB span size/alignment the small blocks are [16, 1024] bytes, medium blocks (1024, 32256] bytes, and large blocks (32256, 2097120] bytes. The three categories are further divided in size classes. If the span size is changed, the small block classes remain but medium blocks go from (1024, span size] bytes. - -Small blocks have a size class granularity of 16 bytes each in 64 buckets. Medium blocks have a granularity of 512 bytes, 61 buckets (default). Large blocks have the same granularity as the configured span size (default 64KiB). All allocations are fitted to these size class boundaries (an allocation of 36 bytes will allocate a block of 48 bytes). 
Each small and medium size class has an associated span (meaning a contiguous set of memory pages) configuration describing how many pages the size class will allocate each time the cache is empty and a new allocation is requested. - -Spans for small and medium blocks are cached in four levels to avoid calls to map/unmap memory pages. The first level is a per thread single active span for each size class. The second level is a per thread list of partially free spans for each size class. The third level is a per thread list of free spans. The fourth level is a global list of free spans. - -Each span for a small and medium size class keeps track of how many blocks are allocated/free, as well as a list of which blocks that are free for allocation. To avoid locks, each span is completely owned by the allocating thread, and all cross-thread deallocations will be deferred to the owner thread through a separate free list per span. - -Large blocks, or super spans, are cached in two levels. The first level is a per thread list of free super spans. The second level is a global list of free super spans. - -# Memory mapping -By default the allocator uses OS APIs to map virtual memory pages as needed, either `VirtualAlloc` on Windows or `mmap` on POSIX systems. If you want to use your own custom memory mapping provider you can use __rpmalloc_initialize_config__ and pass function pointers to map and unmap virtual memory. These function should reserve and free the requested number of bytes. - -The returned memory address from the memory map function MUST be aligned to the memory page size and the memory span size (which ever is larger), both of which is configurable. Either provide the page and span sizes during initialization using __rpmalloc_initialize_config__, or use __rpmalloc_config__ to find the required alignment which is equal to the maximum of page and span size. The span size MUST be a power of two in [4096, 262144] range, and be a multiple or divisor of the memory page size. - -Memory mapping requests are always done in multiples of the memory page size. You can specify a custom page size when initializing rpmalloc with __rpmalloc_initialize_config__, or pass 0 to let rpmalloc determine the system memory page size using OS APIs. The page size MUST be a power of two. - -To reduce system call overhead, memory spans are mapped in batches controlled by the `span_map_count` configuration variable (which defaults to the `DEFAULT_SPAN_MAP_COUNT` value if 0, which in turn is sized according to the cache configuration define, defaulting to 64). If the memory page size is larger than the span size, the number of spans to map in a single call will be adjusted to guarantee a multiple of the page size, and the spans will be kept mapped until the entire span range can be unmapped in one call (to avoid trying to unmap partial pages). - -On macOS and iOS mmap requests are tagged with tag 240 for easy identification with the vmmap tool. - -# Span breaking -Super spans (spans a multiple > 1 of the span size) can be subdivided into smaller spans to fulfill a need to map a new span of memory. By default the allocator will greedily grab and break any larger span from the available caches before mapping new virtual memory. However, spans can currently not be glued together to form larger super spans again. Subspans can traverse the cache and be used by different threads individually. 
- -A span that is a subspan of a larger super span can be individually decommitted to reduce physical memory pressure when the span is evicted from caches and scheduled to be unmapped. The entire original super span will keep track of the subspans it is broken up into, and when the entire range is decommitted the super span will be unmapped. This allows platforms like Windows that require the entire virtual memory range that was mapped in a call to VirtualAlloc to be unmapped in one call to VirtualFree, while still decommitting individual pages in subspans (if the page size is smaller than the span size). - -If you use a custom memory map/unmap function you need to take this into account by looking at the `release` parameter given to the `memory_unmap` function. It is set to 0 for decommitting individual pages and the total super span byte size for finally releasing the entire super span memory range. - -# Memory fragmentation -There is no memory fragmentation by the allocator in the sense that it will not leave unallocated and unusable "holes" in the memory pages by calls to allocate and free blocks of different sizes. This is due to the fact that the memory pages allocated for each size class is split up in perfectly aligned blocks which are not reused for a request of a different size. The block freed by a call to `rpfree` will always be immediately available for an allocation request within the same size class. - -However, there is memory fragmentation in the meaning that a request for x bytes followed by a request of y bytes where x and y are at least one size class different in size will return blocks that are at least one memory page apart in virtual address space. Only blocks of the same size will potentially be within the same memory page span. - -rpmalloc keeps an "active span" and free list for each size class. This leads to back-to-back allocations will most likely be served from within the same span of memory pages (unless the span runs out of free blocks). The rpmalloc implementation will also use any "holes" in memory pages in semi-filled spans before using a completely free span. - -# First class heaps -rpmalloc provides a first class heap type with explicit heap control API. Heaps are maintained with calls to __rpmalloc_heap_acquire__ and __rpmalloc_heap_release__ and allocations/frees are done with __rpmalloc_heap_alloc__ and __rpmalloc_heap_free__. See the `rpmalloc.h` documentation for the full list of functions in the heap API. The main use case of explicit heap control is to scope allocations in a heap and release everything with a single call to __rpmalloc_heap_free_all__ without having to maintain ownership of memory blocks. Note that the heap API is not thread-safe, the caller must make sure that each heap is only used in a single thread at any given time. - -# Producer-consumer scenario -Compared to the some other allocators, rpmalloc does not suffer as much from a producer-consumer thread scenario where one thread allocates memory blocks and another thread frees the blocks. In some allocators the free blocks need to traverse both the thread cache of the thread doing the free operations as well as the global cache before being reused in the allocating thread. In rpmalloc the freed blocks will be reused as soon as the allocating thread needs to get new spans from the thread cache. This enables faster release of completely freed memory pages as blocks in a memory page will not be aliased between different owning threads. 
- -# Best case scenarios -Threads that keep ownership of allocated memory blocks within the thread and free the blocks from the same thread will have optimal performance. - -Threads that have allocation patterns where the difference in memory usage high and low water marks fit within the thread cache thresholds in the allocator will never touch the global cache except during thread init/fini and have optimal performance. Tweaking the cache limits can be done on a per-size-class basis. - -# Worst case scenarios -Since each thread cache maps spans of memory pages per size class, a thread that allocates just a few blocks of each size class (16, 32, ...) for many size classes will never fill each bucket, and thus map a lot of memory pages while only using a small fraction of the mapped memory. However, the wasted memory will always be less than 4KiB (or the configured memory page size) per size class as each span is initialized one memory page at a time. The cache for free spans will be reused by all size classes. - -Threads that perform a lot of allocations and deallocations in a pattern that have a large difference in high and low water marks, and that difference is larger than the thread cache size, will put a lot of contention on the global cache. What will happen is the thread cache will overflow on each low water mark causing pages to be released to the global cache, then underflow on high water mark causing pages to be re-acquired from the global cache. This can be mitigated by changing the __MAX_SPAN_CACHE_DIVISOR__ define in the source code (at the cost of higher average memory overhead). - -# Caveats -VirtualAlloc has an internal granularity of 64KiB. However, mmap lacks this granularity control, and the implementation instead oversizes the memory mapping with configured span size to be able to always return a memory area with the required alignment. Since the extra memory pages are never touched this will not result in extra committed physical memory pages, but rather only increase virtual memory address space. - -All entry points assume the passed values are valid, for example passing an invalid pointer to free would most likely result in a segmentation fault. __The library does not try to guard against errors!__. - -To support global scope data doing dynamic allocation/deallocation such as C++ objects with custom constructors and destructors, the call to __rpmalloc_finalize__ will not completely terminate the allocator but rather empty all caches and put the allocator in finalization mode. Once this call has been made, the allocator is no longer thread safe and expects all remaining calls to originate from global data destruction on main thread. Any spans or heaps becoming free during this phase will be immediately unmapped to allow correct teardown of the process or dynamic library without any leaks. - -# Other languages - -[Johan Andersson](https://github.com/repi) at Embark has created a Rust wrapper available at [rpmalloc-rs](https://github.com/EmbarkStudios/rpmalloc-rs) - -[Stas Denisov](https://github.com/nxrighthere) has created a C# wrapper available at [Rpmalloc-CSharp](https://github.com/nxrighthere/Rpmalloc-CSharp) - -# License - -This is free and unencumbered software released into the public domain. - -Anyone is free to copy, modify, publish, use, compile, sell, or -distribute this software, either in source code form or as a compiled -binary, for any purpose, commercial or non-commercial, and by any -means. 
-
-In jurisdictions that recognize copyright laws, the author or authors
-of this software dedicate any and all copyright interest in the
-software to the public domain. We make this dedication for the benefit
-of the public at large and to the detriment of our heirs and
-successors. We intend this dedication to be an overt act of
-relinquishment in perpetuity of all present and future rights to this
-software under copyright law.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
-IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
-OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
-ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
-OTHER DEALINGS IN THE SOFTWARE.
-
-For more information, please refer to
-
-
-You can also use this software under the MIT license if public domain is
-not recognized in your country
-
-
-The MIT License (MIT)
-
-Copyright (c) 2017 Mattias Jansson
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
+# rpmalloc - General Purpose Memory Allocator
+This library provides a cross-platform, lock-free, thread-caching, 16-byte-aligned memory allocator implemented in C.
+This is a fork of rpmalloc 1.4.5.
+
+Platforms currently supported:
+
+- Windows
+- MacOS
+- iOS
+- Linux
+- Android
+- Haiku
+
+The code should be easily portable to any platform with atomic operations and an mmap-style virtual memory management API. The API used to map/unmap memory pages can be configured at runtime to a custom implementation and mapping granularity/size.
+
+This library is put in the public domain; you can redistribute it and/or modify it without any restrictions. Or, if you choose, you can use it under the MIT license.
+
+# Performance
+We believe rpmalloc is faster than most popular memory allocators like tcmalloc, hoard, ptmalloc3 and others, without causing extra allocated memory overhead in the thread caches compared to these allocators. We also believe the implementation to be easier to read and modify, as it is a single source file of ~3000 lines of C code. All allocations have a natural 16-byte alignment.
+
+Contained in a parallel repository is a benchmark utility that performs interleaved unaligned allocations and deallocations (both in-thread and cross-thread) in multiple threads. It measures the number of memory operations performed per CPU second, as well as the memory overhead, by comparing the virtual memory mapped with the number of bytes requested in allocation calls. The number of threads, the cross-thread deallocation rate and the allocation size limits are configured by command line arguments.
+
+https://github.com/mjansson/rpmalloc-benchmark
+
+Below is an example performance comparison chart of rpmalloc and other popular allocator implementations, with default configurations used.
+
+![Ubuntu 16.10, random [16, 8000] bytes, 8 cores](https://docs.google.com/spreadsheets/d/1NWNuar1z0uPCB5iVS_Cs6hSo2xPkTmZf0KsgWS_Fb_4/pubchart?oid=301017877&format=image)
+
+The benchmark producing these numbers was run on an Ubuntu 16.10 machine with 8 logical cores (4 physical, HT). The actual numbers are not to be interpreted as absolute performance figures, but rather as relative comparisons between the different allocators. For additional benchmark results, see the [BENCHMARKS](BENCHMARKS.md) file.
+
+Configuration of the thread and global caches can be important depending on your use pattern. See [CACHE](CACHE.md) for a case study and some comments/guidelines.
+
+# Required functions
+
+Before calling any other function in the API, you __MUST__ call the initialization function, either __rpmalloc_initialize__ or __rpmalloc_initialize_config__, or you will get undefined behaviour when calling any other rpmalloc entry point.
+
+Before terminating your use of the allocator, you __SHOULD__ call __rpmalloc_finalize__ in order to release caches and unmap virtual memory, as well as prepare the allocator for global scope cleanup at process exit or dynamic library unload, depending on your use case.
+
+# Using
+The easiest way to use the library is to simply add __rpmalloc.[h|c]__ to your project and compile them along with your sources. This contains only the rpmalloc specific entry points and does not provide internal hooks into process and/or thread creation at the moment. You are required to call these functions from your own code in order to initialize and finalize the allocator in your process and threads:
+
+__rpmalloc_initialize__: Call at process start to initialize the allocator
+
+__rpmalloc_initialize_config__: Optional entry point to call at process start to initialize the allocator with a custom memory mapping backend, memory page size and mapping granularity.
+
+__rpmalloc_finalize__: Call at process exit to finalize the allocator
+
+__rpmalloc_thread_initialize__: Call at each thread start to initialize the thread local data for the allocator
+
+__rpmalloc_thread_finalize__: Call at each thread exit to finalize and release the thread cache back to the global cache
+
+__rpmalloc_config__: Get the current runtime configuration of the allocator
+
+Then simply use the __rpmalloc__/__rpfree__ and the other malloc style replacement functions. Remember all allocations are 16-byte aligned, so there is no need to call the explicit rpmemalign/rpaligned_alloc/rpposix_memalign functions unless you need greater alignment; they are simply wrappers to make it easier to replace in existing code. A minimal lifecycle sketch is shown below.
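The following is a minimal sketch of the lifecycle calls described above, assuming the entry points of this 1.4.5 fork (note that `rpmalloc_thread_finalize` takes an integer flag here, as seen in `malloc.c`; verify the exact signatures against `rpmalloc.h`):

```c
#include "rpmalloc.h"
#include <pthread.h>
#include <stddef.h>

// Every thread that allocates must bracket its lifetime with the
// thread init/finalize calls; the flag to rpmalloc_thread_finalize
// releases the thread cache back to the global cache.
static void *worker(void *arg) {
  (void)arg;
  rpmalloc_thread_initialize();
  void *block = rpmalloc(128); // naturally 16-byte aligned
  rpfree(block);
  rpmalloc_thread_finalize(1);
  return NULL;
}

int main(void) {
  rpmalloc_initialize(); // MUST precede any other rpmalloc call
  pthread_t thread;
  pthread_create(&thread, NULL, worker, NULL);
  pthread_join(thread, NULL);
  rpmalloc_finalize(); // release caches, unmap virtual memory
  return 0;
}
```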
+If you wish to override the standard library malloc family of functions and have automatic initialization/finalization of process and threads, define __ENABLE_OVERRIDE__ to non-zero, which will include the `malloc.c` file in the compilation of __rpmalloc.c__, and then rebuild the library or your project where you added the rpmalloc source. If you compile rpmalloc as a separate library you must make the linker use the override symbols from the library by referencing at least one symbol. The easiest way is to simply include `rpmalloc.h` in at least one source file and call `rpmalloc_linker_reference` somewhere - it's a dummy empty function. On Windows platforms, for C++ overrides you have to `#include <rpnew.h>` in at least one source file and also manually handle the initialize/finalize of the process and all threads. The list of libc entry points replaced may not be complete; use libc/stdc++ replacement only as a convenience for testing the library on an existing code base, not as a final solution.
+
+For explicit first class heaps, see the __rpmalloc_heap_*__ API under the [first class heaps](#first-class-heaps) section, which requires __RPMALLOC_FIRST_CLASS_HEAPS__ to be defined to 1.
+
+# Building
+To compile as a static library, run the configure python script, which generates a Ninja build script, then build using ninja. The ninja build produces two static libraries, one named `rpmalloc` and one named `rpmallocwrap`, where the latter includes the libc entry point overrides.
+
+The configure + ninja build also produces two shared object/dynamic libraries. The `rpmallocwrap` shared library can be used with LD_PRELOAD/DYLD_INSERT_LIBRARIES to inject into a preexisting binary, replacing any malloc/free family of function calls. This is only implemented for Linux and macOS targets. The list of libc entry points replaced may not be complete; use preloading as a convenience for testing the library on an existing binary, not as a final solution. The dynamic library also provides automatic init/fini of process and threads for all platforms.
+
+The latest stable release is available in the master branch. For the latest development code, use the develop branch.
+
+# Cache configuration options
+Free memory pages are cached both per thread and in a global cache for all threads. The size of the thread caches is determined by an adaptive scheme where each cache is limited by a percentage of the maximum allocation count of the corresponding size class. The size of the global caches is determined by a multiple of the maximum of all thread caches. The factors controlling the cache sizes can be set by editing the individual defines in the `rpmalloc.c` source file for fine-tuned control (see the sketch after this list).
+
+__ENABLE_UNLIMITED_CACHE__: By default defined to 0, set to 1 to make all caches infinite, i.e. never release spans to the global cache unless the thread finishes and never unmap memory pages back to the OS. Highest performance but largest memory overhead.
+
+__ENABLE_UNLIMITED_GLOBAL_CACHE__: By default defined to 0, set to 1 to make the global caches infinite, i.e. never unmap memory pages back to the OS.
+
+__ENABLE_UNLIMITED_THREAD_CACHE__: By default defined to 0, set to 1 to make the thread caches infinite, i.e. never release spans to the global cache unless the thread finishes.
+
+__ENABLE_GLOBAL_CACHE__: By default defined to 1, enables the global cache shared between all threads. Set to 0 to disable the global cache and directly unmap pages evicted from the thread cache.
+
+__ENABLE_THREAD_CACHE__: By default defined to 1, enables the per-thread cache. Set to 0 to disable the thread cache and directly unmap pages no longer in use (also disables the global cache).
+
+__ENABLE_ADAPTIVE_THREAD_CACHE__: Introduces a simple heuristic for the thread cache size, keeping 25% of the high water mark for each span count class.
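As a sketch of the compile-time tuning above (the define names are the ones listed in this section; the chosen values are only an illustrative assumption, not a recommendation), a build could configure `rpmalloc.c` like this:

```c
/* Illustrative cache configuration for rpmalloc.c: keep the adaptive
   per-thread cache, but make the global cache unlimited so spans are
   never unmapped back to the OS, trading memory overhead for speed.
   These defines can be edited in rpmalloc.c or passed as -D flags on
   the compile command line. */
#define ENABLE_THREAD_CACHE 1
#define ENABLE_ADAPTIVE_THREAD_CACHE 1
#define ENABLE_GLOBAL_CACHE 1
#define ENABLE_UNLIMITED_GLOBAL_CACHE 1
```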
+
+# Other configuration options
+Detailed statistics are available if __ENABLE_STATISTICS__ is defined to 1 (default is 0, or disabled), either on the compile command line or by setting the value in `rpmalloc.c`. This will cause a slight runtime overhead to collect statistics for each memory operation, and will also add 4 bytes of overhead per allocation to track sizes.
+
+Integer safety checks on all calls are enabled if __ENABLE_VALIDATE_ARGS__ is defined to 1 (default is 0, or disabled), either on the compile command line or by setting the value in `rpmalloc.c`. If enabled, size arguments to the global entry points are verified not to cause integer overflows in calculations.
+
+Asserts are enabled if __ENABLE_ASSERTS__ is defined to 1 (default is 0, or disabled), either on the compile command line or by setting the value in `rpmalloc.c`.
+
+To include __malloc.c__ in the compilation and provide overrides of the standard library malloc entry points, define __ENABLE_OVERRIDE__ to 1. To enable automatic initialization and finalization of process and threads in order to preload the library into executables using standard library malloc, define __ENABLE_PRELOAD__ to 1.
+
+To enable the runtime configurable memory page and span sizes, define __RPMALLOC_CONFIGURABLE__ to 1. By default, the memory page size is determined by system APIs and the memory span size is set to 64KiB.
+
+To enable support for first class heaps, define __RPMALLOC_FIRST_CLASS_HEAPS__ to 1. By default, the first class heap API is disabled.
+
+# Huge pages
+The allocator has support for huge/large pages on Windows, Linux and MacOS. To enable it, pass a non-zero value in the config value `enable_huge_pages` when initializing the allocator with `rpmalloc_initialize_config`. If the system does not support huge pages it will be automatically disabled. You can query the status by looking at `enable_huge_pages` in the config returned from a call to `rpmalloc_config` after initialization is done.
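A minimal initialization sketch for this option follows (the `rpmalloc_config_t` field and function names are the ones documented above; the exact struct layout is an assumption to verify against `rpmalloc.h`):

```c
#include "rpmalloc.h"
#include <string.h>

int main(void) {
  // Zero-initialize so unset config fields keep their defaults, then
  // request huge pages; rpmalloc automatically disables the option if
  // the system cannot provide huge pages.
  rpmalloc_config_t config;
  memset(&config, 0, sizeof(config));
  config.enable_huge_pages = 1;
  rpmalloc_initialize_config(&config);

  // Query whether huge pages were actually enabled.
  int active = rpmalloc_config()->enable_huge_pages;
  (void)active;

  rpmalloc_finalize();
  return 0;
}
```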
+
+# Quick overview
+The allocator is similar in spirit to tcmalloc from the [Google Performance Toolkit](https://github.com/gperftools/gperftools). It uses separate heaps for each thread and partitions memory blocks according to a preconfigured set of size classes, up to 2MiB. Larger blocks are mapped and unmapped directly. Allocations for different size classes will be served from different sets of memory pages; each "span" of pages is dedicated to one size class. Spans of pages can flow between threads when the thread cache overflows and are released to a global cache, or when the thread ends. Unlike tcmalloc, single blocks do not flow between threads, only entire spans of pages.
+
+# Implementation details
+The allocator is based on a fixed but configurable page alignment (defaults to 64KiB) and 16-byte block alignment, where all runs of memory pages (spans) are mapped to this alignment boundary. On Windows this is automatically guaranteed up to 64KiB by the VirtualAlloc granularity, and on mmap systems it is achieved by oversizing the mapping and aligning the returned virtual memory address to the required boundaries. By aligning to a fixed size, the free operation can locate the header of the memory span without having to do a table lookup (as tcmalloc does) by simply masking out the low bits of the address (for 64KiB this would be the low 16 bits).
+
+Memory blocks are divided into three categories. For 64KiB span size/alignment the small blocks are [16, 1024] bytes, medium blocks (1024, 32256] bytes, and large blocks (32256, 2097120] bytes. The three categories are further divided into size classes. If the span size is changed, the small block classes remain but the medium blocks go from (1024, span size] bytes.
+
+Small blocks have a size class granularity of 16 bytes each in 64 buckets. Medium blocks have a granularity of 512 bytes, 61 buckets (default). Large blocks have the same granularity as the configured span size (default 64KiB). All allocations are fitted to these size class boundaries (an allocation of 36 bytes will allocate a block of 48 bytes). Each small and medium size class has an associated span (meaning a contiguous set of memory pages) configuration describing how many pages the size class will allocate each time the cache is empty and a new allocation is requested.
+
+Spans for small and medium blocks are cached in four levels to avoid calls to map/unmap memory pages. The first level is a per thread single active span for each size class. The second level is a per thread list of partially free spans for each size class. The third level is a per thread list of free spans. The fourth level is a global list of free spans.
+
+Each span for a small and medium size class keeps track of how many blocks are allocated/free, as well as a list of which blocks are free for allocation. To avoid locks, each span is completely owned by the allocating thread, and all cross-thread deallocations will be deferred to the owner thread through a separate free list per span.
+
+Large blocks, or super spans, are cached in two levels. The first level is a per thread list of free super spans. The second level is a global list of free super spans.
+
+# Memory mapping
+By default the allocator uses OS APIs to map virtual memory pages as needed, either `VirtualAlloc` on Windows or `mmap` on POSIX systems. If you want to use your own custom memory mapping provider you can use __rpmalloc_initialize_config__ and pass function pointers to map and unmap virtual memory. These functions should reserve and free the requested number of bytes. A sketch of such a provider is shown after this section.
+
+The returned memory address from the memory map function MUST be aligned to the memory page size and the memory span size (whichever is larger), both of which are configurable. Either provide the page and span sizes during initialization using __rpmalloc_initialize_config__, or use __rpmalloc_config__ to find the required alignment, which is equal to the maximum of the page and span sizes. The span size MUST be a power of two in the [4096, 262144] range, and be a multiple or divisor of the memory page size.
+
+Memory mapping requests are always done in multiples of the memory page size. You can specify a custom page size when initializing rpmalloc with __rpmalloc_initialize_config__, or pass 0 to let rpmalloc determine the system memory page size using OS APIs. The page size MUST be a power of two.
+
+To reduce system call overhead, memory spans are mapped in batches controlled by the `span_map_count` configuration variable (which defaults to the `DEFAULT_SPAN_MAP_COUNT` value if 0, which in turn is sized according to the cache configuration define, defaulting to 64). If the memory page size is larger than the span size, the number of spans to map in a single call will be adjusted to guarantee a multiple of the page size, and the spans will be kept mapped until the entire span range can be unmapped in one call (to avoid trying to unmap partial pages).
+
+On macOS and iOS mmap requests are tagged with tag 240 for easy identification with the vmmap tool.
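The following is an illustrative sketch of a custom POSIX mapping backend under the rules above. The `memory_map`/`memory_unmap` callback shapes and the `release` semantics follow this README, but the exact config field names and signatures are assumptions to check against `rpmalloc.h`, and 64KiB is assumed as the default span alignment:

```c
#include "rpmalloc.h"
#include <stdint.h>
#include <string.h>
#include <sys/mman.h>

#define SPAN_ALIGN ((size_t)65536) /* assumed default span size/alignment */

// Reserve pages with mmap, oversizing so we can return an address
// aligned to the span size as required above. The head and tail slack
// are unmapped again, so *offset can stay 0 and unmap stays symmetric.
static void *my_memory_map(size_t size, size_t *offset) {
  size_t mapped = size + SPAN_ALIGN;
  char *raw = (char *)mmap(0, mapped, PROT_READ | PROT_WRITE,
                           MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
  if (raw == MAP_FAILED)
    return 0;
  char *aligned = (char *)(((uintptr_t)raw + (SPAN_ALIGN - 1)) &
                           ~(uintptr_t)(SPAN_ALIGN - 1));
  size_t head = (size_t)(aligned - raw);
  if (head)
    munmap(raw, head);                       /* trim unaligned head slack */
  munmap(aligned + size, SPAN_ALIGN - head); /* trim tail slack */
  *offset = 0;
  return aligned;
}

// release == 0 asks for a decommit of the given pages only; a non-zero
// release carries the total byte size for unmapping the original range.
static void my_memory_unmap(void *address, size_t size, size_t offset,
                            size_t release) {
  (void)offset; /* always 0 in this sketch */
  if (release)
    munmap(address, release);
  else
    madvise(address, size, MADV_DONTNEED);
}

int main(void) {
  rpmalloc_config_t config;
  memset(&config, 0, sizeof(config));
  config.memory_map = my_memory_map;
  config.memory_unmap = my_memory_unmap;
  rpmalloc_initialize_config(&config);
  void *p = rpmalloc(64);
  rpfree(p);
  rpmalloc_finalize();
  return 0;
}
```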
+
+# Span breaking
+Super spans (spans that are a multiple > 1 of the span size) can be subdivided into smaller spans to fulfill a need to map a new span of memory. By default the allocator will greedily grab and break any larger span from the available caches before mapping new virtual memory. However, spans can currently not be glued together to form larger super spans again. Subspans can traverse the cache and be used by different threads individually.
+
+A span that is a subspan of a larger super span can be individually decommitted to reduce physical memory pressure when the span is evicted from caches and scheduled to be unmapped. The entire original super span will keep track of the subspans it is broken up into, and when the entire range is decommitted the super span will be unmapped. This allows platforms like Windows, which require the entire virtual memory range that was mapped in a call to VirtualAlloc to be unmapped in one call to VirtualFree, while still decommitting individual pages in subspans (if the page size is smaller than the span size).
+
+If you use a custom memory map/unmap function you need to take this into account by looking at the `release` parameter given to the `memory_unmap` function. It is set to 0 for decommitting individual pages and to the total super span byte size for finally releasing the entire super span memory range.
+
+# Memory fragmentation
+There is no memory fragmentation by the allocator in the sense that it will not leave unallocated and unusable "holes" in the memory pages by calls to allocate and free blocks of different sizes. This is due to the fact that the memory pages allocated for each size class are split up into perfectly aligned blocks which are not reused for a request of a different size. The block freed by a call to `rpfree` will always be immediately available for an allocation request within the same size class.
+
+However, there is memory fragmentation in the sense that a request for x bytes followed by a request of y bytes, where x and y are at least one size class apart, will return blocks that are at least one memory page apart in virtual address space. Only blocks of the same size will potentially be within the same memory page span.
+
+rpmalloc keeps an "active span" and free list for each size class. This means back-to-back allocations will most likely be served from within the same span of memory pages (unless the span runs out of free blocks). The rpmalloc implementation will also use any "holes" in memory pages in semi-filled spans before using a completely free span.
+
+# First class heaps
+rpmalloc provides a first class heap type with an explicit heap control API. Heaps are maintained with calls to __rpmalloc_heap_acquire__ and __rpmalloc_heap_release__, and allocations/frees are done with __rpmalloc_heap_alloc__ and __rpmalloc_heap_free__. See the `rpmalloc.h` documentation for the full list of functions in the heap API. The main use case of explicit heap control is to scope allocations in a heap and release everything with a single call to __rpmalloc_heap_free_all__ without having to maintain ownership of memory blocks. Note that the heap API is not thread-safe; the caller must make sure that each heap is only used in a single thread at any given time.
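A usage sketch of this scoped-allocation pattern (requires __RPMALLOC_FIRST_CLASS_HEAPS__ defined to 1; the signatures are assumed from the function names above and should be verified against `rpmalloc.h`):

```c
#include "rpmalloc.h"

// Build short-lived state on a private heap, then drop it all at once
// instead of tracking and freeing every block individually.
static void handle_request(void) {
  rpmalloc_heap_t *heap = rpmalloc_heap_acquire();

  char *buffer = (char *)rpmalloc_heap_alloc(heap, 4096);
  int *counters = (int *)rpmalloc_heap_alloc(heap, 64 * sizeof(int));
  (void)buffer;
  (void)counters;

  // Individual blocks could go back via rpmalloc_heap_free(heap, ptr),
  // but scoped teardown releases everything in one call:
  rpmalloc_heap_free_all(heap);
  rpmalloc_heap_release(heap);
}

int main(void) {
  rpmalloc_initialize();
  handle_request(); // the heap must stay on a single thread at a time
  rpmalloc_finalize();
  return 0;
}
```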
+
+# Producer-consumer scenario
+Compared to some other allocators, rpmalloc does not suffer as much from a producer-consumer thread scenario where one thread allocates memory blocks and another thread frees the blocks. In some allocators the free blocks need to traverse both the thread cache of the thread doing the free operations as well as the global cache before being reused in the allocating thread. In rpmalloc the freed blocks will be reused as soon as the allocating thread needs to get new spans from the thread cache. This enables faster release of completely freed memory pages, as blocks in a memory page will not be aliased between different owning threads.
+
+# Best case scenarios
+Threads that keep ownership of allocated memory blocks within the thread and free the blocks from the same thread will have optimal performance.
+
+Threads that have allocation patterns where the difference between the memory usage high and low water marks fits within the thread cache thresholds in the allocator will never touch the global cache except during thread init/fini, and have optimal performance. Tweaking the cache limits can be done on a per-size-class basis.
+
+# Worst case scenarios
+Since each thread cache maps spans of memory pages per size class, a thread that allocates just a few blocks of each size class (16, 32, ...) for many size classes will never fill each bucket, and thus map a lot of memory pages while only using a small fraction of the mapped memory. However, the wasted memory will always be less than 4KiB (or the configured memory page size) per size class, as each span is initialized one memory page at a time. The cache for free spans will be reused by all size classes.
+
+Threads that perform a lot of allocations and deallocations in a pattern that has a large difference between the high and low water marks, where that difference is larger than the thread cache size, will put a lot of contention on the global cache. What will happen is that the thread cache will overflow on each low water mark, causing pages to be released to the global cache, then underflow on each high water mark, causing pages to be re-acquired from the global cache. This can be mitigated by changing the __MAX_SPAN_CACHE_DIVISOR__ define in the source code (at the cost of higher average memory overhead).
+
+# Caveats
+VirtualAlloc has an internal granularity of 64KiB. However, mmap lacks this granularity control, and the implementation instead oversizes the memory mapping with the configured span size to be able to always return a memory area with the required alignment. Since the extra memory pages are never touched this will not result in extra committed physical memory pages, but rather only increase virtual memory address space.
+
+All entry points assume the passed values are valid; for example, passing an invalid pointer to free would most likely result in a segmentation fault. __The library does not try to guard against errors!__
+
+To support global scope data doing dynamic allocation/deallocation, such as C++ objects with custom constructors and destructors, the call to __rpmalloc_finalize__ will not completely terminate the allocator but rather empty all caches and put the allocator in finalization mode. Once this call has been made, the allocator is no longer thread-safe and expects all remaining calls to originate from global data destruction on the main thread. Any spans or heaps becoming free during this phase will be immediately unmapped, to allow correct teardown of the process or dynamic library without any leaks.
+ +# Other languages + +[Johan Andersson](https://github.com/repi) at Embark has created a Rust wrapper available at [rpmalloc-rs](https://github.com/EmbarkStudios/rpmalloc-rs) + +[Stas Denisov](https://github.com/nxrighthere) has created a C# wrapper available at [Rpmalloc-CSharp](https://github.com/nxrighthere/Rpmalloc-CSharp) + +# License + +This is free and unencumbered software released into the public domain. + +Anyone is free to copy, modify, publish, use, compile, sell, or +distribute this software, either in source code form or as a compiled +binary, for any purpose, commercial or non-commercial, and by any +means. + +In jurisdictions that recognize copyright laws, the author or authors +of this software dedicate any and all copyright interest in the +software to the public domain. We make this dedication for the benefit +of the public at large and to the detriment of our heirs and +successors. We intend this dedication to be an overt act of +relinquishment in perpetuity of all present and future rights to this +software under copyright law. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +OTHER DEALINGS IN THE SOFTWARE. + +For more information, please refer to + + +You can also use this software under the MIT license if public domain is +not recognized in your country + + +The MIT License (MIT) + +Copyright (c) 2017 Mattias Jansson + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. diff --git a/llvm/lib/Support/rpmalloc/malloc.c b/llvm/lib/Support/rpmalloc/malloc.c index 3fcfe848250c6bf36930e77ab493f2a50c600cce..59e13aab3ef7ed82c5754d93e2c9cc5cf9c2913c 100644 --- a/llvm/lib/Support/rpmalloc/malloc.c +++ b/llvm/lib/Support/rpmalloc/malloc.c @@ -1,724 +1,724 @@ -//===------------------------ malloc.c ------------------*- C -*-=============// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This library provides a cross-platform lock free thread caching malloc -// implementation in C11. 
-// -// -// This file provides overrides for the standard library malloc entry points for -// C and new/delete operators for C++ It also provides automatic -// initialization/finalization of process and threads -// -//===----------------------------------------------------------------------===// - -#if defined(__TINYC__) -#include -#endif - -#ifndef ARCH_64BIT -#if defined(__LLP64__) || defined(__LP64__) || defined(_WIN64) -#define ARCH_64BIT 1 -_Static_assert(sizeof(size_t) == 8, "Data type size mismatch"); -_Static_assert(sizeof(void *) == 8, "Data type size mismatch"); -#else -#define ARCH_64BIT 0 -_Static_assert(sizeof(size_t) == 4, "Data type size mismatch"); -_Static_assert(sizeof(void *) == 4, "Data type size mismatch"); -#endif -#endif - -#if (defined(__GNUC__) || defined(__clang__)) -#pragma GCC visibility push(default) -#endif - -#define USE_IMPLEMENT 1 -#define USE_INTERPOSE 0 -#define USE_ALIAS 0 - -#if defined(__APPLE__) -#undef USE_INTERPOSE -#define USE_INTERPOSE 1 - -typedef struct interpose_t { - void *new_func; - void *orig_func; -} interpose_t; - -#define MAC_INTERPOSE_PAIR(newf, oldf) {(void *)newf, (void *)oldf} -#define MAC_INTERPOSE_SINGLE(newf, oldf) \ - __attribute__((used)) static const interpose_t macinterpose##newf##oldf \ - __attribute__((section("__DATA, __interpose"))) = \ - MAC_INTERPOSE_PAIR(newf, oldf) - -#endif - -#if !defined(_WIN32) && !defined(__APPLE__) -#undef USE_IMPLEMENT -#undef USE_ALIAS -#define USE_IMPLEMENT 0 -#define USE_ALIAS 1 -#endif - -#ifdef _MSC_VER -#pragma warning(disable : 4100) -#undef malloc -#undef free -#undef calloc -#define RPMALLOC_RESTRICT __declspec(restrict) -#else -#define RPMALLOC_RESTRICT -#endif - -#if ENABLE_OVERRIDE - -typedef struct rp_nothrow_t { - int __dummy; -} rp_nothrow_t; - -#if USE_IMPLEMENT - -extern inline RPMALLOC_RESTRICT void *RPMALLOC_CDECL malloc(size_t size) { - return rpmalloc(size); -} -extern inline RPMALLOC_RESTRICT void *RPMALLOC_CDECL calloc(size_t count, - size_t size) { - return rpcalloc(count, size); -} -extern inline RPMALLOC_RESTRICT void *RPMALLOC_CDECL realloc(void *ptr, - size_t size) { - return rprealloc(ptr, size); -} -extern inline void *RPMALLOC_CDECL reallocf(void *ptr, size_t size) { - return rprealloc(ptr, size); -} -extern inline void *RPMALLOC_CDECL aligned_alloc(size_t alignment, - size_t size) { - return rpaligned_alloc(alignment, size); -} -extern inline void *RPMALLOC_CDECL memalign(size_t alignment, size_t size) { - return rpmemalign(alignment, size); -} -extern inline int RPMALLOC_CDECL posix_memalign(void **memptr, size_t alignment, - size_t size) { - return rpposix_memalign(memptr, alignment, size); -} -extern inline void RPMALLOC_CDECL free(void *ptr) { rpfree(ptr); } -extern inline void RPMALLOC_CDECL cfree(void *ptr) { rpfree(ptr); } -extern inline size_t RPMALLOC_CDECL malloc_usable_size(void *ptr) { - return rpmalloc_usable_size(ptr); -} -extern inline size_t RPMALLOC_CDECL malloc_size(void *ptr) { - return rpmalloc_usable_size(ptr); -} - -#ifdef _WIN32 -extern inline RPMALLOC_RESTRICT void *RPMALLOC_CDECL _malloc_base(size_t size) { - return rpmalloc(size); -} -extern inline void RPMALLOC_CDECL _free_base(void *ptr) { rpfree(ptr); } -extern inline RPMALLOC_RESTRICT void *RPMALLOC_CDECL _calloc_base(size_t count, - size_t size) { - return rpcalloc(count, size); -} -extern inline size_t RPMALLOC_CDECL _msize(void *ptr) { - return rpmalloc_usable_size(ptr); -} -extern inline size_t RPMALLOC_CDECL _msize_base(void *ptr) { - return rpmalloc_usable_size(ptr); -} -extern 
inline RPMALLOC_RESTRICT void *RPMALLOC_CDECL -_realloc_base(void *ptr, size_t size) { - return rprealloc(ptr, size); -} -#endif - -#ifdef _WIN32 -// For Windows, #include in one source file to get the C++ operator -// overrides implemented in your module -#else -// Overload the C++ operators using the mangled names -// (https://itanium-cxx-abi.github.io/cxx-abi/abi.html#mangling) operators -// delete and delete[] -#define RPDEFVIS __attribute__((visibility("default"))) -extern void _ZdlPv(void *p); -void RPDEFVIS _ZdlPv(void *p) { rpfree(p); } -extern void _ZdaPv(void *p); -void RPDEFVIS _ZdaPv(void *p) { rpfree(p); } -#if ARCH_64BIT -// 64-bit operators new and new[], normal and aligned -extern void *_Znwm(uint64_t size); -void *RPDEFVIS _Znwm(uint64_t size) { return rpmalloc(size); } -extern void *_Znam(uint64_t size); -void *RPDEFVIS _Znam(uint64_t size) { return rpmalloc(size); } -extern void *_Znwmm(uint64_t size, uint64_t align); -void *RPDEFVIS _Znwmm(uint64_t size, uint64_t align) { - return rpaligned_alloc(align, size); -} -extern void *_Znamm(uint64_t size, uint64_t align); -void *RPDEFVIS _Znamm(uint64_t size, uint64_t align) { - return rpaligned_alloc(align, size); -} -extern void *_ZnwmSt11align_val_t(uint64_t size, uint64_t align); -void *RPDEFVIS _ZnwmSt11align_val_t(uint64_t size, uint64_t align) { - return rpaligned_alloc(align, size); -} -extern void *_ZnamSt11align_val_t(uint64_t size, uint64_t align); -void *RPDEFVIS _ZnamSt11align_val_t(uint64_t size, uint64_t align) { - return rpaligned_alloc(align, size); -} -extern void *_ZnwmRKSt9nothrow_t(uint64_t size, rp_nothrow_t t); -void *RPDEFVIS _ZnwmRKSt9nothrow_t(uint64_t size, rp_nothrow_t t) { - (void)sizeof(t); - return rpmalloc(size); -} -extern void *_ZnamRKSt9nothrow_t(uint64_t size, rp_nothrow_t t); -void *RPDEFVIS _ZnamRKSt9nothrow_t(uint64_t size, rp_nothrow_t t) { - (void)sizeof(t); - return rpmalloc(size); -} -extern void *_ZnwmSt11align_val_tRKSt9nothrow_t(uint64_t size, uint64_t align, - rp_nothrow_t t); -void *RPDEFVIS _ZnwmSt11align_val_tRKSt9nothrow_t(uint64_t size, uint64_t align, - rp_nothrow_t t) { - (void)sizeof(t); - return rpaligned_alloc(align, size); -} -extern void *_ZnamSt11align_val_tRKSt9nothrow_t(uint64_t size, uint64_t align, - rp_nothrow_t t); -void *RPDEFVIS _ZnamSt11align_val_tRKSt9nothrow_t(uint64_t size, uint64_t align, - rp_nothrow_t t) { - (void)sizeof(t); - return rpaligned_alloc(align, size); -} -// 64-bit operators sized delete and delete[], normal and aligned -extern void _ZdlPvm(void *p, uint64_t size); -void RPDEFVIS _ZdlPvm(void *p, uint64_t size) { - rpfree(p); - (void)sizeof(size); -} -extern void _ZdaPvm(void *p, uint64_t size); -void RPDEFVIS _ZdaPvm(void *p, uint64_t size) { - rpfree(p); - (void)sizeof(size); -} -extern void _ZdlPvSt11align_val_t(void *p, uint64_t align); -void RPDEFVIS _ZdlPvSt11align_val_t(void *p, uint64_t align) { - rpfree(p); - (void)sizeof(align); -} -extern void _ZdaPvSt11align_val_t(void *p, uint64_t align); -void RPDEFVIS _ZdaPvSt11align_val_t(void *p, uint64_t align) { - rpfree(p); - (void)sizeof(align); -} -extern void _ZdlPvmSt11align_val_t(void *p, uint64_t size, uint64_t align); -void RPDEFVIS _ZdlPvmSt11align_val_t(void *p, uint64_t size, uint64_t align) { - rpfree(p); - (void)sizeof(size); - (void)sizeof(align); -} -extern void _ZdaPvmSt11align_val_t(void *p, uint64_t size, uint64_t align); -void RPDEFVIS _ZdaPvmSt11align_val_t(void *p, uint64_t size, uint64_t align) { - rpfree(p); - (void)sizeof(size); - (void)sizeof(align); -} -#else -// 
32-bit operators new and new[], normal and aligned -extern void *_Znwj(uint32_t size); -void *RPDEFVIS _Znwj(uint32_t size) { return rpmalloc(size); } -extern void *_Znaj(uint32_t size); -void *RPDEFVIS _Znaj(uint32_t size) { return rpmalloc(size); } -extern void *_Znwjj(uint32_t size, uint32_t align); -void *RPDEFVIS _Znwjj(uint32_t size, uint32_t align) { - return rpaligned_alloc(align, size); -} -extern void *_Znajj(uint32_t size, uint32_t align); -void *RPDEFVIS _Znajj(uint32_t size, uint32_t align) { - return rpaligned_alloc(align, size); -} -extern void *_ZnwjSt11align_val_t(size_t size, size_t align); -void *RPDEFVIS _ZnwjSt11align_val_t(size_t size, size_t align) { - return rpaligned_alloc(align, size); -} -extern void *_ZnajSt11align_val_t(size_t size, size_t align); -void *RPDEFVIS _ZnajSt11align_val_t(size_t size, size_t align) { - return rpaligned_alloc(align, size); -} -extern void *_ZnwjRKSt9nothrow_t(size_t size, rp_nothrow_t t); -void *RPDEFVIS _ZnwjRKSt9nothrow_t(size_t size, rp_nothrow_t t) { - (void)sizeof(t); - return rpmalloc(size); -} -extern void *_ZnajRKSt9nothrow_t(size_t size, rp_nothrow_t t); -void *RPDEFVIS _ZnajRKSt9nothrow_t(size_t size, rp_nothrow_t t) { - (void)sizeof(t); - return rpmalloc(size); -} -extern void *_ZnwjSt11align_val_tRKSt9nothrow_t(size_t size, size_t align, - rp_nothrow_t t); -void *RPDEFVIS _ZnwjSt11align_val_tRKSt9nothrow_t(size_t size, size_t align, - rp_nothrow_t t) { - (void)sizeof(t); - return rpaligned_alloc(align, size); -} -extern void *_ZnajSt11align_val_tRKSt9nothrow_t(size_t size, size_t align, - rp_nothrow_t t); -void *RPDEFVIS _ZnajSt11align_val_tRKSt9nothrow_t(size_t size, size_t align, - rp_nothrow_t t) { - (void)sizeof(t); - return rpaligned_alloc(align, size); -} -// 32-bit operators sized delete and delete[], normal and aligned -extern void _ZdlPvj(void *p, uint64_t size); -void RPDEFVIS _ZdlPvj(void *p, uint64_t size) { - rpfree(p); - (void)sizeof(size); -} -extern void _ZdaPvj(void *p, uint64_t size); -void RPDEFVIS _ZdaPvj(void *p, uint64_t size) { - rpfree(p); - (void)sizeof(size); -} -extern void _ZdlPvSt11align_val_t(void *p, uint32_t align); -void RPDEFVIS _ZdlPvSt11align_val_t(void *p, uint64_t a) { - rpfree(p); - (void)sizeof(align); -} -extern void _ZdaPvSt11align_val_t(void *p, uint32_t align); -void RPDEFVIS _ZdaPvSt11align_val_t(void *p, uint64_t a) { - rpfree(p); - (void)sizeof(align); -} -extern void _ZdlPvjSt11align_val_t(void *p, uint32_t size, uint32_t align); -void RPDEFVIS _ZdlPvjSt11align_val_t(void *p, uint64_t size, uint64_t align) { - rpfree(p); - (void)sizeof(size); - (void)sizeof(a); -} -extern void _ZdaPvjSt11align_val_t(void *p, uint32_t size, uint32_t align); -void RPDEFVIS _ZdaPvjSt11align_val_t(void *p, uint64_t size, uint64_t align) { - rpfree(p); - (void)sizeof(size); - (void)sizeof(a); -} -#endif -#endif -#endif - -#if USE_INTERPOSE || USE_ALIAS - -static void *rpmalloc_nothrow(size_t size, rp_nothrow_t t) { - (void)sizeof(t); - return rpmalloc(size); -} -static void *rpaligned_alloc_reverse(size_t size, size_t align) { - return rpaligned_alloc(align, size); -} -static void *rpaligned_alloc_reverse_nothrow(size_t size, size_t align, - rp_nothrow_t t) { - (void)sizeof(t); - return rpaligned_alloc(align, size); -} -static void rpfree_size(void *p, size_t size) { - (void)sizeof(size); - rpfree(p); -} -static void rpfree_aligned(void *p, size_t align) { - (void)sizeof(align); - rpfree(p); -} -static void rpfree_size_aligned(void *p, size_t size, size_t align) { - (void)sizeof(size); - 
(void)sizeof(align); - rpfree(p); -} - -#endif - -#if USE_INTERPOSE - -__attribute__((used)) static const interpose_t macinterpose_malloc[] - __attribute__((section("__DATA, __interpose"))) = { - // new and new[] - MAC_INTERPOSE_PAIR(rpmalloc, _Znwm), - MAC_INTERPOSE_PAIR(rpmalloc, _Znam), - MAC_INTERPOSE_PAIR(rpaligned_alloc_reverse, _Znwmm), - MAC_INTERPOSE_PAIR(rpaligned_alloc_reverse, _Znamm), - MAC_INTERPOSE_PAIR(rpmalloc_nothrow, _ZnwmRKSt9nothrow_t), - MAC_INTERPOSE_PAIR(rpmalloc_nothrow, _ZnamRKSt9nothrow_t), - MAC_INTERPOSE_PAIR(rpaligned_alloc_reverse, _ZnwmSt11align_val_t), - MAC_INTERPOSE_PAIR(rpaligned_alloc_reverse, _ZnamSt11align_val_t), - MAC_INTERPOSE_PAIR(rpaligned_alloc_reverse_nothrow, - _ZnwmSt11align_val_tRKSt9nothrow_t), - MAC_INTERPOSE_PAIR(rpaligned_alloc_reverse_nothrow, - _ZnamSt11align_val_tRKSt9nothrow_t), - // delete and delete[] - MAC_INTERPOSE_PAIR(rpfree, _ZdlPv), MAC_INTERPOSE_PAIR(rpfree, _ZdaPv), - MAC_INTERPOSE_PAIR(rpfree_size, _ZdlPvm), - MAC_INTERPOSE_PAIR(rpfree_size, _ZdaPvm), - MAC_INTERPOSE_PAIR(rpfree_aligned, _ZdlPvSt11align_val_t), - MAC_INTERPOSE_PAIR(rpfree_aligned, _ZdaPvSt11align_val_t), - MAC_INTERPOSE_PAIR(rpfree_size_aligned, _ZdlPvmSt11align_val_t), - MAC_INTERPOSE_PAIR(rpfree_size_aligned, _ZdaPvmSt11align_val_t), - // libc entry points - MAC_INTERPOSE_PAIR(rpmalloc, malloc), - MAC_INTERPOSE_PAIR(rpmalloc, calloc), - MAC_INTERPOSE_PAIR(rprealloc, realloc), - MAC_INTERPOSE_PAIR(rprealloc, reallocf), -#if defined(__MAC_10_15) && __MAC_OS_X_VERSION_MIN_REQUIRED >= __MAC_10_15 - MAC_INTERPOSE_PAIR(rpaligned_alloc, aligned_alloc), -#endif - MAC_INTERPOSE_PAIR(rpmemalign, memalign), - MAC_INTERPOSE_PAIR(rpposix_memalign, posix_memalign), - MAC_INTERPOSE_PAIR(rpfree, free), MAC_INTERPOSE_PAIR(rpfree, cfree), - MAC_INTERPOSE_PAIR(rpmalloc_usable_size, malloc_usable_size), - MAC_INTERPOSE_PAIR(rpmalloc_usable_size, malloc_size)}; - -#endif - -#if USE_ALIAS - -#define RPALIAS(fn) __attribute__((alias(#fn), used, visibility("default"))); - -// Alias the C++ operators using the mangled names -// (https://itanium-cxx-abi.github.io/cxx-abi/abi.html#mangling) - -// operators delete and delete[] -void _ZdlPv(void *p) RPALIAS(rpfree) void _ZdaPv(void *p) RPALIAS(rpfree) - -#if ARCH_64BIT - // 64-bit operators new and new[], normal and aligned - void *_Znwm(uint64_t size) RPMALLOC_ATTRIB_MALLOC - RPMALLOC_ATTRIB_ALLOC_SIZE(1) - RPALIAS(rpmalloc) void *_Znam(uint64_t size) RPMALLOC_ATTRIB_MALLOC - RPMALLOC_ATTRIB_ALLOC_SIZE(1) RPALIAS(rpmalloc) void *_Znwmm(uint64_t size, - uint64_t align) - RPALIAS(rpaligned_alloc_reverse) void *_Znamm(uint64_t size, - uint64_t align) - RPALIAS(rpaligned_alloc_reverse) void *_ZnwmSt11align_val_t( - size_t size, size_t align) - RPALIAS(rpaligned_alloc_reverse) void *_ZnamSt11align_val_t( - size_t size, size_t align) - RPALIAS(rpaligned_alloc_reverse) void *_ZnwmRKSt9nothrow_t( - size_t size, rp_nothrow_t t) - RPALIAS(rpmalloc_nothrow) void *_ZnamRKSt9nothrow_t( - size_t size, - rp_nothrow_t t) RPALIAS(rpmalloc_nothrow) void - *_ZnwmSt11align_val_tRKSt9nothrow_t(size_t size, - size_t align, - rp_nothrow_t t) - RPALIAS(rpaligned_alloc_reverse_nothrow) void - *_ZnamSt11align_val_tRKSt9nothrow_t( - size_t size, size_t align, - rp_nothrow_t t) - RPALIAS(rpaligned_alloc_reverse_nothrow) - // 64-bit operators delete and delete[], sized and aligned - void _ZdlPvm(void *p, size_t n) RPALIAS(rpfree_size) void _ZdaPvm(void *p, - size_t n) - RPALIAS(rpfree_size) void _ZdlPvSt11align_val_t(void *p, size_t a) - 
RPALIAS(rpfree_aligned) void _ZdaPvSt11align_val_t(void *p, - size_t a) - RPALIAS(rpfree_aligned) void _ZdlPvmSt11align_val_t(void *p, - size_t n, - size_t a) - RPALIAS(rpfree_size_aligned) void _ZdaPvmSt11align_val_t( - void *p, size_t n, size_t a) - RPALIAS(rpfree_size_aligned) -#else - // 32-bit operators new and new[], normal and aligned - void *_Znwj(uint32_t size) RPMALLOC_ATTRIB_MALLOC - RPMALLOC_ATTRIB_ALLOC_SIZE(1) - RPALIAS(rpmalloc) void *_Znaj(uint32_t size) RPMALLOC_ATTRIB_MALLOC - RPMALLOC_ATTRIB_ALLOC_SIZE(1) RPALIAS(rpmalloc) void *_Znwjj(uint32_t size, - uint32_t align) - RPALIAS(rpaligned_alloc_reverse) void *_Znajj(uint32_t size, - uint32_t align) - RPALIAS(rpaligned_alloc_reverse) void *_ZnwjSt11align_val_t( - size_t size, size_t align) - RPALIAS(rpaligned_alloc_reverse) void *_ZnajSt11align_val_t( - size_t size, size_t align) - RPALIAS(rpaligned_alloc_reverse) void *_ZnwjRKSt9nothrow_t( - size_t size, rp_nothrow_t t) - RPALIAS(rpmalloc_nothrow) void *_ZnajRKSt9nothrow_t( - size_t size, - rp_nothrow_t t) RPALIAS(rpmalloc_nothrow) void - *_ZnwjSt11align_val_tRKSt9nothrow_t(size_t size, - size_t align, - rp_nothrow_t t) - RPALIAS(rpaligned_alloc_reverse_nothrow) void - *_ZnajSt11align_val_tRKSt9nothrow_t( - size_t size, size_t align, - rp_nothrow_t t) - RPALIAS(rpaligned_alloc_reverse_nothrow) - // 32-bit operators delete and delete[], sized and aligned - void _ZdlPvj(void *p, size_t n) RPALIAS(rpfree_size) void _ZdaPvj(void *p, - size_t n) - RPALIAS(rpfree_size) void _ZdlPvSt11align_val_t(void *p, size_t a) - RPALIAS(rpfree_aligned) void _ZdaPvSt11align_val_t(void *p, - size_t a) - RPALIAS(rpfree_aligned) void _ZdlPvjSt11align_val_t(void *p, - size_t n, - size_t a) - RPALIAS(rpfree_size_aligned) void _ZdaPvjSt11align_val_t( - void *p, size_t n, size_t a) - RPALIAS(rpfree_size_aligned) -#endif - - void *malloc(size_t size) RPMALLOC_ATTRIB_MALLOC - RPMALLOC_ATTRIB_ALLOC_SIZE(1) - RPALIAS(rpmalloc) void *calloc(size_t count, size_t size) - RPALIAS(rpcalloc) void *realloc(void *ptr, size_t size) - RPALIAS(rprealloc) void *reallocf(void *ptr, size_t size) - RPMALLOC_ATTRIB_MALLOC - RPMALLOC_ATTRIB_ALLOC_SIZE(2) - RPALIAS(rprealloc) void *aligned_alloc(size_t alignment, size_t size) - RPALIAS(rpaligned_alloc) void *memalign( - size_t alignment, size_t size) RPMALLOC_ATTRIB_MALLOC - RPMALLOC_ATTRIB_ALLOC_SIZE(2) - RPALIAS(rpmemalign) int posix_memalign(void **memptr, size_t alignment, - size_t size) - RPALIAS(rpposix_memalign) void free(void *ptr) - RPALIAS(rpfree) void cfree(void *ptr) RPALIAS(rpfree) -#if defined(__ANDROID__) || defined(__FreeBSD__) - size_t - malloc_usable_size(const void *ptr) RPALIAS(rpmalloc_usable_size) -#else - size_t - malloc_usable_size(void *ptr) RPALIAS(rpmalloc_usable_size) -#endif - size_t malloc_size(void *ptr) RPALIAS(rpmalloc_usable_size) - -#endif - - static inline size_t _rpmalloc_page_size(void) { - return _memory_page_size; -} - -extern void *RPMALLOC_CDECL reallocarray(void *ptr, size_t count, size_t size); - -extern void *RPMALLOC_CDECL reallocarray(void *ptr, size_t count, size_t size) { - size_t total; -#if ENABLE_VALIDATE_ARGS -#ifdef _MSC_VER - int err = SizeTMult(count, size, &total); - if ((err != S_OK) || (total >= MAX_ALLOC_SIZE)) { - errno = EINVAL; - return 0; - } -#else - int err = __builtin_umull_overflow(count, size, &total); - if (err || (total >= MAX_ALLOC_SIZE)) { - errno = EINVAL; - return 0; - } -#endif -#else - total = count * size; -#endif - return realloc(ptr, total); -} - -extern inline void *RPMALLOC_CDECL 
valloc(size_t size) { - get_thread_heap(); - return rpaligned_alloc(_rpmalloc_page_size(), size); -} - -extern inline void *RPMALLOC_CDECL pvalloc(size_t size) { - get_thread_heap(); - const size_t page_size = _rpmalloc_page_size(); - const size_t aligned_size = ((size + page_size - 1) / page_size) * page_size; -#if ENABLE_VALIDATE_ARGS - if (aligned_size < size) { - errno = EINVAL; - return 0; - } -#endif - return rpaligned_alloc(_rpmalloc_page_size(), aligned_size); -} - -#endif // ENABLE_OVERRIDE - -#if ENABLE_PRELOAD - -#ifdef _WIN32 - -#if defined(BUILD_DYNAMIC_LINK) && BUILD_DYNAMIC_LINK - -extern __declspec(dllexport) BOOL WINAPI DllMain(HINSTANCE instance, - DWORD reason, LPVOID reserved); - -extern __declspec(dllexport) BOOL WINAPI DllMain(HINSTANCE instance, - DWORD reason, - LPVOID reserved) { - (void)sizeof(reserved); - (void)sizeof(instance); - if (reason == DLL_PROCESS_ATTACH) - rpmalloc_initialize(); - else if (reason == DLL_PROCESS_DETACH) - rpmalloc_finalize(); - else if (reason == DLL_THREAD_ATTACH) - rpmalloc_thread_initialize(); - else if (reason == DLL_THREAD_DETACH) - rpmalloc_thread_finalize(1); - return TRUE; -} - -// end BUILD_DYNAMIC_LINK -#else - -extern void _global_rpmalloc_init(void) { - rpmalloc_set_main_thread(); - rpmalloc_initialize(); -} - -#if defined(__clang__) || defined(__GNUC__) - -static void __attribute__((constructor)) initializer(void) { - _global_rpmalloc_init(); -} - -#elif defined(_MSC_VER) - -static int _global_rpmalloc_xib(void) { - _global_rpmalloc_init(); - return 0; -} - -#pragma section(".CRT$XIB", read) -__declspec(allocate(".CRT$XIB")) void (*_rpmalloc_module_init)(void) = - _global_rpmalloc_xib; -#if defined(_M_IX86) || defined(__i386__) -#pragma comment(linker, "/include:" \ - "__rpmalloc_module_init") -#else -#pragma comment(linker, "/include:" \ - "_rpmalloc_module_init") -#endif - -#endif - -// end !BUILD_DYNAMIC_LINK -#endif - -#else - -#include -#include -#include -#include - -extern void rpmalloc_set_main_thread(void); - -static pthread_key_t destructor_key; - -static void thread_destructor(void *); - -static void __attribute__((constructor)) initializer(void) { - rpmalloc_set_main_thread(); - rpmalloc_initialize(); - pthread_key_create(&destructor_key, thread_destructor); -} - -static void __attribute__((destructor)) finalizer(void) { rpmalloc_finalize(); } - -typedef struct { - void *(*real_start)(void *); - void *real_arg; -} thread_starter_arg; - -static void *thread_starter(void *argptr) { - thread_starter_arg *arg = argptr; - void *(*real_start)(void *) = arg->real_start; - void *real_arg = arg->real_arg; - rpmalloc_thread_initialize(); - rpfree(argptr); - pthread_setspecific(destructor_key, (void *)1); - return (*real_start)(real_arg); -} - -static void thread_destructor(void *value) { - (void)sizeof(value); - rpmalloc_thread_finalize(1); -} - -#ifdef __APPLE__ - -static int pthread_create_proxy(pthread_t *thread, const pthread_attr_t *attr, - void *(*start_routine)(void *), void *arg) { - rpmalloc_initialize(); - thread_starter_arg *starter_arg = rpmalloc(sizeof(thread_starter_arg)); - starter_arg->real_start = start_routine; - starter_arg->real_arg = arg; - return pthread_create(thread, attr, thread_starter, starter_arg); -} - -MAC_INTERPOSE_SINGLE(pthread_create_proxy, pthread_create); - -#else - -#include - -int pthread_create(pthread_t *thread, const pthread_attr_t *attr, - void *(*start_routine)(void *), void *arg) { -#if defined(__linux__) || defined(__FreeBSD__) || defined(__OpenBSD__) || \ - defined(__NetBSD__) 
|| defined(__DragonFly__) || defined(__APPLE__) || \ - defined(__HAIKU__) - char fname[] = "pthread_create"; -#else - char fname[] = "_pthread_create"; -#endif - void *real_pthread_create = dlsym(RTLD_NEXT, fname); - rpmalloc_thread_initialize(); - thread_starter_arg *starter_arg = rpmalloc(sizeof(thread_starter_arg)); - starter_arg->real_start = start_routine; - starter_arg->real_arg = arg; - return (*(int (*)(pthread_t *, const pthread_attr_t *, void *(*)(void *), - void *))real_pthread_create)(thread, attr, thread_starter, - starter_arg); -} - -#endif - -#endif - -#endif - -#if ENABLE_OVERRIDE - -#if defined(__GLIBC__) && defined(__linux__) - -void *__libc_malloc(size_t size) RPMALLOC_ATTRIB_MALLOC - RPMALLOC_ATTRIB_ALLOC_SIZE(1) - RPALIAS(rpmalloc) void *__libc_calloc(size_t count, size_t size) - RPMALLOC_ATTRIB_MALLOC RPMALLOC_ATTRIB_ALLOC_SIZE2(1, 2) - RPALIAS(rpcalloc) void *__libc_realloc(void *p, size_t size) - RPMALLOC_ATTRIB_MALLOC - RPMALLOC_ATTRIB_ALLOC_SIZE(2) RPALIAS(rprealloc) void __libc_free(void *p) - RPALIAS(rpfree) void __libc_cfree(void *p) - RPALIAS(rpfree) void *__libc_memalign(size_t align, size_t size) - RPMALLOC_ATTRIB_MALLOC - RPMALLOC_ATTRIB_ALLOC_SIZE(2) - RPALIAS(rpmemalign) int __posix_memalign(void **p, size_t align, - size_t size) - RPALIAS(rpposix_memalign) - - extern void *__libc_valloc(size_t size); -extern void *__libc_pvalloc(size_t size); - -void *__libc_valloc(size_t size) { return valloc(size); } - -void *__libc_pvalloc(size_t size) { return pvalloc(size); } - -#endif - -#endif - -#if (defined(__GNUC__) || defined(__clang__)) -#pragma GCC visibility pop -#endif +//===------------------------ malloc.c ------------------*- C -*-=============// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This library provides a cross-platform lock free thread caching malloc +// implementation in C11. 
+//
+//
+// This file provides overrides for the standard library malloc entry points
+// for C and new/delete operators for C++. It also provides automatic
+// initialization/finalization of process and threads.
+//
+//===----------------------------------------------------------------------===//
+
+#if defined(__TINYC__)
+#include <sys/types.h>
+#endif
+
+#ifndef ARCH_64BIT
+#if defined(__LLP64__) || defined(__LP64__) || defined(_WIN64)
+#define ARCH_64BIT 1
+_Static_assert(sizeof(size_t) == 8, "Data type size mismatch");
+_Static_assert(sizeof(void *) == 8, "Data type size mismatch");
+#else
+#define ARCH_64BIT 0
+_Static_assert(sizeof(size_t) == 4, "Data type size mismatch");
+_Static_assert(sizeof(void *) == 4, "Data type size mismatch");
+#endif
+#endif
+
+#if (defined(__GNUC__) || defined(__clang__))
+#pragma GCC visibility push(default)
+#endif
+
+#define USE_IMPLEMENT 1
+#define USE_INTERPOSE 0
+#define USE_ALIAS 0
+
+#if defined(__APPLE__)
+#undef USE_INTERPOSE
+#define USE_INTERPOSE 1
+
+typedef struct interpose_t {
+  void *new_func;
+  void *orig_func;
+} interpose_t;
+
+#define MAC_INTERPOSE_PAIR(newf, oldf) {(void *)newf, (void *)oldf}
+#define MAC_INTERPOSE_SINGLE(newf, oldf)                                       \
+  __attribute__((used)) static const interpose_t macinterpose##newf##oldf      \
+      __attribute__((section("__DATA, __interpose"))) =                        \
+          MAC_INTERPOSE_PAIR(newf, oldf)
+
+#endif
+
+#if !defined(_WIN32) && !defined(__APPLE__)
+#undef USE_IMPLEMENT
+#undef USE_ALIAS
+#define USE_IMPLEMENT 0
+#define USE_ALIAS 1
+#endif
+
+#ifdef _MSC_VER
+#pragma warning(disable : 4100)
+#undef malloc
+#undef free
+#undef calloc
+#define RPMALLOC_RESTRICT __declspec(restrict)
+#else
+#define RPMALLOC_RESTRICT
+#endif
+
+#if ENABLE_OVERRIDE
+
+typedef struct rp_nothrow_t {
+  int __dummy;
+} rp_nothrow_t;
+
+#if USE_IMPLEMENT
+
+extern inline RPMALLOC_RESTRICT void *RPMALLOC_CDECL malloc(size_t size) {
+  return rpmalloc(size);
+}
+extern inline RPMALLOC_RESTRICT void *RPMALLOC_CDECL calloc(size_t count,
+                                                            size_t size) {
+  return rpcalloc(count, size);
+}
+extern inline RPMALLOC_RESTRICT void *RPMALLOC_CDECL realloc(void *ptr,
+                                                             size_t size) {
+  return rprealloc(ptr, size);
+}
+extern inline void *RPMALLOC_CDECL reallocf(void *ptr, size_t size) {
+  return rprealloc(ptr, size);
+}
+extern inline void *RPMALLOC_CDECL aligned_alloc(size_t alignment,
+                                                 size_t size) {
+  return rpaligned_alloc(alignment, size);
+}
+extern inline void *RPMALLOC_CDECL memalign(size_t alignment, size_t size) {
+  return rpmemalign(alignment, size);
+}
+extern inline int RPMALLOC_CDECL posix_memalign(void **memptr,
+                                                size_t alignment,
+                                                size_t size) {
+  return rpposix_memalign(memptr, alignment, size);
+}
+extern inline void RPMALLOC_CDECL free(void *ptr) { rpfree(ptr); }
+extern inline void RPMALLOC_CDECL cfree(void *ptr) { rpfree(ptr); }
+extern inline size_t RPMALLOC_CDECL malloc_usable_size(void *ptr) {
+  return rpmalloc_usable_size(ptr);
+}
+extern inline size_t RPMALLOC_CDECL malloc_size(void *ptr) {
+  return rpmalloc_usable_size(ptr);
+}
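
The `USE_IMPLEMENT` path above relies on plain link-time resolution: providing a definition of `malloc`, `free`, and friends in the final link wins over the libc versions, so every call site lands in rpmalloc. A minimal standalone sketch of the same idea, with a deliberately trivial bump allocator standing in for rpmalloc (hypothetical, not part of the patch):

#include <stddef.h>
#include <stdlib.h>

static _Alignas(16) char pool[1 << 20];
static size_t pool_used;

// Toy backend standing in for rpmalloc(); illustration only.
static void *my_malloc(size_t size) {
  size = (size + 15) & ~(size_t)15; // keep 16-byte alignment
  if (pool_used + size > sizeof(pool))
    return NULL;
  void *p = pool + pool_used;
  pool_used += size;
  return p;
}

// Defining malloc/free here overrides libc for the whole program link.
void *malloc(size_t size) { return my_malloc(size); }
void free(void *ptr) { (void)ptr; /* bump allocator never reclaims */ }
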
+
+#ifdef _WIN32
+extern inline RPMALLOC_RESTRICT void *RPMALLOC_CDECL _malloc_base(size_t size) {
+  return rpmalloc(size);
+}
+extern inline void RPMALLOC_CDECL _free_base(void *ptr) { rpfree(ptr); }
+extern inline RPMALLOC_RESTRICT void *RPMALLOC_CDECL _calloc_base(size_t count,
+                                                                  size_t size) {
+  return rpcalloc(count, size);
+}
+extern inline size_t RPMALLOC_CDECL _msize(void *ptr) {
+  return rpmalloc_usable_size(ptr);
+}
+extern inline size_t RPMALLOC_CDECL _msize_base(void *ptr) {
+  return rpmalloc_usable_size(ptr);
+}
+extern inline RPMALLOC_RESTRICT void *RPMALLOC_CDECL
+_realloc_base(void *ptr, size_t size) {
+  return rprealloc(ptr, size);
+}
+#endif
+
+#ifdef _WIN32
+// For Windows, #include <rpnew.h> in one source file to get the C++ operator
+// overrides implemented in your module
+#else
+// Overload the C++ operators using the mangled names
+// (https://itanium-cxx-abi.github.io/cxx-abi/abi.html#mangling)
+// operators delete and delete[]
+#define RPDEFVIS __attribute__((visibility("default")))
+extern void _ZdlPv(void *p);
+void RPDEFVIS _ZdlPv(void *p) { rpfree(p); }
+extern void _ZdaPv(void *p);
+void RPDEFVIS _ZdaPv(void *p) { rpfree(p); }
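
The `_ZdlPv`/`_ZdaPv` definitions above work because, under the Itanium C++ ABI linked in the comment, C++ operators are ordinary symbols with predictable mangled names: `_Znwm` is `operator new(unsigned long)` and `_ZdlPv` is `operator delete(void*)`, so a C translation unit can satisfy C++ `new`/`delete` at link time. MSVC mangles differently, which is why the Windows path defers to a header instead. A reduced sketch, with a hypothetical logging allocator (LP64 Itanium targets only):

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

// _Znwm == operator new(unsigned long), _ZdlPv == operator delete(void*).
// Linking this C file into a C++ program routes `new`/`delete` here.
void *_Znwm(uint64_t size) {
  void *p = malloc(size);
  fprintf(stderr, "operator new(%llu) = %p\n", (unsigned long long)size, p);
  return p; // note: a conforming operator new must throw instead of
            // returning NULL; fine for a sketch
}
void _ZdlPv(void *p) { free(p); }
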
+#if ARCH_64BIT
+// 64-bit operators new and new[], normal and aligned
+extern void *_Znwm(uint64_t size);
+void *RPDEFVIS _Znwm(uint64_t size) { return rpmalloc(size); }
+extern void *_Znam(uint64_t size);
+void *RPDEFVIS _Znam(uint64_t size) { return rpmalloc(size); }
+extern void *_Znwmm(uint64_t size, uint64_t align);
+void *RPDEFVIS _Znwmm(uint64_t size, uint64_t align) {
+  return rpaligned_alloc(align, size);
+}
+extern void *_Znamm(uint64_t size, uint64_t align);
+void *RPDEFVIS _Znamm(uint64_t size, uint64_t align) {
+  return rpaligned_alloc(align, size);
+}
+extern void *_ZnwmSt11align_val_t(uint64_t size, uint64_t align);
+void *RPDEFVIS _ZnwmSt11align_val_t(uint64_t size, uint64_t align) {
+  return rpaligned_alloc(align, size);
+}
+extern void *_ZnamSt11align_val_t(uint64_t size, uint64_t align);
+void *RPDEFVIS _ZnamSt11align_val_t(uint64_t size, uint64_t align) {
+  return rpaligned_alloc(align, size);
+}
+extern void *_ZnwmRKSt9nothrow_t(uint64_t size, rp_nothrow_t t);
+void *RPDEFVIS _ZnwmRKSt9nothrow_t(uint64_t size, rp_nothrow_t t) {
+  (void)sizeof(t);
+  return rpmalloc(size);
+}
+extern void *_ZnamRKSt9nothrow_t(uint64_t size, rp_nothrow_t t);
+void *RPDEFVIS _ZnamRKSt9nothrow_t(uint64_t size, rp_nothrow_t t) {
+  (void)sizeof(t);
+  return rpmalloc(size);
+}
+extern void *_ZnwmSt11align_val_tRKSt9nothrow_t(uint64_t size, uint64_t align,
+                                                rp_nothrow_t t);
+void *RPDEFVIS _ZnwmSt11align_val_tRKSt9nothrow_t(uint64_t size,
+                                                  uint64_t align,
+                                                  rp_nothrow_t t) {
+  (void)sizeof(t);
+  return rpaligned_alloc(align, size);
+}
+extern void *_ZnamSt11align_val_tRKSt9nothrow_t(uint64_t size, uint64_t align,
+                                                rp_nothrow_t t);
+void *RPDEFVIS _ZnamSt11align_val_tRKSt9nothrow_t(uint64_t size,
+                                                  uint64_t align,
+                                                  rp_nothrow_t t) {
+  (void)sizeof(t);
+  return rpaligned_alloc(align, size);
+}
+// 64-bit operators sized delete and delete[], normal and aligned
+extern void _ZdlPvm(void *p, uint64_t size);
+void RPDEFVIS _ZdlPvm(void *p, uint64_t size) {
+  rpfree(p);
+  (void)sizeof(size);
+}
+extern void _ZdaPvm(void *p, uint64_t size);
+void RPDEFVIS _ZdaPvm(void *p, uint64_t size) {
+  rpfree(p);
+  (void)sizeof(size);
+}
+extern void _ZdlPvSt11align_val_t(void *p, uint64_t align);
+void RPDEFVIS _ZdlPvSt11align_val_t(void *p, uint64_t align) {
+  rpfree(p);
+  (void)sizeof(align);
+}
+extern void _ZdaPvSt11align_val_t(void *p, uint64_t align);
+void RPDEFVIS _ZdaPvSt11align_val_t(void *p, uint64_t align) {
+  rpfree(p);
+  (void)sizeof(align);
+}
+extern void _ZdlPvmSt11align_val_t(void *p, uint64_t size, uint64_t align);
+void RPDEFVIS _ZdlPvmSt11align_val_t(void *p, uint64_t size, uint64_t align) {
+  rpfree(p);
+  (void)sizeof(size);
+  (void)sizeof(align);
+}
+extern void _ZdaPvmSt11align_val_t(void *p, uint64_t size, uint64_t align);
+void RPDEFVIS _ZdaPvmSt11align_val_t(void *p, uint64_t size, uint64_t align) {
+  rpfree(p);
+  (void)sizeof(size);
+  (void)sizeof(align);
+}
+#else
+// 32-bit operators new and new[], normal and aligned
+extern void *_Znwj(uint32_t size);
+void *RPDEFVIS _Znwj(uint32_t size) { return rpmalloc(size); }
+extern void *_Znaj(uint32_t size);
+void *RPDEFVIS _Znaj(uint32_t size) { return rpmalloc(size); }
+extern void *_Znwjj(uint32_t size, uint32_t align);
+void *RPDEFVIS _Znwjj(uint32_t size, uint32_t align) {
+  return rpaligned_alloc(align, size);
+}
+extern void *_Znajj(uint32_t size, uint32_t align);
+void *RPDEFVIS _Znajj(uint32_t size, uint32_t align) {
+  return rpaligned_alloc(align, size);
+}
+extern void *_ZnwjSt11align_val_t(size_t size, size_t align);
+void *RPDEFVIS _ZnwjSt11align_val_t(size_t size, size_t align) {
+  return rpaligned_alloc(align, size);
+}
+extern void *_ZnajSt11align_val_t(size_t size, size_t align);
+void *RPDEFVIS _ZnajSt11align_val_t(size_t size, size_t align) {
+  return rpaligned_alloc(align, size);
+}
+extern void *_ZnwjRKSt9nothrow_t(size_t size, rp_nothrow_t t);
+void *RPDEFVIS _ZnwjRKSt9nothrow_t(size_t size, rp_nothrow_t t) {
+  (void)sizeof(t);
+  return rpmalloc(size);
+}
+extern void *_ZnajRKSt9nothrow_t(size_t size, rp_nothrow_t t);
+void *RPDEFVIS _ZnajRKSt9nothrow_t(size_t size, rp_nothrow_t t) {
+  (void)sizeof(t);
+  return rpmalloc(size);
+}
+extern void *_ZnwjSt11align_val_tRKSt9nothrow_t(size_t size, size_t align,
+                                                rp_nothrow_t t);
+void *RPDEFVIS _ZnwjSt11align_val_tRKSt9nothrow_t(size_t size, size_t align,
+                                                  rp_nothrow_t t) {
+  (void)sizeof(t);
+  return rpaligned_alloc(align, size);
+}
+extern void *_ZnajSt11align_val_tRKSt9nothrow_t(size_t size, size_t align,
+                                                rp_nothrow_t t);
+void *RPDEFVIS _ZnajSt11align_val_tRKSt9nothrow_t(size_t size, size_t align,
+                                                  rp_nothrow_t t) {
+  (void)sizeof(t);
+  return rpaligned_alloc(align, size);
+}
+// 32-bit operators sized delete and delete[], normal and aligned
+extern void _ZdlPvj(void *p, uint64_t size);
+void RPDEFVIS _ZdlPvj(void *p, uint64_t size) {
+  rpfree(p);
+  (void)sizeof(size);
+}
+extern void _ZdaPvj(void *p, uint64_t size);
+void RPDEFVIS _ZdaPvj(void *p, uint64_t size) {
+  rpfree(p);
+  (void)sizeof(size);
+}
+extern void _ZdlPvSt11align_val_t(void *p, uint32_t align);
+void RPDEFVIS _ZdlPvSt11align_val_t(void *p, uint32_t align) {
+  rpfree(p);
+  (void)sizeof(align);
+}
+extern void _ZdaPvSt11align_val_t(void *p, uint32_t align);
+void RPDEFVIS _ZdaPvSt11align_val_t(void *p, uint32_t align) {
+  rpfree(p);
+  (void)sizeof(align);
+}
+extern void _ZdlPvjSt11align_val_t(void *p, uint32_t size, uint32_t align);
+void RPDEFVIS _ZdlPvjSt11align_val_t(void *p, uint32_t size, uint32_t align) {
+  rpfree(p);
+  (void)sizeof(size);
+  (void)sizeof(align);
+}
+extern void _ZdaPvjSt11align_val_t(void *p, uint32_t size, uint32_t align);
+void RPDEFVIS _ZdaPvjSt11align_val_t(void *p, uint32_t size, uint32_t align) {
+  rpfree(p);
+  (void)sizeof(size);
+  (void)sizeof(align);
+}
+#endif
+#endif
+#endif
+
+#if USE_INTERPOSE || USE_ALIAS
+
+static void *rpmalloc_nothrow(size_t size, rp_nothrow_t t) {
+  (void)sizeof(t);
+  return rpmalloc(size);
+}
+static void *rpaligned_alloc_reverse(size_t size, size_t align) {
+  return rpaligned_alloc(align, size);
+}
+static void *rpaligned_alloc_reverse_nothrow(size_t size, size_t align,
+                                             rp_nothrow_t t) {
+  (void)sizeof(t);
+  return rpaligned_alloc(align, size);
+}
+static void rpfree_size(void *p, size_t size) {
+  (void)sizeof(size);
+  rpfree(p);
+}
+static void rpfree_aligned(void *p, size_t align) {
+  (void)sizeof(align);
+  rpfree(p);
+}
+static void rpfree_size_aligned(void *p, size_t size, size_t align) {
+  (void)sizeof(size);
+  (void)sizeof(align);
+  rpfree(p);
+}
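
The static helpers above exist because interposed or aliased symbols must match the caller's signature exactly, and aligned `operator new` receives `(size, align)` while `rpaligned_alloc` takes `(align, size)`; hence the `_reverse` trampolines. The `(void)sizeof(x)` idiom marks a parameter as intentionally unused without evaluating it. A self-contained sketch of the argument-swapping adapter, using `posix_memalign` as a stand-in backend:

#include <stdlib.h>

// Backend with rpmalloc-style (align, size) parameter order.
static void *backend_aligned_alloc(size_t align, size_t size) {
  void *p = NULL;
  return (posix_memalign(&p, align, size) == 0) ? p : NULL;
}

// Aligned operator new passes (size, align): swap before forwarding.
static void *aligned_new_adapter(size_t size, size_t align) {
  return backend_aligned_alloc(align, size);
}

int main(void) {
  void *p = aligned_new_adapter(100, 64); // 100 bytes at 64-byte alignment
  free(p);
  return 0;
}
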
+
+#endif
+
+#if USE_INTERPOSE
+
+__attribute__((used)) static const interpose_t macinterpose_malloc[]
+    __attribute__((section("__DATA, __interpose"))) = {
+        // new and new[]
+        MAC_INTERPOSE_PAIR(rpmalloc, _Znwm),
+        MAC_INTERPOSE_PAIR(rpmalloc, _Znam),
+        MAC_INTERPOSE_PAIR(rpaligned_alloc_reverse, _Znwmm),
+        MAC_INTERPOSE_PAIR(rpaligned_alloc_reverse, _Znamm),
+        MAC_INTERPOSE_PAIR(rpmalloc_nothrow, _ZnwmRKSt9nothrow_t),
+        MAC_INTERPOSE_PAIR(rpmalloc_nothrow, _ZnamRKSt9nothrow_t),
+        MAC_INTERPOSE_PAIR(rpaligned_alloc_reverse, _ZnwmSt11align_val_t),
+        MAC_INTERPOSE_PAIR(rpaligned_alloc_reverse, _ZnamSt11align_val_t),
+        MAC_INTERPOSE_PAIR(rpaligned_alloc_reverse_nothrow,
+                           _ZnwmSt11align_val_tRKSt9nothrow_t),
+        MAC_INTERPOSE_PAIR(rpaligned_alloc_reverse_nothrow,
+                           _ZnamSt11align_val_tRKSt9nothrow_t),
+        // delete and delete[]
+        MAC_INTERPOSE_PAIR(rpfree, _ZdlPv),
+        MAC_INTERPOSE_PAIR(rpfree, _ZdaPv),
+        MAC_INTERPOSE_PAIR(rpfree_size, _ZdlPvm),
+        MAC_INTERPOSE_PAIR(rpfree_size, _ZdaPvm),
+        MAC_INTERPOSE_PAIR(rpfree_aligned, _ZdlPvSt11align_val_t),
+        MAC_INTERPOSE_PAIR(rpfree_aligned, _ZdaPvSt11align_val_t),
+        MAC_INTERPOSE_PAIR(rpfree_size_aligned, _ZdlPvmSt11align_val_t),
+        MAC_INTERPOSE_PAIR(rpfree_size_aligned, _ZdaPvmSt11align_val_t),
+        // libc entry points
+        MAC_INTERPOSE_PAIR(rpmalloc, malloc),
+        MAC_INTERPOSE_PAIR(rpmalloc, calloc),
+        MAC_INTERPOSE_PAIR(rprealloc, realloc),
+        MAC_INTERPOSE_PAIR(rprealloc, reallocf),
+#if defined(__MAC_10_15) && __MAC_OS_X_VERSION_MIN_REQUIRED >= __MAC_10_15
+        MAC_INTERPOSE_PAIR(rpaligned_alloc, aligned_alloc),
+#endif
+        MAC_INTERPOSE_PAIR(rpmemalign, memalign),
+        MAC_INTERPOSE_PAIR(rpposix_memalign, posix_memalign),
+        MAC_INTERPOSE_PAIR(rpfree, free),
+        MAC_INTERPOSE_PAIR(rpfree, cfree),
+        MAC_INTERPOSE_PAIR(rpmalloc_usable_size, malloc_usable_size),
+        MAC_INTERPOSE_PAIR(rpmalloc_usable_size, malloc_size)};
+
+#endif
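
The `__DATA, __interpose` section above is dyld's interposition mechanism: each `{replacement, original}` pointer pair placed in that section makes the dynamic linker rebind references to the original symbol in every other loaded image. A minimal standalone example (macOS only; `logging_malloc` is hypothetical), built as a dylib and loaded with `DYLD_INSERT_LIBRARIES`:

#include <stdio.h>
#include <stdlib.h>

static void *logging_malloc(size_t size) {
  void *p = malloc(size); // still the real malloc: dyld does not rewrite
                          // calls made from the interposing image itself
  fprintf(stderr, "malloc(%zu) = %p\n", size, p);
  return p;
}

typedef struct interpose_pair {
  const void *replacement;
  const void *original;
} interpose_pair;

__attribute__((used)) static const interpose_pair interposers[]
    __attribute__((section("__DATA, __interpose"))) = {
        {(const void *)logging_malloc, (const void *)malloc}};
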
+
+#if USE_ALIAS
+
+#define RPALIAS(fn) __attribute__((alias(#fn), used, visibility("default")));
+
+// Alias the C++ operators using the mangled names
+// (https://itanium-cxx-abi.github.io/cxx-abi/abi.html#mangling)
+
+// operators delete and delete[]
+void _ZdlPv(void *p) RPALIAS(rpfree)
+void _ZdaPv(void *p) RPALIAS(rpfree)
+
+#if ARCH_64BIT
+// 64-bit operators new and new[], normal and aligned
+void *_Znwm(uint64_t size) RPMALLOC_ATTRIB_MALLOC RPMALLOC_ATTRIB_ALLOC_SIZE(1)
+    RPALIAS(rpmalloc)
+void *_Znam(uint64_t size) RPMALLOC_ATTRIB_MALLOC RPMALLOC_ATTRIB_ALLOC_SIZE(1)
+    RPALIAS(rpmalloc)
+void *_Znwmm(uint64_t size, uint64_t align) RPALIAS(rpaligned_alloc_reverse)
+void *_Znamm(uint64_t size, uint64_t align) RPALIAS(rpaligned_alloc_reverse)
+void *_ZnwmSt11align_val_t(size_t size, size_t align)
+    RPALIAS(rpaligned_alloc_reverse)
+void *_ZnamSt11align_val_t(size_t size, size_t align)
+    RPALIAS(rpaligned_alloc_reverse)
+void *_ZnwmRKSt9nothrow_t(size_t size, rp_nothrow_t t)
+    RPALIAS(rpmalloc_nothrow)
+void *_ZnamRKSt9nothrow_t(size_t size, rp_nothrow_t t)
+    RPALIAS(rpmalloc_nothrow)
+void *_ZnwmSt11align_val_tRKSt9nothrow_t(size_t size, size_t align,
+                                         rp_nothrow_t t)
+    RPALIAS(rpaligned_alloc_reverse_nothrow)
+void *_ZnamSt11align_val_tRKSt9nothrow_t(size_t size, size_t align,
+                                         rp_nothrow_t t)
+    RPALIAS(rpaligned_alloc_reverse_nothrow)
+// 64-bit operators delete and delete[], sized and aligned
+void _ZdlPvm(void *p, size_t n) RPALIAS(rpfree_size)
+void _ZdaPvm(void *p, size_t n) RPALIAS(rpfree_size)
+void _ZdlPvSt11align_val_t(void *p, size_t a) RPALIAS(rpfree_aligned)
+void _ZdaPvSt11align_val_t(void *p, size_t a) RPALIAS(rpfree_aligned)
+void _ZdlPvmSt11align_val_t(void *p, size_t n, size_t a)
+    RPALIAS(rpfree_size_aligned)
+void _ZdaPvmSt11align_val_t(void *p, size_t n, size_t a)
+    RPALIAS(rpfree_size_aligned)
+#else
+// 32-bit operators new and new[], normal and aligned
+void *_Znwj(uint32_t size) RPMALLOC_ATTRIB_MALLOC RPMALLOC_ATTRIB_ALLOC_SIZE(1)
+    RPALIAS(rpmalloc)
+void *_Znaj(uint32_t size) RPMALLOC_ATTRIB_MALLOC RPMALLOC_ATTRIB_ALLOC_SIZE(1)
+    RPALIAS(rpmalloc)
+void *_Znwjj(uint32_t size, uint32_t align) RPALIAS(rpaligned_alloc_reverse)
+void *_Znajj(uint32_t size, uint32_t align) RPALIAS(rpaligned_alloc_reverse)
+void *_ZnwjSt11align_val_t(size_t size, size_t align)
+    RPALIAS(rpaligned_alloc_reverse)
+void *_ZnajSt11align_val_t(size_t size, size_t align)
+    RPALIAS(rpaligned_alloc_reverse)
+void *_ZnwjRKSt9nothrow_t(size_t size, rp_nothrow_t t)
+    RPALIAS(rpmalloc_nothrow)
+void *_ZnajRKSt9nothrow_t(size_t size, rp_nothrow_t t)
+    RPALIAS(rpmalloc_nothrow)
+void *_ZnwjSt11align_val_tRKSt9nothrow_t(size_t size, size_t align,
+                                         rp_nothrow_t t)
+    RPALIAS(rpaligned_alloc_reverse_nothrow)
+void *_ZnajSt11align_val_tRKSt9nothrow_t(size_t size, size_t align,
+                                         rp_nothrow_t t)
+    RPALIAS(rpaligned_alloc_reverse_nothrow)
+// 32-bit operators delete and delete[], sized and aligned
+void _ZdlPvj(void *p, size_t n) RPALIAS(rpfree_size)
+void _ZdaPvj(void *p, size_t n) RPALIAS(rpfree_size)
+void _ZdlPvSt11align_val_t(void *p, size_t a) RPALIAS(rpfree_aligned)
+void _ZdaPvSt11align_val_t(void *p, size_t a) RPALIAS(rpfree_aligned)
+void _ZdlPvjSt11align_val_t(void *p, size_t n, size_t a)
+    RPALIAS(rpfree_size_aligned)
+void _ZdaPvjSt11align_val_t(void *p, size_t n, size_t a)
+    RPALIAS(rpfree_size_aligned)
+#endif
+
+void *malloc(size_t size) RPMALLOC_ATTRIB_MALLOC RPMALLOC_ATTRIB_ALLOC_SIZE(1)
+    RPALIAS(rpmalloc)
+void *calloc(size_t count, size_t size) RPALIAS(rpcalloc)
+void *realloc(void *ptr, size_t size) RPALIAS(rprealloc)
+void *reallocf(void *ptr, size_t size) RPMALLOC_ATTRIB_MALLOC
+    RPMALLOC_ATTRIB_ALLOC_SIZE(2) RPALIAS(rprealloc)
+void *aligned_alloc(size_t alignment, size_t size) RPALIAS(rpaligned_alloc)
+void *memalign(size_t alignment, size_t size) RPMALLOC_ATTRIB_MALLOC
+    RPMALLOC_ATTRIB_ALLOC_SIZE(2) RPALIAS(rpmemalign)
+int posix_memalign(void **memptr, size_t alignment, size_t size)
+    RPALIAS(rpposix_memalign)
+void free(void *ptr) RPALIAS(rpfree)
+void cfree(void *ptr) RPALIAS(rpfree)
+#if defined(__ANDROID__) || defined(__FreeBSD__)
+size_t malloc_usable_size(const void *ptr) RPALIAS(rpmalloc_usable_size)
+#else
+size_t malloc_usable_size(void *ptr) RPALIAS(rpmalloc_usable_size)
+#endif
+size_t malloc_size(void *ptr) RPALIAS(rpmalloc_usable_size)
+
+#endif
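
`RPALIAS` above uses the GCC/Clang `alias` attribute, which emits the declared name as a second symbol for an existing definition in the same translation unit, so `free` and `rpfree` share one address with no forwarding call. Mach-O has no equivalent, which is why `USE_ALIAS` is enabled only on non-Apple, non-Windows targets earlier in this file. A reduced example:

#include <stdio.h>

static int impl_add(int a, int b) { return a + b; }

// "add" becomes an alias for impl_add in this object file.
int add(int a, int b)
    __attribute__((alias("impl_add"), used, visibility("default")));

int main(void) {
  printf("%d\n", add(2, 3)); // prints 5; no wrapper frame involved
  return 0;
}
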
+
+static inline size_t _rpmalloc_page_size(void) { return _memory_page_size; }
+
+extern void *RPMALLOC_CDECL reallocarray(void *ptr, size_t count, size_t size);
+
+extern void *RPMALLOC_CDECL reallocarray(void *ptr, size_t count,
+                                         size_t size) {
+  size_t total;
+#if ENABLE_VALIDATE_ARGS
+#ifdef _MSC_VER
+  int err = SizeTMult(count, size, &total);
+  if ((err != S_OK) || (total >= MAX_ALLOC_SIZE)) {
+    errno = EINVAL;
+    return 0;
+  }
+#else
+  int err = __builtin_umull_overflow(count, size, &total);
+  if (err || (total >= MAX_ALLOC_SIZE)) {
+    errno = EINVAL;
+    return 0;
+  }
+#endif
+#else
+  total = count * size;
+#endif
+  return realloc(ptr, total);
+}
+
+extern inline void *RPMALLOC_CDECL valloc(size_t size) {
+  get_thread_heap();
+  return rpaligned_alloc(_rpmalloc_page_size(), size);
+}
+
+extern inline void *RPMALLOC_CDECL pvalloc(size_t size) {
+  get_thread_heap();
+  const size_t page_size = _rpmalloc_page_size();
+  const size_t aligned_size = ((size + page_size - 1) / page_size) * page_size;
+#if ENABLE_VALIDATE_ARGS
+  if (aligned_size < size) {
+    errno = EINVAL;
+    return 0;
+  }
+#endif
+  return rpaligned_alloc(_rpmalloc_page_size(), aligned_size);
+}
+
+#endif // ENABLE_OVERRIDE
+
+#if ENABLE_PRELOAD
+
+#ifdef _WIN32
+
+#if defined(BUILD_DYNAMIC_LINK) && BUILD_DYNAMIC_LINK
+
+extern __declspec(dllexport) BOOL WINAPI DllMain(HINSTANCE instance,
+                                                 DWORD reason, LPVOID reserved);
+
+extern __declspec(dllexport) BOOL WINAPI DllMain(HINSTANCE instance,
+                                                 DWORD reason,
+                                                 LPVOID reserved) {
+  (void)sizeof(reserved);
+  (void)sizeof(instance);
+  if (reason == DLL_PROCESS_ATTACH)
+    rpmalloc_initialize();
+  else if (reason == DLL_PROCESS_DETACH)
+    rpmalloc_finalize();
+  else if (reason == DLL_THREAD_ATTACH)
+    rpmalloc_thread_initialize();
+  else if (reason == DLL_THREAD_DETACH)
+    rpmalloc_thread_finalize(1);
+  return TRUE;
+}
+
+// end BUILD_DYNAMIC_LINK
+#else
+
+extern void _global_rpmalloc_init(void) {
+  rpmalloc_set_main_thread();
+  rpmalloc_initialize();
+}
+
+#if defined(__clang__) || defined(__GNUC__)
+
+static void __attribute__((constructor)) initializer(void) {
+  _global_rpmalloc_init();
+}
+
+#elif defined(_MSC_VER)
+
+static int _global_rpmalloc_xib(void) {
+  _global_rpmalloc_init();
+  return 0;
+}
+
+#pragma section(".CRT$XIB", read)
+__declspec(allocate(".CRT$XIB")) void (*_rpmalloc_module_init)(void) =
+    _global_rpmalloc_xib;
+#if defined(_M_IX86) || defined(__i386__)
+#pragma comment(linker, "/include:"                                            \
+                        "__rpmalloc_module_init")
+#else
+#pragma comment(linker, "/include:"                                            \
+                        "_rpmalloc_module_init")
+#endif
+
+#endif
+
+// end !BUILD_DYNAMIC_LINK
+#endif
+
+#else
+
+#include <pthread.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+
+extern void rpmalloc_set_main_thread(void);
+
+static pthread_key_t destructor_key;
+
+static void thread_destructor(void *);
+
+static void __attribute__((constructor)) initializer(void) {
+  rpmalloc_set_main_thread();
+  rpmalloc_initialize();
+  pthread_key_create(&destructor_key, thread_destructor);
+}
+
+static void __attribute__((destructor)) finalizer(void) {
+  rpmalloc_finalize();
+}
+
+typedef struct {
+  void *(*real_start)(void *);
+  void *real_arg;
+} thread_starter_arg;
+
+static void *thread_starter(void *argptr) {
+  thread_starter_arg *arg = argptr;
+  void *(*real_start)(void *) = arg->real_start;
+  void *real_arg = arg->real_arg;
+  rpmalloc_thread_initialize();
+  rpfree(argptr);
+  pthread_setspecific(destructor_key, (void *)1);
+  return (*real_start)(real_arg);
+}
+
+static void thread_destructor(void *value) {
+  (void)sizeof(value);
+  rpmalloc_thread_finalize(1);
+}
+
+#ifdef __APPLE__
+
+static int pthread_create_proxy(pthread_t *thread, const pthread_attr_t *attr,
+                                void *(*start_routine)(void *), void *arg) {
+  rpmalloc_initialize();
+  thread_starter_arg *starter_arg = rpmalloc(sizeof(thread_starter_arg));
+  starter_arg->real_start = start_routine;
+  starter_arg->real_arg = arg;
+  return pthread_create(thread, attr, thread_starter, starter_arg);
+}
+
+MAC_INTERPOSE_SINGLE(pthread_create_proxy, pthread_create);
+
+#else
+
+#include <dlfcn.h>
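
The `pthread_create` replacement that follows is the classic `LD_PRELOAD` pattern: defining the symbol in a preloaded object shadows libc's version, and `dlsym(RTLD_NEXT, ...)` retrieves the next (real) definition further along the link map so the wrapper can forward to it. Generic shape of that pattern, as a hypothetical `puts` tracer:

// Build: cc -shared -fPIC -o libtrace.so trace.c
// Run:   LD_PRELOAD=./libtrace.so ./some_program
#define _GNU_SOURCE
#include <dlfcn.h>
#include <stdio.h>

int puts(const char *s) {
  // Find libc's puts, i.e. the next definition after this library.
  int (*real_puts)(const char *) =
      (int (*)(const char *))dlsym(RTLD_NEXT, "puts");
  fprintf(stderr, "[trace] puts(\"%s\")\n", s);
  return real_puts(s);
}
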
+
+int pthread_create(pthread_t *thread, const pthread_attr_t *attr,
+                   void *(*start_routine)(void *), void *arg) {
+#if defined(__linux__) || defined(__FreeBSD__) || defined(__OpenBSD__) ||      \
+    defined(__NetBSD__) || defined(__DragonFly__) || defined(__APPLE__) ||     \
+    defined(__HAIKU__)
+  char fname[] = "pthread_create";
+#else
+  char fname[] = "_pthread_create";
+#endif
+  void *real_pthread_create = dlsym(RTLD_NEXT, fname);
+  rpmalloc_thread_initialize();
+  thread_starter_arg *starter_arg = rpmalloc(sizeof(thread_starter_arg));
+  starter_arg->real_start = start_routine;
+  starter_arg->real_arg = arg;
+  return (*(int (*)(pthread_t *, const pthread_attr_t *, void *(*)(void *),
+                    void *))real_pthread_create)(thread, attr, thread_starter,
+                                                 starter_arg);
+}
+
+#endif
+
+#endif
+
+#endif
+
+#if ENABLE_OVERRIDE
+
+#if defined(__GLIBC__) && defined(__linux__)
+
+void *__libc_malloc(size_t size) RPMALLOC_ATTRIB_MALLOC
+    RPMALLOC_ATTRIB_ALLOC_SIZE(1) RPALIAS(rpmalloc)
+void *__libc_calloc(size_t count, size_t size) RPMALLOC_ATTRIB_MALLOC
+    RPMALLOC_ATTRIB_ALLOC_SIZE2(1, 2) RPALIAS(rpcalloc)
+void *__libc_realloc(void *p, size_t size) RPMALLOC_ATTRIB_MALLOC
+    RPMALLOC_ATTRIB_ALLOC_SIZE(2) RPALIAS(rprealloc)
+void __libc_free(void *p) RPALIAS(rpfree)
+void __libc_cfree(void *p) RPALIAS(rpfree)
+void *__libc_memalign(size_t align, size_t size) RPMALLOC_ATTRIB_MALLOC
+    RPMALLOC_ATTRIB_ALLOC_SIZE(2) RPALIAS(rpmemalign)
+int __posix_memalign(void **p, size_t align, size_t size)
+    RPALIAS(rpposix_memalign)
+
+extern void *__libc_valloc(size_t size);
+extern void *__libc_pvalloc(size_t size);
+
+void *__libc_valloc(size_t size) { return valloc(size); }
+
+void *__libc_pvalloc(size_t size) { return pvalloc(size); }
+
+#endif
+
+#endif
+
+#if (defined(__GNUC__) || defined(__clang__))
+#pragma GCC visibility pop
+#endif
diff --git a/llvm/lib/Support/rpmalloc/rpmalloc.c b/llvm/lib/Support/rpmalloc/rpmalloc.c
index a06d3cdb5b52ef25e54bf6f595d4d5ab300a26d0..0976ec8ae6af4e94af86cd1ae6118266c22f5678 100644
--- a/llvm/lib/Support/rpmalloc/rpmalloc.c
+++ b/llvm/lib/Support/rpmalloc/rpmalloc.c
@@ -1,3992 +1,3992 @@
-//===---------------------- rpmalloc.c ------------------*- C -*-=============//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This library provides a cross-platform lock free thread caching malloc
-// implementation in C11.
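
The glibc block just above closes a subtle hole: glibc exports internal entry points such as `__libc_malloc` that some code calls directly, so they are aliased to rpmalloc as well; otherwise two heaps would coexist and pointers could cross between them. A sketch of the failure mode this prevents (illustration only, do not run):

#include <stddef.h>
#include <stdlib.h>

extern void *__libc_malloc(size_t size); // glibc-internal entry point

void demo(void) {
  void *p = __libc_malloc(32); // without the alias: glibc's heap...
  free(p);                     // ...freed by the overriding allocator:
                               // undefined behavior across two heaps
}
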
-// -//===----------------------------------------------------------------------===// - -#include "rpmalloc.h" - -//////////// -/// -/// Build time configurable limits -/// -////// - -#if defined(__clang__) -#pragma clang diagnostic ignored "-Wunused-macros" -#pragma clang diagnostic ignored "-Wunused-function" -#if __has_warning("-Wreserved-identifier") -#pragma clang diagnostic ignored "-Wreserved-identifier" -#endif -#if __has_warning("-Wstatic-in-inline") -#pragma clang diagnostic ignored "-Wstatic-in-inline" -#endif -#elif defined(__GNUC__) -#pragma GCC diagnostic ignored "-Wunused-macros" -#pragma GCC diagnostic ignored "-Wunused-function" -#endif - -#if !defined(__has_builtin) -#define __has_builtin(b) 0 -#endif - -#if defined(__GNUC__) || defined(__clang__) - -#if __has_builtin(__builtin_memcpy_inline) -#define _rpmalloc_memcpy_const(x, y, s) __builtin_memcpy_inline(x, y, s) -#else -#define _rpmalloc_memcpy_const(x, y, s) \ - do { \ - _Static_assert(__builtin_choose_expr(__builtin_constant_p(s), 1, 0), \ - "len must be a constant integer"); \ - memcpy(x, y, s); \ - } while (0) -#endif - -#if __has_builtin(__builtin_memset_inline) -#define _rpmalloc_memset_const(x, y, s) __builtin_memset_inline(x, y, s) -#else -#define _rpmalloc_memset_const(x, y, s) \ - do { \ - _Static_assert(__builtin_choose_expr(__builtin_constant_p(s), 1, 0), \ - "len must be a constant integer"); \ - memset(x, y, s); \ - } while (0) -#endif -#else -#define _rpmalloc_memcpy_const(x, y, s) memcpy(x, y, s) -#define _rpmalloc_memset_const(x, y, s) memset(x, y, s) -#endif - -#if __has_builtin(__builtin_assume) -#define rpmalloc_assume(cond) __builtin_assume(cond) -#elif defined(__GNUC__) -#define rpmalloc_assume(cond) \ - do { \ - if (!__builtin_expect(cond, 0)) \ - __builtin_unreachable(); \ - } while (0) -#elif defined(_MSC_VER) -#define rpmalloc_assume(cond) __assume(cond) -#else -#define rpmalloc_assume(cond) 0 -#endif - -#ifndef HEAP_ARRAY_SIZE -//! Size of heap hashmap -#define HEAP_ARRAY_SIZE 47 -#endif -#ifndef ENABLE_THREAD_CACHE -//! Enable per-thread cache -#define ENABLE_THREAD_CACHE 1 -#endif -#ifndef ENABLE_GLOBAL_CACHE -//! Enable global cache shared between all threads, requires thread cache -#define ENABLE_GLOBAL_CACHE 1 -#endif -#ifndef ENABLE_VALIDATE_ARGS -//! Enable validation of args to public entry points -#define ENABLE_VALIDATE_ARGS 0 -#endif -#ifndef ENABLE_STATISTICS -//! Enable statistics collection -#define ENABLE_STATISTICS 0 -#endif -#ifndef ENABLE_ASSERTS -//! Enable asserts -#define ENABLE_ASSERTS 0 -#endif -#ifndef ENABLE_OVERRIDE -//! Override standard library malloc/free and new/delete entry points -#define ENABLE_OVERRIDE 0 -#endif -#ifndef ENABLE_PRELOAD -//! Support preloading -#define ENABLE_PRELOAD 0 -#endif -#ifndef DISABLE_UNMAP -//! Disable unmapping memory pages (also enables unlimited cache) -#define DISABLE_UNMAP 0 -#endif -#ifndef ENABLE_UNLIMITED_CACHE -//! Enable unlimited global cache (no unmapping until finalization) -#define ENABLE_UNLIMITED_CACHE 0 -#endif -#ifndef ENABLE_ADAPTIVE_THREAD_CACHE -//! Enable adaptive thread cache size based on use heuristics -#define ENABLE_ADAPTIVE_THREAD_CACHE 0 -#endif -#ifndef DEFAULT_SPAN_MAP_COUNT -//! Default number of spans to map in call to map more virtual memory (default -//! values yield 4MiB here) -#define DEFAULT_SPAN_MAP_COUNT 64 -#endif -#ifndef GLOBAL_CACHE_MULTIPLIER -//! 
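
The `rpmalloc_assume` macro above maps a condition the caller guarantees onto `__builtin_assume`/`__builtin_unreachable`, letting the optimizer delete impossible branches, and the `_Static_assert` in the `memcpy`/`memset` fallbacks enforces at compile time that the length is a constant. A reduced sketch of the assume pattern, using the more conventional branch-hint polarity:

#include <stddef.h>

#if defined(__GNUC__) || defined(__clang__)
#define my_assume(cond)                                                        \
  do {                                                                         \
    if (!__builtin_expect(!!(cond), 1))                                        \
      __builtin_unreachable();                                                 \
  } while (0)
#else
#define my_assume(cond) ((void)0)
#endif

size_t round_up_16(size_t n) {
  my_assume(n != 0); // caller guarantees non-zero; the zero path vanishes
  return (n + 15) & ~(size_t)15;
}
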
Multiplier for global cache -#define GLOBAL_CACHE_MULTIPLIER 8 -#endif - -#if DISABLE_UNMAP && !ENABLE_GLOBAL_CACHE -#error Must use global cache if unmap is disabled -#endif - -#if DISABLE_UNMAP -#undef ENABLE_UNLIMITED_CACHE -#define ENABLE_UNLIMITED_CACHE 1 -#endif - -#if !ENABLE_GLOBAL_CACHE -#undef ENABLE_UNLIMITED_CACHE -#define ENABLE_UNLIMITED_CACHE 0 -#endif - -#if !ENABLE_THREAD_CACHE -#undef ENABLE_ADAPTIVE_THREAD_CACHE -#define ENABLE_ADAPTIVE_THREAD_CACHE 0 -#endif - -#if defined(_WIN32) || defined(__WIN32__) || defined(_WIN64) -#define PLATFORM_WINDOWS 1 -#define PLATFORM_POSIX 0 -#else -#define PLATFORM_WINDOWS 0 -#define PLATFORM_POSIX 1 -#endif - -/// Platform and arch specifics -#if defined(_MSC_VER) && !defined(__clang__) -#pragma warning(disable : 5105) -#ifndef FORCEINLINE -#define FORCEINLINE inline __forceinline -#endif -#define _Static_assert static_assert -#else -#ifndef FORCEINLINE -#define FORCEINLINE inline __attribute__((__always_inline__)) -#endif -#endif -#if PLATFORM_WINDOWS -#ifndef WIN32_LEAN_AND_MEAN -#define WIN32_LEAN_AND_MEAN -#endif -#include -#if ENABLE_VALIDATE_ARGS -#include -#endif -#else -#include -#include -#include -#include -#if defined(__linux__) || defined(__ANDROID__) -#include -#if !defined(PR_SET_VMA) -#define PR_SET_VMA 0x53564d41 -#define PR_SET_VMA_ANON_NAME 0 -#endif -#endif -#if defined(__APPLE__) -#include -#if !TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR -#include -#include -#endif -#include -#endif -#if defined(__HAIKU__) || defined(__TINYC__) -#include -#endif -#endif - -#include -#include -#include - -#if defined(_WIN32) && (!defined(BUILD_DYNAMIC_LINK) || !BUILD_DYNAMIC_LINK) -#include -static DWORD fls_key; -#endif - -#if PLATFORM_POSIX -#include -#include -#ifdef __FreeBSD__ -#include -#define MAP_HUGETLB MAP_ALIGNED_SUPER -#ifndef PROT_MAX -#define PROT_MAX(f) 0 -#endif -#else -#define PROT_MAX(f) 0 -#endif -#ifdef __sun -extern int madvise(caddr_t, size_t, int); -#endif -#ifndef MAP_UNINITIALIZED -#define MAP_UNINITIALIZED 0 -#endif -#endif -#include - -#if ENABLE_ASSERTS -#undef NDEBUG -#if defined(_MSC_VER) && !defined(_DEBUG) -#define _DEBUG -#endif -#include -#define RPMALLOC_TOSTRING_M(x) #x -#define RPMALLOC_TOSTRING(x) RPMALLOC_TOSTRING_M(x) -#define rpmalloc_assert(truth, message) \ - do { \ - if (!(truth)) { \ - if (_memory_config.error_callback) { \ - _memory_config.error_callback(message " (" RPMALLOC_TOSTRING( \ - truth) ") at " __FILE__ ":" RPMALLOC_TOSTRING(__LINE__)); \ - } else { \ - assert((truth) && message); \ - } \ - } \ - } while (0) -#else -#define rpmalloc_assert(truth, message) \ - do { \ - } while (0) -#endif -#if ENABLE_STATISTICS -#include -#endif - -////// -/// -/// Atomic access abstraction (since MSVC does not do C11 yet) -/// -////// - -#if defined(_MSC_VER) && !defined(__clang__) - -typedef volatile long atomic32_t; -typedef volatile long long atomic64_t; -typedef volatile void *atomicptr_t; - -static FORCEINLINE int32_t atomic_load32(atomic32_t *src) { return *src; } -static FORCEINLINE void atomic_store32(atomic32_t *dst, int32_t val) { - *dst = val; -} -static FORCEINLINE int32_t atomic_incr32(atomic32_t *val) { - return (int32_t)InterlockedIncrement(val); -} -static FORCEINLINE int32_t atomic_decr32(atomic32_t *val) { - return (int32_t)InterlockedDecrement(val); -} -static FORCEINLINE int32_t atomic_add32(atomic32_t *val, int32_t add) { - return (int32_t)InterlockedExchangeAdd(val, add) + add; -} -static FORCEINLINE int atomic_cas32_acquire(atomic32_t *dst, int32_t val, - int32_t ref) { - 
return (InterlockedCompareExchange(dst, val, ref) == ref) ? 1 : 0; -} -static FORCEINLINE void atomic_store32_release(atomic32_t *dst, int32_t val) { - *dst = val; -} -static FORCEINLINE int64_t atomic_load64(atomic64_t *src) { return *src; } -static FORCEINLINE int64_t atomic_add64(atomic64_t *val, int64_t add) { - return (int64_t)InterlockedExchangeAdd64(val, add) + add; -} -static FORCEINLINE void *atomic_load_ptr(atomicptr_t *src) { - return (void *)*src; -} -static FORCEINLINE void atomic_store_ptr(atomicptr_t *dst, void *val) { - *dst = val; -} -static FORCEINLINE void atomic_store_ptr_release(atomicptr_t *dst, void *val) { - *dst = val; -} -static FORCEINLINE void *atomic_exchange_ptr_acquire(atomicptr_t *dst, - void *val) { - return (void *)InterlockedExchangePointer((void *volatile *)dst, val); -} -static FORCEINLINE int atomic_cas_ptr(atomicptr_t *dst, void *val, void *ref) { - return (InterlockedCompareExchangePointer((void *volatile *)dst, val, ref) == - ref) - ? 1 - : 0; -} - -#define EXPECTED(x) (x) -#define UNEXPECTED(x) (x) - -#else - -#include - -typedef volatile _Atomic(int32_t) atomic32_t; -typedef volatile _Atomic(int64_t) atomic64_t; -typedef volatile _Atomic(void *) atomicptr_t; - -static FORCEINLINE int32_t atomic_load32(atomic32_t *src) { - return atomic_load_explicit(src, memory_order_relaxed); -} -static FORCEINLINE void atomic_store32(atomic32_t *dst, int32_t val) { - atomic_store_explicit(dst, val, memory_order_relaxed); -} -static FORCEINLINE int32_t atomic_incr32(atomic32_t *val) { - return atomic_fetch_add_explicit(val, 1, memory_order_relaxed) + 1; -} -static FORCEINLINE int32_t atomic_decr32(atomic32_t *val) { - return atomic_fetch_add_explicit(val, -1, memory_order_relaxed) - 1; -} -static FORCEINLINE int32_t atomic_add32(atomic32_t *val, int32_t add) { - return atomic_fetch_add_explicit(val, add, memory_order_relaxed) + add; -} -static FORCEINLINE int atomic_cas32_acquire(atomic32_t *dst, int32_t val, - int32_t ref) { - return atomic_compare_exchange_weak_explicit( - dst, &ref, val, memory_order_acquire, memory_order_relaxed); -} -static FORCEINLINE void atomic_store32_release(atomic32_t *dst, int32_t val) { - atomic_store_explicit(dst, val, memory_order_release); -} -static FORCEINLINE int64_t atomic_load64(atomic64_t *val) { - return atomic_load_explicit(val, memory_order_relaxed); -} -static FORCEINLINE int64_t atomic_add64(atomic64_t *val, int64_t add) { - return atomic_fetch_add_explicit(val, add, memory_order_relaxed) + add; -} -static FORCEINLINE void *atomic_load_ptr(atomicptr_t *src) { - return atomic_load_explicit(src, memory_order_relaxed); -} -static FORCEINLINE void atomic_store_ptr(atomicptr_t *dst, void *val) { - atomic_store_explicit(dst, val, memory_order_relaxed); -} -static FORCEINLINE void atomic_store_ptr_release(atomicptr_t *dst, void *val) { - atomic_store_explicit(dst, val, memory_order_release); -} -static FORCEINLINE void *atomic_exchange_ptr_acquire(atomicptr_t *dst, - void *val) { - return atomic_exchange_explicit(dst, val, memory_order_acquire); -} -static FORCEINLINE int atomic_cas_ptr(atomicptr_t *dst, void *val, void *ref) { - return atomic_compare_exchange_weak_explicit( - dst, &ref, val, memory_order_relaxed, memory_order_relaxed); -} - -#define EXPECTED(x) __builtin_expect((x), 1) -#define UNEXPECTED(x) __builtin_expect((x), 0) - -#endif - -//////////// -/// -/// Statistics related functions (evaluate to nothing when statistics not -/// enabled) -/// -////// - -#if ENABLE_STATISTICS -#define _rpmalloc_stat_inc(counter) 
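
The `atomic_cas32_acquire`/`atomic_store32_release` pair above is exactly the shape needed for a spinlock, which is how this file guards its global span cache (`atomic32_t lock` further down). A minimal C11 sketch of that usage:

#include <stdatomic.h>

typedef atomic_int spinlock_t;

static void spin_lock(spinlock_t *lock) {
  int expected = 0;
  // Acquire on success orders the critical section after the lock.
  while (!atomic_compare_exchange_weak_explicit(
      lock, &expected, 1, memory_order_acquire, memory_order_relaxed)) {
    expected = 0; // a failed CAS overwrote it with the observed value
  }
}

static void spin_unlock(spinlock_t *lock) {
  // Release publishes the critical section before the lock is dropped.
  atomic_store_explicit(lock, 0, memory_order_release);
}
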
atomic_incr32(counter) -#define _rpmalloc_stat_dec(counter) atomic_decr32(counter) -#define _rpmalloc_stat_add(counter, value) \ - atomic_add32(counter, (int32_t)(value)) -#define _rpmalloc_stat_add64(counter, value) \ - atomic_add64(counter, (int64_t)(value)) -#define _rpmalloc_stat_add_peak(counter, value, peak) \ - do { \ - int32_t _cur_count = atomic_add32(counter, (int32_t)(value)); \ - if (_cur_count > (peak)) \ - peak = _cur_count; \ - } while (0) -#define _rpmalloc_stat_sub(counter, value) \ - atomic_add32(counter, -(int32_t)(value)) -#define _rpmalloc_stat_inc_alloc(heap, class_idx) \ - do { \ - int32_t alloc_current = \ - atomic_incr32(&heap->size_class_use[class_idx].alloc_current); \ - if (alloc_current > heap->size_class_use[class_idx].alloc_peak) \ - heap->size_class_use[class_idx].alloc_peak = alloc_current; \ - atomic_incr32(&heap->size_class_use[class_idx].alloc_total); \ - } while (0) -#define _rpmalloc_stat_inc_free(heap, class_idx) \ - do { \ - atomic_decr32(&heap->size_class_use[class_idx].alloc_current); \ - atomic_incr32(&heap->size_class_use[class_idx].free_total); \ - } while (0) -#else -#define _rpmalloc_stat_inc(counter) \ - do { \ - } while (0) -#define _rpmalloc_stat_dec(counter) \ - do { \ - } while (0) -#define _rpmalloc_stat_add(counter, value) \ - do { \ - } while (0) -#define _rpmalloc_stat_add64(counter, value) \ - do { \ - } while (0) -#define _rpmalloc_stat_add_peak(counter, value, peak) \ - do { \ - } while (0) -#define _rpmalloc_stat_sub(counter, value) \ - do { \ - } while (0) -#define _rpmalloc_stat_inc_alloc(heap, class_idx) \ - do { \ - } while (0) -#define _rpmalloc_stat_inc_free(heap, class_idx) \ - do { \ - } while (0) -#endif - -/// -/// Preconfigured limits and sizes -/// - -//! Granularity of a small allocation block (must be power of two) -#define SMALL_GRANULARITY 16 -//! Small granularity shift count -#define SMALL_GRANULARITY_SHIFT 4 -//! Number of small block size classes -#define SMALL_CLASS_COUNT 65 -//! Maximum size of a small block -#define SMALL_SIZE_LIMIT (SMALL_GRANULARITY * (SMALL_CLASS_COUNT - 1)) -//! Granularity of a medium allocation block -#define MEDIUM_GRANULARITY 512 -//! Medium granularity shift count -#define MEDIUM_GRANULARITY_SHIFT 9 -//! Number of medium block size classes -#define MEDIUM_CLASS_COUNT 61 -//! Total number of small + medium size classes -#define SIZE_CLASS_COUNT (SMALL_CLASS_COUNT + MEDIUM_CLASS_COUNT) -//! Number of large block size classes -#define LARGE_CLASS_COUNT 63 -//! Maximum size of a medium block -#define MEDIUM_SIZE_LIMIT \ - (SMALL_SIZE_LIMIT + (MEDIUM_GRANULARITY * MEDIUM_CLASS_COUNT)) -//! Maximum size of a large block -#define LARGE_SIZE_LIMIT \ - ((LARGE_CLASS_COUNT * _memory_span_size) - SPAN_HEADER_SIZE) -//! Size of a span header (must be a multiple of SMALL_GRANULARITY and a power -//! of two) -#define SPAN_HEADER_SIZE 128 -//! Number of spans in thread cache -#define MAX_THREAD_SPAN_CACHE 400 -//! Number of spans to transfer between thread and global cache -#define THREAD_SPAN_CACHE_TRANSFER 64 -//! Number of spans in thread cache for large spans (must be greater than -//! LARGE_CLASS_COUNT / 2) -#define MAX_THREAD_SPAN_LARGE_CACHE 100 -//! 
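
Worked through, the constants above pin down the size-class geometry: SMALL_SIZE_LIMIT = 16 * (65 - 1) = 1024 bytes, MEDIUM_SIZE_LIMIT = 1024 + 512 * 61 = 32256 bytes, and with the default 64KiB spans LARGE_SIZE_LIMIT = 63 * 65536 - 128 = 4128640 bytes. A sketch of the class-index arithmetic these granularities imply (the real allocator additionally merges classes with identical block counts via `class_idx`):

#include <stdint.h>

#define SMALL_GRANULARITY_SHIFT 4  /* 16-byte steps */
#define MEDIUM_GRANULARITY_SHIFT 9 /* 512-byte steps */
#define SMALL_CLASS_COUNT 65
#define SMALL_SIZE_LIMIT (16 * (SMALL_CLASS_COUNT - 1)) /* 1024 */

static uint32_t size_to_class(size_t size) {
  if (size <= SMALL_SIZE_LIMIT) {
    // 1..16 -> class 1, 17..32 -> class 2, ...
    return (uint32_t)((size + 15) >> SMALL_GRANULARITY_SHIFT);
  }
  // Medium: index past the small classes in 512-byte steps.
  return SMALL_CLASS_COUNT +
         (uint32_t)((size - SMALL_SIZE_LIMIT - 1) >> MEDIUM_GRANULARITY_SHIFT);
}
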
Number of spans to transfer between thread and global cache for large spans -#define THREAD_SPAN_LARGE_CACHE_TRANSFER 6 - -_Static_assert((SMALL_GRANULARITY & (SMALL_GRANULARITY - 1)) == 0, - "Small granularity must be power of two"); -_Static_assert((SPAN_HEADER_SIZE & (SPAN_HEADER_SIZE - 1)) == 0, - "Span header size must be power of two"); - -#if ENABLE_VALIDATE_ARGS -//! Maximum allocation size to avoid integer overflow -#undef MAX_ALLOC_SIZE -#define MAX_ALLOC_SIZE (((size_t) - 1) - _memory_span_size) -#endif - -#define pointer_offset(ptr, ofs) (void *)((char *)(ptr) + (ptrdiff_t)(ofs)) -#define pointer_diff(first, second) \ - (ptrdiff_t)((const char *)(first) - (const char *)(second)) - -#define INVALID_POINTER ((void *)((uintptr_t) - 1)) - -#define SIZE_CLASS_LARGE SIZE_CLASS_COUNT -#define SIZE_CLASS_HUGE ((uint32_t) - 1) - -//////////// -/// -/// Data types -/// -////// - -//! A memory heap, per thread -typedef struct heap_t heap_t; -//! Span of memory pages -typedef struct span_t span_t; -//! Span list -typedef struct span_list_t span_list_t; -//! Span active data -typedef struct span_active_t span_active_t; -//! Size class definition -typedef struct size_class_t size_class_t; -//! Global cache -typedef struct global_cache_t global_cache_t; - -//! Flag indicating span is the first (master) span of a split superspan -#define SPAN_FLAG_MASTER 1U -//! Flag indicating span is a secondary (sub) span of a split superspan -#define SPAN_FLAG_SUBSPAN 2U -//! Flag indicating span has blocks with increased alignment -#define SPAN_FLAG_ALIGNED_BLOCKS 4U -//! Flag indicating an unmapped master span -#define SPAN_FLAG_UNMAPPED_MASTER 8U - -#if ENABLE_ADAPTIVE_THREAD_CACHE || ENABLE_STATISTICS -struct span_use_t { - //! Current number of spans used (actually used, not in cache) - atomic32_t current; - //! High water mark of spans used - atomic32_t high; -#if ENABLE_STATISTICS - //! Number of spans in deferred list - atomic32_t spans_deferred; - //! Number of spans transitioned to global cache - atomic32_t spans_to_global; - //! Number of spans transitioned from global cache - atomic32_t spans_from_global; - //! Number of spans transitioned to thread cache - atomic32_t spans_to_cache; - //! Number of spans transitioned from thread cache - atomic32_t spans_from_cache; - //! Number of spans transitioned to reserved state - atomic32_t spans_to_reserved; - //! Number of spans transitioned from reserved state - atomic32_t spans_from_reserved; - //! Number of raw memory map calls - atomic32_t spans_map_calls; -#endif -}; -typedef struct span_use_t span_use_t; -#endif - -#if ENABLE_STATISTICS -struct size_class_use_t { - //! Current number of allocations - atomic32_t alloc_current; - //! Peak number of allocations - int32_t alloc_peak; - //! Total number of allocations - atomic32_t alloc_total; - //! Total number of frees - atomic32_t free_total; - //! Number of spans in use - atomic32_t spans_current; - //! Number of spans transitioned to cache - int32_t spans_peak; - //! Number of spans transitioned to cache - atomic32_t spans_to_cache; - //! Number of spans transitioned from cache - atomic32_t spans_from_cache; - //! Number of spans transitioned from reserved state - atomic32_t spans_from_reserved; - //! 
Number of spans mapped - atomic32_t spans_map_calls; - int32_t unused; -}; -typedef struct size_class_use_t size_class_use_t; -#endif - -// A span can either represent a single span of memory pages with size declared -// by span_map_count configuration variable, or a set of spans in a continuous -// region, a super span. Any reference to the term "span" usually refers to both -// a single span or a super span. A super span can further be divided into -// multiple spans (or this, super spans), where the first (super)span is the -// master and subsequent (super)spans are subspans. The master span keeps track -// of how many subspans that are still alive and mapped in virtual memory, and -// once all subspans and master have been unmapped the entire superspan region -// is released and unmapped (on Windows for example, the entire superspan range -// has to be released in the same call to release the virtual memory range, but -// individual subranges can be decommitted individually to reduce physical -// memory use). -struct span_t { - //! Free list - void *free_list; - //! Total block count of size class - uint32_t block_count; - //! Size class - uint32_t size_class; - //! Index of last block initialized in free list - uint32_t free_list_limit; - //! Number of used blocks remaining when in partial state - uint32_t used_count; - //! Deferred free list - atomicptr_t free_list_deferred; - //! Size of deferred free list, or list of spans when part of a cache list - uint32_t list_size; - //! Size of a block - uint32_t block_size; - //! Flags and counters - uint32_t flags; - //! Number of spans - uint32_t span_count; - //! Total span counter for master spans - uint32_t total_spans; - //! Offset from master span for subspans - uint32_t offset_from_master; - //! Remaining span counter, for master spans - atomic32_t remaining_spans; - //! Alignment offset - uint32_t align_offset; - //! Owning heap - heap_t *heap; - //! Next span - span_t *next; - //! Previous span - span_t *prev; -}; -_Static_assert(sizeof(span_t) <= SPAN_HEADER_SIZE, "span size mismatch"); - -struct span_cache_t { - size_t count; - span_t *span[MAX_THREAD_SPAN_CACHE]; -}; -typedef struct span_cache_t span_cache_t; - -struct span_large_cache_t { - size_t count; - span_t *span[MAX_THREAD_SPAN_LARGE_CACHE]; -}; -typedef struct span_large_cache_t span_large_cache_t; - -struct heap_size_class_t { - //! Free list of active span - void *free_list; - //! Double linked list of partially used spans with free blocks. - // Previous span pointer in head points to tail span of list. - span_t *partial_span; - //! Early level cache of fully free spans - span_t *cache; -}; -typedef struct heap_size_class_t heap_size_class_t; - -// Control structure for a heap, either a thread heap or a first class heap if -// enabled -struct heap_t { - //! Owning thread ID - uintptr_t owner_thread; - //! Free lists for each size class - heap_size_class_t size_class[SIZE_CLASS_COUNT]; -#if ENABLE_THREAD_CACHE - //! Arrays of fully freed spans, single span - span_cache_t span_cache; -#endif - //! List of deferred free spans (single linked list) - atomicptr_t span_free_deferred; - //! Number of full spans - size_t full_span_count; - //! Mapped but unused spans - span_t *span_reserve; - //! Master span for mapped but unused spans - span_t *span_reserve_master; - //! Number of mapped but unused spans - uint32_t spans_reserved; - //! Child count - atomic32_t child_count; - //! Next heap in id list - heap_t *next_heap; - //! 
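
The master/subspan comment above amounts to a reference count: `remaining_spans` on the master starts at the total span count, each unmapped (sub)span decrements it, and only when it reaches zero is the whole superspan region released back to the OS. A reduced sketch of that accounting, with a hypothetical `os_unmap`:

#include <stdatomic.h>
#include <stdint.h>

typedef struct region_master {
  atomic_int remaining_spans; // initialized to total_spans at map time
  void *base;                 // start of the whole superspan region
  uint64_t total_size;
} region_master;

extern void os_unmap(void *base, uint64_t size); // hypothetical

// Called for each span (master or sub) as it is released.
static void span_release(region_master *master) {
  // The last decrement observes 1 -> 0 and frees the whole region;
  // decommitting individual subranges may have happened earlier.
  if (atomic_fetch_sub_explicit(&master->remaining_spans, 1,
                                memory_order_acq_rel) == 1)
    os_unmap(master->base, master->total_size);
}
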
Next heap in orphan list - heap_t *next_orphan; - //! Heap ID - int32_t id; - //! Finalization state flag - int finalize; - //! Master heap owning the memory pages - heap_t *master_heap; -#if ENABLE_THREAD_CACHE - //! Arrays of fully freed spans, large spans with > 1 span count - span_large_cache_t span_large_cache[LARGE_CLASS_COUNT - 1]; -#endif -#if RPMALLOC_FIRST_CLASS_HEAPS - //! Double linked list of fully utilized spans with free blocks for each size - //! class. - // Previous span pointer in head points to tail span of list. - span_t *full_span[SIZE_CLASS_COUNT]; - //! Double linked list of large and huge spans allocated by this heap - span_t *large_huge_span; -#endif -#if ENABLE_ADAPTIVE_THREAD_CACHE || ENABLE_STATISTICS - //! Current and high water mark of spans used per span count - span_use_t span_use[LARGE_CLASS_COUNT]; -#endif -#if ENABLE_STATISTICS - //! Allocation stats per size class - size_class_use_t size_class_use[SIZE_CLASS_COUNT + 1]; - //! Number of bytes transitioned thread -> global - atomic64_t thread_to_global; - //! Number of bytes transitioned global -> thread - atomic64_t global_to_thread; -#endif -}; - -// Size class for defining a block size bucket -struct size_class_t { - //! Size of blocks in this class - uint32_t block_size; - //! Number of blocks in each chunk - uint16_t block_count; - //! Class index this class is merged with - uint16_t class_idx; -}; -_Static_assert(sizeof(size_class_t) == 8, "Size class size mismatch"); - -struct global_cache_t { - //! Cache lock - atomic32_t lock; - //! Cache count - uint32_t count; -#if ENABLE_STATISTICS - //! Insert count - size_t insert_count; - //! Extract count - size_t extract_count; -#endif - //! Cached spans - span_t *span[GLOBAL_CACHE_MULTIPLIER * MAX_THREAD_SPAN_CACHE]; - //! Unlimited cache overflow - span_t *overflow; -}; - -//////////// -/// -/// Global data -/// -////// - -//! Default span size (64KiB) -#define _memory_default_span_size (64 * 1024) -#define _memory_default_span_size_shift 16 -#define _memory_default_span_mask (~((uintptr_t)(_memory_span_size - 1))) - -//! Initialized flag -static int _rpmalloc_initialized; -//! Main thread ID -static uintptr_t _rpmalloc_main_thread_id; -//! Configuration -static rpmalloc_config_t _memory_config; -//! Memory page size -static size_t _memory_page_size; -//! Shift to divide by page size -static size_t _memory_page_size_shift; -//! Granularity at which memory pages are mapped by OS -static size_t _memory_map_granularity; -#if RPMALLOC_CONFIGURABLE -//! Size of a span of memory pages -static size_t _memory_span_size; -//! Shift to divide by span size -static size_t _memory_span_size_shift; -//! Mask to get to start of a memory span -static uintptr_t _memory_span_mask; -#else -//! Hardwired span size -#define _memory_span_size _memory_default_span_size -#define _memory_span_size_shift _memory_default_span_size_shift -#define _memory_span_mask _memory_default_span_mask -#endif -//! Number of spans to map in each map call -static size_t _memory_span_map_count; -//! Number of spans to keep reserved in each heap -static size_t _memory_heap_reserve_count; -//! Global size classes -static size_class_t _memory_size_class[SIZE_CLASS_COUNT]; -//! Run-time size limit of medium blocks -static size_t _memory_medium_size_limit; -//! Heap ID counter -static atomic32_t _memory_heap_id; -//! Huge page support -static int _memory_huge_pages; -#if ENABLE_GLOBAL_CACHE -//! Global span cache -static global_cache_t _memory_span_cache[LARGE_CLASS_COUNT]; -#endif -//! 
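
`_memory_default_span_mask` above encodes the core deallocation trick: spans are mapped at span-size-aligned addresses (64KiB by default), so masking off the low bits of any block pointer yields the owning span's header, which carries the size class and owning heap. Sketch, assuming the same 64KiB alignment:

#include <stdint.h>

#define SPAN_SIZE (64 * 1024) /* must match the mapping alignment */
#define SPAN_MASK (~((uintptr_t)(SPAN_SIZE - 1)))

typedef struct span {
  uint32_t size_class; /* ... more header fields ... */
} span;

// Any pointer inside a span maps back to its header with one AND.
static span *span_of(void *p) { return (span *)((uintptr_t)p & SPAN_MASK); }
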
Global reserved spans -static span_t *_memory_global_reserve; -//! Global reserved count -static size_t _memory_global_reserve_count; -//! Global reserved master -static span_t *_memory_global_reserve_master; -//! All heaps -static heap_t *_memory_heaps[HEAP_ARRAY_SIZE]; -//! Used to restrict access to mapping memory for huge pages -static atomic32_t _memory_global_lock; -//! Orphaned heaps -static heap_t *_memory_orphan_heaps; -#if RPMALLOC_FIRST_CLASS_HEAPS -//! Orphaned heaps (first class heaps) -static heap_t *_memory_first_class_orphan_heaps; -#endif -#if ENABLE_STATISTICS -//! Allocations counter -static atomic64_t _allocation_counter; -//! Deallocations counter -static atomic64_t _deallocation_counter; -//! Active heap count -static atomic32_t _memory_active_heaps; -//! Number of currently mapped memory pages -static atomic32_t _mapped_pages; -//! Peak number of concurrently mapped memory pages -static int32_t _mapped_pages_peak; -//! Number of mapped master spans -static atomic32_t _master_spans; -//! Number of unmapped dangling master spans -static atomic32_t _unmapped_master_spans; -//! Running counter of total number of mapped memory pages since start -static atomic32_t _mapped_total; -//! Running counter of total number of unmapped memory pages since start -static atomic32_t _unmapped_total; -//! Number of currently mapped memory pages in OS calls -static atomic32_t _mapped_pages_os; -//! Number of currently allocated pages in huge allocations -static atomic32_t _huge_pages_current; -//! Peak number of currently allocated pages in huge allocations -static int32_t _huge_pages_peak; -#endif - -//////////// -/// -/// Thread local heap and ID -/// -////// - -//! Current thread heap -#if ((defined(__APPLE__) || defined(__HAIKU__)) && ENABLE_PRELOAD) || \ - defined(__TINYC__) -static pthread_key_t _memory_thread_heap; -#else -#ifdef _MSC_VER -#define _Thread_local __declspec(thread) -#define TLS_MODEL -#else -#ifndef __HAIKU__ -#define TLS_MODEL __attribute__((tls_model("initial-exec"))) -#else -#define TLS_MODEL -#endif -#if !defined(__clang__) && defined(__GNUC__) -#define _Thread_local __thread -#endif -#endif -static _Thread_local heap_t *_memory_thread_heap TLS_MODEL; -#endif - -static inline heap_t *get_thread_heap_raw(void) { -#if (defined(__APPLE__) || defined(__HAIKU__)) && ENABLE_PRELOAD - return pthread_getspecific(_memory_thread_heap); -#else - return _memory_thread_heap; -#endif -} - -//! Get the current thread heap -static inline heap_t *get_thread_heap(void) { - heap_t *heap = get_thread_heap_raw(); -#if ENABLE_PRELOAD - if (EXPECTED(heap != 0)) - return heap; - rpmalloc_initialize(); - return get_thread_heap_raw(); -#else - return heap; -#endif -} - -//! 
Fast thread ID -static inline uintptr_t get_thread_id(void) { -#if defined(_WIN32) - return (uintptr_t)((void *)NtCurrentTeb()); -#elif (defined(__GNUC__) || defined(__clang__)) && !defined(__CYGWIN__) - uintptr_t tid; -#if defined(__i386__) - __asm__("movl %%gs:0, %0" : "=r"(tid) : :); -#elif defined(__x86_64__) -#if defined(__MACH__) - __asm__("movq %%gs:0, %0" : "=r"(tid) : :); -#else - __asm__("movq %%fs:0, %0" : "=r"(tid) : :); -#endif -#elif defined(__arm__) - __asm__ volatile("mrc p15, 0, %0, c13, c0, 3" : "=r"(tid)); -#elif defined(__aarch64__) -#if defined(__MACH__) - // tpidr_el0 likely unused, always return 0 on iOS - __asm__ volatile("mrs %0, tpidrro_el0" : "=r"(tid)); -#else - __asm__ volatile("mrs %0, tpidr_el0" : "=r"(tid)); -#endif -#else -#error This platform needs implementation of get_thread_id() -#endif - return tid; -#else -#error This platform needs implementation of get_thread_id() -#endif -} - -//! Set the current thread heap -static void set_thread_heap(heap_t *heap) { -#if ((defined(__APPLE__) || defined(__HAIKU__)) && ENABLE_PRELOAD) || \ - defined(__TINYC__) - pthread_setspecific(_memory_thread_heap, heap); -#else - _memory_thread_heap = heap; -#endif - if (heap) - heap->owner_thread = get_thread_id(); -} - -//! Set main thread ID -extern void rpmalloc_set_main_thread(void); - -void rpmalloc_set_main_thread(void) { - _rpmalloc_main_thread_id = get_thread_id(); -} - -static void _rpmalloc_spin(void) { -#if defined(_MSC_VER) -#if defined(_M_ARM64) - __yield(); -#else - _mm_pause(); -#endif -#elif defined(__x86_64__) || defined(__i386__) - __asm__ volatile("pause" ::: "memory"); -#elif defined(__aarch64__) || (defined(__arm__) && __ARM_ARCH >= 7) - __asm__ volatile("yield" ::: "memory"); -#elif defined(__powerpc__) || defined(__powerpc64__) - // No idea if this has ever been compiled for such archs but ... as a precaution - __asm__ volatile("or 27,27,27"); -#elif defined(__sparc__) - __asm__ volatile("rd %ccr, %g0 \n\trd %ccr, %g0 \n\trd %ccr, %g0"); -#else - struct timespec ts = {0}; - nanosleep(&ts, 0); -#endif -} - -#if defined(_WIN32) && (!defined(BUILD_DYNAMIC_LINK) || !BUILD_DYNAMIC_LINK) -static void NTAPI _rpmalloc_thread_destructor(void *value) { -#if ENABLE_OVERRIDE - // If this is called on the main thread it means rpmalloc_finalize - // has not been called and shutdown is forced (through _exit) or unclean - if (get_thread_id() == _rpmalloc_main_thread_id) - return; -#endif - if (value) - rpmalloc_thread_finalize(1); -} -#endif - -//////////// -/// -/// Low level memory map/unmap -/// -////// - -static void _rpmalloc_set_name(void *address, size_t size) { -#if defined(__linux__) || defined(__ANDROID__) - const char *name = _memory_huge_pages ? _memory_config.huge_page_name - : _memory_config.page_name; - if (address == MAP_FAILED || !name) - return; - // If the kernel does not support CONFIG_ANON_VMA_NAME or if the call fails - // (e.g. invalid name) it is basically a no-op. - (void)prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME, (uintptr_t)address, size, - (uintptr_t)name); -#else - (void)sizeof(size); - (void)sizeof(address); -#endif -} - -//!
Map more virtual memory -// size is number of bytes to map -// offset receives the offset in bytes from start of mapped region -// returns address to start of mapped region to use -static void *_rpmalloc_mmap(size_t size, size_t *offset) { - rpmalloc_assert(!(size % _memory_page_size), "Invalid mmap size"); - rpmalloc_assert(size >= _memory_page_size, "Invalid mmap size"); - void *address = _memory_config.memory_map(size, offset); - if (EXPECTED(address != 0)) { - _rpmalloc_stat_add_peak(&_mapped_pages, (size >> _memory_page_size_shift), - _mapped_pages_peak); - _rpmalloc_stat_add(&_mapped_total, (size >> _memory_page_size_shift)); - } - return address; -} - -//! Unmap virtual memory -// address is the memory address to unmap, as returned from _memory_map -// size is the number of bytes to unmap, which might be less than the full -// region for a partial unmap -// offset is the offset in bytes to the actual mapped region, as set by -// _memory_map -// release is set to 0 for a partial unmap, or to the size of the entire range -// for a full unmap -static void _rpmalloc_unmap(void *address, size_t size, size_t offset, - size_t release) { - rpmalloc_assert(!release || (release >= size), "Invalid unmap size"); - rpmalloc_assert(!release || (release >= _memory_page_size), - "Invalid unmap size"); - if (release) { - rpmalloc_assert(!(release % _memory_page_size), "Invalid unmap size"); - _rpmalloc_stat_sub(&_mapped_pages, (release >> _memory_page_size_shift)); - _rpmalloc_stat_add(&_unmapped_total, (release >> _memory_page_size_shift)); - } - _memory_config.memory_unmap(address, size, offset, release); -} - -//! Default implementation to map new pages to virtual memory -static void *_rpmalloc_mmap_os(size_t size, size_t *offset) { - // Either size is a heap (a single page) or a (multiple) span - we only need - // to align spans, and only if larger than map granularity - size_t padding = ((size >= _memory_span_size) && - (_memory_span_size > _memory_map_granularity)) - ? _memory_span_size - : 0; - rpmalloc_assert(size >= _memory_page_size, "Invalid mmap size"); -#if PLATFORM_WINDOWS - // Ok to MEM_COMMIT - according to MSDN, "actual physical pages are not - // allocated unless/until the virtual addresses are actually accessed" - void *ptr = VirtualAlloc(0, size + padding, - (_memory_huge_pages ? MEM_LARGE_PAGES : 0) | - MEM_RESERVE | MEM_COMMIT, - PAGE_READWRITE); - if (!ptr) { - if (_memory_config.map_fail_callback) { - if (_memory_config.map_fail_callback(size + padding)) - return _rpmalloc_mmap_os(size, offset); - } else { - rpmalloc_assert(ptr, "Failed to map virtual memory block"); - } - return 0; - } -#else - int flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_UNINITIALIZED; -#if defined(__APPLE__) && !TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR - int fd = (int)VM_MAKE_TAG(240U); - if (_memory_huge_pages) - fd |= VM_FLAGS_SUPERPAGE_SIZE_2MB; - void *ptr = mmap(0, size + padding, PROT_READ | PROT_WRITE, flags, fd, 0); -#elif defined(MAP_HUGETLB) - void *ptr = mmap(0, size + padding, - PROT_READ | PROT_WRITE | PROT_MAX(PROT_READ | PROT_WRITE), - (_memory_huge_pages ?
MAP_HUGETLB : 0) | flags, -1, 0); -#if defined(MADV_HUGEPAGE) - // In some configurations, huge page allocations might fail, so - // we fall back to normal allocations and promote the region to a transparent - // huge page - if ((ptr == MAP_FAILED || !ptr) && _memory_huge_pages) { - ptr = mmap(0, size + padding, PROT_READ | PROT_WRITE, flags, -1, 0); - if (ptr && ptr != MAP_FAILED) { - int prm = madvise(ptr, size + padding, MADV_HUGEPAGE); - (void)prm; - rpmalloc_assert((prm == 0), "Failed to promote the page to THP"); - } - } -#endif - _rpmalloc_set_name(ptr, size + padding); -#elif defined(MAP_ALIGNED) - const size_t align = - (sizeof(size_t) * 8) - (size_t)(__builtin_clzl(size - 1)); - void *ptr = - mmap(0, size + padding, PROT_READ | PROT_WRITE, - (_memory_huge_pages ? MAP_ALIGNED(align) : 0) | flags, -1, 0); -#elif defined(MAP_ALIGN) - caddr_t base = (_memory_huge_pages ? (caddr_t)(4 << 20) : 0); - void *ptr = mmap(base, size + padding, PROT_READ | PROT_WRITE, - (_memory_huge_pages ? MAP_ALIGN : 0) | flags, -1, 0); -#else - void *ptr = mmap(0, size + padding, PROT_READ | PROT_WRITE, flags, -1, 0); -#endif - if ((ptr == MAP_FAILED) || !ptr) { - if (_memory_config.map_fail_callback) { - if (_memory_config.map_fail_callback(size + padding)) - return _rpmalloc_mmap_os(size, offset); - } else if (errno != ENOMEM) { - rpmalloc_assert((ptr != MAP_FAILED) && ptr, - "Failed to map virtual memory block"); - } - return 0; - } -#endif - _rpmalloc_stat_add(&_mapped_pages_os, - (int32_t)((size + padding) >> _memory_page_size_shift)); - if (padding) { - size_t final_padding = padding - ((uintptr_t)ptr & ~_memory_span_mask); - rpmalloc_assert(final_padding <= _memory_span_size, - "Internal failure in padding"); - rpmalloc_assert(final_padding <= padding, "Internal failure in padding"); - rpmalloc_assert(!(final_padding % 8), "Internal failure in padding"); - ptr = pointer_offset(ptr, final_padding); - *offset = final_padding >> 3; - } - rpmalloc_assert((size < _memory_span_size) || - !((uintptr_t)ptr & ~_memory_span_mask), - "Internal failure in padding"); - return ptr; -} - -//! Default implementation to unmap pages from virtual memory -static void _rpmalloc_unmap_os(void *address, size_t size, size_t offset, - size_t release) { - rpmalloc_assert(release || (offset == 0), "Invalid unmap size"); - rpmalloc_assert(!release || (release >= _memory_page_size), - "Invalid unmap size"); - rpmalloc_assert(size >= _memory_page_size, "Invalid unmap size"); - if (release && offset) { - offset <<= 3; - address = pointer_offset(address, -(int32_t)offset); - if ((release >= _memory_span_size) && - (_memory_span_size > _memory_map_granularity)) { - // Padding is always one span size - release += _memory_span_size; - } - } -#if !DISABLE_UNMAP -#if PLATFORM_WINDOWS - if (!VirtualFree(address, release ? 0 : size, - release ?
MEM_RELEASE : MEM_DECOMMIT)) { - rpmalloc_assert(0, "Failed to unmap virtual memory block"); - } -#else - if (release) { - if (munmap(address, release)) { - rpmalloc_assert(0, "Failed to unmap virtual memory block"); - } - } else { -#if defined(MADV_FREE_REUSABLE) - int ret; - while ((ret = madvise(address, size, MADV_FREE_REUSABLE)) == -1 && - (errno == EAGAIN)) - errno = 0; - if ((ret == -1) && (errno != 0)) { -#elif defined(MADV_DONTNEED) - if (madvise(address, size, MADV_DONTNEED)) { -#elif defined(MADV_PAGEOUT) - if (madvise(address, size, MADV_PAGEOUT)) { -#elif defined(MADV_FREE) - if (madvise(address, size, MADV_FREE)) { -#else - if (posix_madvise(address, size, POSIX_MADV_DONTNEED)) { -#endif - rpmalloc_assert(0, "Failed to madvise virtual memory block as free"); - } - } -#endif -#endif - if (release) - _rpmalloc_stat_sub(&_mapped_pages_os, release >> _memory_page_size_shift); -} - -static void _rpmalloc_span_mark_as_subspan_unless_master(span_t *master, - span_t *subspan, - size_t span_count); - -//! Use global reserved spans to fulfill a memory map request (reserve size must -//! be checked by caller) -static span_t *_rpmalloc_global_get_reserved_spans(size_t span_count) { - span_t *span = _memory_global_reserve; - _rpmalloc_span_mark_as_subspan_unless_master(_memory_global_reserve_master, - span, span_count); - _memory_global_reserve_count -= span_count; - if (_memory_global_reserve_count) - _memory_global_reserve = - (span_t *)pointer_offset(span, span_count << _memory_span_size_shift); - else - _memory_global_reserve = 0; - return span; -} - -//! Store the given spans as global reserve (must only be called from within new -//! heap allocation, not thread safe) -static void _rpmalloc_global_set_reserved_spans(span_t *master, span_t *reserve, - size_t reserve_span_count) { - _memory_global_reserve_master = master; - _memory_global_reserve_count = reserve_span_count; - _memory_global_reserve = reserve; -} - -//////////// -/// -/// Span linked list management -/// -////// - -//! Add a span to double linked list at the head -static void _rpmalloc_span_double_link_list_add(span_t **head, span_t *span) { - if (*head) - (*head)->prev = span; - span->next = *head; - *head = span; -} - -//! Pop head span from double linked list -static void _rpmalloc_span_double_link_list_pop_head(span_t **head, - span_t *span) { - rpmalloc_assert(*head == span, "Linked list corrupted"); - span = *head; - *head = span->next; -} - -//! Remove a span from double linked list -static void _rpmalloc_span_double_link_list_remove(span_t **head, - span_t *span) { - rpmalloc_assert(*head, "Linked list corrupted"); - if (*head == span) { - *head = span->next; - } else { - span_t *next_span = span->next; - span_t *prev_span = span->prev; - prev_span->next = next_span; - if (EXPECTED(next_span != 0)) - next_span->prev = prev_span; - } -} - -//////////// -/// -/// Span control -/// -////// - -static void _rpmalloc_heap_cache_insert(heap_t *heap, span_t *span); - -static void _rpmalloc_heap_finalize(heap_t *heap); - -static void _rpmalloc_heap_set_reserved_spans(heap_t *heap, span_t *master, - span_t *reserve, - size_t reserve_span_count); - -//! Declare the span to be a subspan and store distance from master span and -//! 
span count -static void _rpmalloc_span_mark_as_subspan_unless_master(span_t *master, - span_t *subspan, - size_t span_count) { - rpmalloc_assert((subspan != master) || (subspan->flags & SPAN_FLAG_MASTER), - "Span master pointer and/or flag mismatch"); - if (subspan != master) { - subspan->flags = SPAN_FLAG_SUBSPAN; - subspan->offset_from_master = - (uint32_t)((uintptr_t)pointer_diff(subspan, master) >> - _memory_span_size_shift); - subspan->align_offset = 0; - } - subspan->span_count = (uint32_t)span_count; -} - -//! Use reserved spans to fulfill a memory map request (reserve size must be -//! checked by caller) -static span_t *_rpmalloc_span_map_from_reserve(heap_t *heap, - size_t span_count) { - // Update the heap span reserve - span_t *span = heap->span_reserve; - heap->span_reserve = - (span_t *)pointer_offset(span, span_count * _memory_span_size); - heap->spans_reserved -= (uint32_t)span_count; - - _rpmalloc_span_mark_as_subspan_unless_master(heap->span_reserve_master, span, - span_count); - if (span_count <= LARGE_CLASS_COUNT) - _rpmalloc_stat_inc(&heap->span_use[span_count - 1].spans_from_reserved); - - return span; -} - -//! Get the aligned number of spans to map in based on wanted count, configured -//! mapping granularity and the page size -static size_t _rpmalloc_span_align_count(size_t span_count) { - size_t request_count = (span_count > _memory_span_map_count) - ? span_count - : _memory_span_map_count; - if ((_memory_page_size > _memory_span_size) && - ((request_count * _memory_span_size) % _memory_page_size)) - request_count += - _memory_span_map_count - (request_count % _memory_span_map_count); - return request_count; -} - -//! Setup a newly mapped span -static void _rpmalloc_span_initialize(span_t *span, size_t total_span_count, - size_t span_count, size_t align_offset) { - span->total_spans = (uint32_t)total_span_count; - span->span_count = (uint32_t)span_count; - span->align_offset = (uint32_t)align_offset; - span->flags = SPAN_FLAG_MASTER; - atomic_store32(&span->remaining_spans, (int32_t)total_span_count); -} - -static void _rpmalloc_span_unmap(span_t *span); - -//! Map an aligned set of spans, taking configured mapping granularity and the -//! page size into account -static span_t *_rpmalloc_span_map_aligned_count(heap_t *heap, - size_t span_count) { - // If we already have some, but not enough, reserved spans, release those to - // heap cache and map a new full set of spans. 
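/* A minimal standalone sketch (ours, not rpmalloc code) of the subspan
   bookkeeping implemented by _rpmalloc_span_mark_as_subspan_unless_master
   above: offset_from_master records the distance to the master span in span
   units, and _rpmalloc_span_unmap later shifts it back to recover the master
   address. Assumes the default 64KiB span size (shift 16) defined earlier;
   the main() wrapper and the addresses are hypothetical. */
#include <assert.h>
#include <stdint.h>

int main(void) {
  const uintptr_t span_size_shift = 16;  /* 64KiB spans */
  uintptr_t master = (uintptr_t)1 << 30; /* hypothetical master span address */
  uintptr_t subspan =
      master + ((uintptr_t)3 << span_size_shift); /* third span after it */
  /* What the marking step records in the subspan header: */
  uint32_t offset_from_master =
      (uint32_t)((subspan - master) >> span_size_shift);
  /* What the unmap path computes to get back to the master: */
  uintptr_t recovered =
      subspan - ((uintptr_t)offset_from_master << span_size_shift);
  assert(offset_from_master == 3 && recovered == master);
  return 0;
}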
Otherwise we would waste memory - // if page size > span size (huge pages) - size_t aligned_span_count = _rpmalloc_span_align_count(span_count); - size_t align_offset = 0; - span_t *span = (span_t *)_rpmalloc_mmap( - aligned_span_count * _memory_span_size, &align_offset); - if (!span) - return 0; - _rpmalloc_span_initialize(span, aligned_span_count, span_count, align_offset); - _rpmalloc_stat_inc(&_master_spans); - if (span_count <= LARGE_CLASS_COUNT) - _rpmalloc_stat_inc(&heap->span_use[span_count - 1].spans_map_calls); - if (aligned_span_count > span_count) { - span_t *reserved_spans = - (span_t *)pointer_offset(span, span_count * _memory_span_size); - size_t reserved_count = aligned_span_count - span_count; - if (heap->spans_reserved) { - _rpmalloc_span_mark_as_subspan_unless_master( - heap->span_reserve_master, heap->span_reserve, heap->spans_reserved); - _rpmalloc_heap_cache_insert(heap, heap->span_reserve); - } - if (reserved_count > _memory_heap_reserve_count) { - // If huge pages or eager span map count, the global reserve spin lock is - // held by caller, _rpmalloc_span_map - rpmalloc_assert(atomic_load32(&_memory_global_lock) == 1, - "Global spin lock not held as expected"); - size_t remain_count = reserved_count - _memory_heap_reserve_count; - reserved_count = _memory_heap_reserve_count; - span_t *remain_span = (span_t *)pointer_offset( - reserved_spans, reserved_count * _memory_span_size); - if (_memory_global_reserve) { - _rpmalloc_span_mark_as_subspan_unless_master( - _memory_global_reserve_master, _memory_global_reserve, - _memory_global_reserve_count); - _rpmalloc_span_unmap(_memory_global_reserve); - } - _rpmalloc_global_set_reserved_spans(span, remain_span, remain_count); - } - _rpmalloc_heap_set_reserved_spans(heap, span, reserved_spans, - reserved_count); - } - return span; -} - -//! Map in memory pages for the given number of spans (or use previously -//! reserved pages) -static span_t *_rpmalloc_span_map(heap_t *heap, size_t span_count) { - if (span_count <= heap->spans_reserved) - return _rpmalloc_span_map_from_reserve(heap, span_count); - span_t *span = 0; - int use_global_reserve = - (_memory_page_size > _memory_span_size) || - (_memory_span_map_count > _memory_heap_reserve_count); - if (use_global_reserve) { - // If huge pages, make sure only one thread maps more memory to avoid bloat - while (!atomic_cas32_acquire(&_memory_global_lock, 1, 0)) - _rpmalloc_spin(); - if (_memory_global_reserve_count >= span_count) { - size_t reserve_count = - (!heap->spans_reserved ? _memory_heap_reserve_count : span_count); - if (_memory_global_reserve_count < reserve_count) - reserve_count = _memory_global_reserve_count; - span = _rpmalloc_global_get_reserved_spans(reserve_count); - if (span) { - if (reserve_count > span_count) { - span_t *reserved_span = (span_t *)pointer_offset( - span, span_count << _memory_span_size_shift); - _rpmalloc_heap_set_reserved_spans(heap, _memory_global_reserve_master, - reserved_span, - reserve_count - span_count); - } - // Already marked as subspan in _rpmalloc_global_get_reserved_spans - span->span_count = (uint32_t)span_count; - } - } - } - if (!span) - span = _rpmalloc_span_map_aligned_count(heap, span_count); - if (use_global_reserve) - atomic_store32_release(&_memory_global_lock, 0); - return span; -} - -//! Unmap memory pages for the given number of spans (or mark as unused if no -//!
partial unmappings) -static void _rpmalloc_span_unmap(span_t *span) { - rpmalloc_assert((span->flags & SPAN_FLAG_MASTER) || - (span->flags & SPAN_FLAG_SUBSPAN), - "Span flag corrupted"); - rpmalloc_assert(!(span->flags & SPAN_FLAG_MASTER) || - !(span->flags & SPAN_FLAG_SUBSPAN), - "Span flag corrupted"); - - int is_master = !!(span->flags & SPAN_FLAG_MASTER); - span_t *master = - is_master ? span - : ((span_t *)pointer_offset( - span, -(intptr_t)((uintptr_t)span->offset_from_master * - _memory_span_size))); - rpmalloc_assert(is_master || (span->flags & SPAN_FLAG_SUBSPAN), - "Span flag corrupted"); - rpmalloc_assert(master->flags & SPAN_FLAG_MASTER, "Span flag corrupted"); - - size_t span_count = span->span_count; - if (!is_master) { - // Directly unmap subspans (unless huge pages, in which case we defer and - // unmap entire page range with master) - rpmalloc_assert(span->align_offset == 0, "Span align offset corrupted"); - if (_memory_span_size >= _memory_page_size) - _rpmalloc_unmap(span, span_count * _memory_span_size, 0, 0); - } else { - // Special double flag to denote an unmapped master - // It must be kept in memory since span header must be used - span->flags |= - SPAN_FLAG_MASTER | SPAN_FLAG_SUBSPAN | SPAN_FLAG_UNMAPPED_MASTER; - _rpmalloc_stat_add(&_unmapped_master_spans, 1); - } - - if (atomic_add32(&master->remaining_spans, -(int32_t)span_count) <= 0) { - // Everything unmapped, unmap the master span with release flag to unmap the - // entire range of the super span - rpmalloc_assert(!!(master->flags & SPAN_FLAG_MASTER) && - !!(master->flags & SPAN_FLAG_SUBSPAN), - "Span flag corrupted"); - size_t unmap_count = master->span_count; - if (_memory_span_size < _memory_page_size) - unmap_count = master->total_spans; - _rpmalloc_stat_sub(&_master_spans, 1); - _rpmalloc_stat_sub(&_unmapped_master_spans, 1); - _rpmalloc_unmap(master, unmap_count * _memory_span_size, - master->align_offset, - (size_t)master->total_spans * _memory_span_size); - } -} - -//! Move the span (used for small or medium allocations) to the heap thread -//! cache -static void _rpmalloc_span_release_to_cache(heap_t *heap, span_t *span) { - rpmalloc_assert(heap == span->heap, "Span heap pointer corrupted"); - rpmalloc_assert(span->size_class < SIZE_CLASS_COUNT, - "Invalid span size class"); - rpmalloc_assert(span->span_count == 1, "Invalid span count"); -#if ENABLE_ADAPTIVE_THREAD_CACHE || ENABLE_STATISTICS - atomic_decr32(&heap->span_use[0].current); -#endif - _rpmalloc_stat_dec(&heap->size_class_use[span->size_class].spans_current); - if (!heap->finalize) { - _rpmalloc_stat_inc(&heap->span_use[0].spans_to_cache); - _rpmalloc_stat_inc(&heap->size_class_use[span->size_class].spans_to_cache); - if (heap->size_class[span->size_class].cache) - _rpmalloc_heap_cache_insert(heap, - heap->size_class[span->size_class].cache); - heap->size_class[span->size_class].cache = span; - } else { - _rpmalloc_span_unmap(span); - } -} - -//! Initialize a (partial) free list up to next system memory page, while -//! 
reserving the first block as allocated, returning number of blocks in list -static uint32_t free_list_partial_init(void **list, void **first_block, - void *page_start, void *block_start, - uint32_t block_count, - uint32_t block_size) { - rpmalloc_assert(block_count, "Internal failure"); - *first_block = block_start; - if (block_count > 1) { - void *free_block = pointer_offset(block_start, block_size); - void *block_end = - pointer_offset(block_start, (size_t)block_size * block_count); - // If block size is less than half a memory page, bound init to next memory - // page boundary - if (block_size < (_memory_page_size >> 1)) { - void *page_end = pointer_offset(page_start, _memory_page_size); - if (page_end < block_end) - block_end = page_end; - } - *list = free_block; - block_count = 2; - void *next_block = pointer_offset(free_block, block_size); - while (next_block < block_end) { - *((void **)free_block) = next_block; - free_block = next_block; - ++block_count; - next_block = pointer_offset(next_block, block_size); - } - *((void **)free_block) = 0; - } else { - *list = 0; - } - return block_count; -} - -//! Initialize an unused span (from cache or mapped) to be new active span, -//! putting the initial free list in heap class free list -static void *_rpmalloc_span_initialize_new(heap_t *heap, - heap_size_class_t *heap_size_class, - span_t *span, uint32_t class_idx) { - rpmalloc_assert(span->span_count == 1, "Internal failure"); - size_class_t *size_class = _memory_size_class + class_idx; - span->size_class = class_idx; - span->heap = heap; - span->flags &= ~SPAN_FLAG_ALIGNED_BLOCKS; - span->block_size = size_class->block_size; - span->block_count = size_class->block_count; - span->free_list = 0; - span->list_size = 0; - atomic_store_ptr_release(&span->free_list_deferred, 0); - - // Setup free list. 
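/* A minimal standalone sketch (ours, not rpmalloc code) of the intrusive
   free list that free_list_partial_init above builds: each free block's
   first bytes store the pointer to the next free block, so no side table is
   needed. The buffer, block sizes and main() wrapper are hypothetical. */
#include <assert.h>
#include <stddef.h>

int main(void) {
  enum { BLOCK_SIZE = 32, BLOCK_COUNT = 4 };
  _Alignas(void *) static char buffer[BLOCK_SIZE *
                                      BLOCK_COUNT]; /* stand-in for span memory */
  void *free_list = NULL;
  /* Thread the blocks together back to front, storing "next" in the block */
  for (int i = BLOCK_COUNT - 1; i >= 0; --i) {
    void *block = buffer + (size_t)i * BLOCK_SIZE;
    *(void **)block = free_list;
    free_list = block;
  }
  /* Same operation as free_list_pop in the allocation entry points below */
  void *block = free_list;
  free_list = *(void **)block;
  assert(block == (void *)buffer && free_list == (void *)(buffer + BLOCK_SIZE));
  return 0;
}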
Only initialize one system page worth of free blocks in - // list - void *block; - span->free_list_limit = - free_list_partial_init(&heap_size_class->free_list, &block, span, - pointer_offset(span, SPAN_HEADER_SIZE), - size_class->block_count, size_class->block_size); - // Link span as partial if there remain blocks to be initialized as free - // list, or full if fully initialized - if (span->free_list_limit < span->block_count) { - _rpmalloc_span_double_link_list_add(&heap_size_class->partial_span, span); - span->used_count = span->free_list_limit; - } else { -#if RPMALLOC_FIRST_CLASS_HEAPS - _rpmalloc_span_double_link_list_add(&heap->full_span[class_idx], span); -#endif - ++heap->full_span_count; - span->used_count = span->block_count; - } - return block; -} - -static void _rpmalloc_span_extract_free_list_deferred(span_t *span) { - // We need acquire semantics on the CAS operation since we are interested in - // the list size. Refer to _rpmalloc_deallocate_defer_small_or_medium for - // further comments on this dependency - do { - span->free_list = - atomic_exchange_ptr_acquire(&span->free_list_deferred, INVALID_POINTER); - } while (span->free_list == INVALID_POINTER); - span->used_count -= span->list_size; - span->list_size = 0; - atomic_store_ptr_release(&span->free_list_deferred, 0); -} - -static int _rpmalloc_span_is_fully_utilized(span_t *span) { - rpmalloc_assert(span->free_list_limit <= span->block_count, - "Span free list corrupted"); - return !span->free_list && (span->free_list_limit >= span->block_count); -} - -static int _rpmalloc_span_finalize(heap_t *heap, size_t iclass, span_t *span, - span_t **list_head) { - void *free_list = heap->size_class[iclass].free_list; - span_t *class_span = (span_t *)((uintptr_t)free_list & _memory_span_mask); - if (span == class_span) { - // Adopt the heap class free list back into the span free list - void *block = span->free_list; - void *last_block = 0; - while (block) { - last_block = block; - block = *((void **)block); - } - uint32_t free_count = 0; - block = free_list; - while (block) { - ++free_count; - block = *((void **)block); - } - if (last_block) { - *((void **)last_block) = free_list; - } else { - span->free_list = free_list; - } - heap->size_class[iclass].free_list = 0; - span->used_count -= free_count; - } - // If this assert triggers, you have memory leaks - rpmalloc_assert(span->list_size == span->used_count, "Memory leak detected"); - if (span->list_size == span->used_count) { - _rpmalloc_stat_dec(&heap->span_use[0].current); - _rpmalloc_stat_dec(&heap->size_class_use[iclass].spans_current); - // This function is only used for spans in double linked lists - if (list_head) - _rpmalloc_span_double_link_list_remove(list_head, span); - _rpmalloc_span_unmap(span); - return 1; - } - return 0; -} - -//////////// -/// -/// Global cache -/// -////// - -#if ENABLE_GLOBAL_CACHE - -//! Finalize a global cache -static void _rpmalloc_global_cache_finalize(global_cache_t *cache) { - while (!atomic_cas32_acquire(&cache->lock, 1, 0)) - _rpmalloc_spin(); - - for (size_t ispan = 0; ispan < cache->count; ++ispan) - _rpmalloc_span_unmap(cache->span[ispan]); - cache->count = 0; - - while (cache->overflow) { - span_t *span = cache->overflow; - cache->overflow = span->next; - _rpmalloc_span_unmap(span); - } - - atomic_store32_release(&cache->lock, 0); -} - -static void _rpmalloc_global_cache_insert_spans(span_t **span, - size_t span_count, - size_t count) { - const size_t cache_limit = - (span_count == 1) ?
GLOBAL_CACHE_MULTIPLIER * MAX_THREAD_SPAN_CACHE - : GLOBAL_CACHE_MULTIPLIER * - (MAX_THREAD_SPAN_LARGE_CACHE - (span_count >> 1)); - - global_cache_t *cache = &_memory_span_cache[span_count - 1]; - - size_t insert_count = count; - while (!atomic_cas32_acquire(&cache->lock, 1, 0)) - _rpmalloc_spin(); - -#if ENABLE_STATISTICS - cache->insert_count += count; -#endif - if ((cache->count + insert_count) > cache_limit) - insert_count = cache_limit - cache->count; - - memcpy(cache->span + cache->count, span, sizeof(span_t *) * insert_count); - cache->count += (uint32_t)insert_count; - -#if ENABLE_UNLIMITED_CACHE - while (insert_count < count) { -#else - // Enable unlimited cache if huge pages, or we will leak since it is unlikely - // that an entire huge page will be unmapped, and we're unable to partially - // decommit a huge page - while ((_memory_page_size > _memory_span_size) && (insert_count < count)) { -#endif - span_t *current_span = span[insert_count++]; - current_span->next = cache->overflow; - cache->overflow = current_span; - } - atomic_store32_release(&cache->lock, 0); - - span_t *keep = 0; - for (size_t ispan = insert_count; ispan < count; ++ispan) { - span_t *current_span = span[ispan]; - // Keep master spans that have remaining subspans to avoid dangling them - if ((current_span->flags & SPAN_FLAG_MASTER) && - (atomic_load32(&current_span->remaining_spans) > - (int32_t)current_span->span_count)) { - current_span->next = keep; - keep = current_span; - } else { - _rpmalloc_span_unmap(current_span); - } - } - - if (keep) { - while (!atomic_cas32_acquire(&cache->lock, 1, 0)) - _rpmalloc_spin(); - - size_t islot = 0; - while (keep) { - for (; islot < cache->count; ++islot) { - span_t *current_span = cache->span[islot]; - if (!(current_span->flags & SPAN_FLAG_MASTER) || - ((current_span->flags & SPAN_FLAG_MASTER) && - (atomic_load32(&current_span->remaining_spans) <= - (int32_t)current_span->span_count))) { - _rpmalloc_span_unmap(current_span); - cache->span[islot] = keep; - break; - } - } - if (islot == cache->count) - break; - keep = keep->next; - } - - if (keep) { - span_t *tail = keep; - while (tail->next) - tail = tail->next; - tail->next = cache->overflow; - cache->overflow = keep; - } - - atomic_store32_release(&cache->lock, 0); - } -} - -static size_t _rpmalloc_global_cache_extract_spans(span_t **span, - size_t span_count, - size_t count) { - global_cache_t *cache = &_memory_span_cache[span_count - 1]; - - size_t extract_count = 0; - while (!atomic_cas32_acquire(&cache->lock, 1, 0)) - _rpmalloc_spin(); - -#if ENABLE_STATISTICS - cache->extract_count += count; -#endif - size_t want = count - extract_count; - if (want > cache->count) - want = cache->count; - - memcpy(span + extract_count, cache->span + (cache->count - want), - sizeof(span_t *) * want); - cache->count -= (uint32_t)want; - extract_count += want; - - while ((extract_count < count) && cache->overflow) { - span_t *current_span = cache->overflow; - span[extract_count++] = current_span; - cache->overflow = current_span->next; - } - -#if ENABLE_ASSERTS - for (size_t ispan = 0; ispan < extract_count; ++ispan) { - rpmalloc_assert(span[ispan]->span_count == span_count, - "Global cache span count mismatch"); - } -#endif - - atomic_store32_release(&cache->lock, 0); - - return extract_count; -} - -#endif - -//////////// -/// -/// Heap control -/// -////// - -static void _rpmalloc_deallocate_huge(span_t *); - -//!
Store the given spans as reserve in the given heap -static void _rpmalloc_heap_set_reserved_spans(heap_t *heap, span_t *master, - span_t *reserve, - size_t reserve_span_count) { - heap->span_reserve_master = master; - heap->span_reserve = reserve; - heap->spans_reserved = (uint32_t)reserve_span_count; -} - -//! Adopt the deferred span cache list, optionally extracting the first single -//! span for immediate re-use -static void _rpmalloc_heap_cache_adopt_deferred(heap_t *heap, - span_t **single_span) { - span_t *span = (span_t *)((void *)atomic_exchange_ptr_acquire( - &heap->span_free_deferred, 0)); - while (span) { - span_t *next_span = (span_t *)span->free_list; - rpmalloc_assert(span->heap == heap, "Span heap pointer corrupted"); - if (EXPECTED(span->size_class < SIZE_CLASS_COUNT)) { - rpmalloc_assert(heap->full_span_count, "Heap span counter corrupted"); - --heap->full_span_count; - _rpmalloc_stat_dec(&heap->span_use[0].spans_deferred); -#if RPMALLOC_FIRST_CLASS_HEAPS - _rpmalloc_span_double_link_list_remove(&heap->full_span[span->size_class], - span); -#endif - _rpmalloc_stat_dec(&heap->span_use[0].current); - _rpmalloc_stat_dec(&heap->size_class_use[span->size_class].spans_current); - if (single_span && !*single_span) - *single_span = span; - else - _rpmalloc_heap_cache_insert(heap, span); - } else { - if (span->size_class == SIZE_CLASS_HUGE) { - _rpmalloc_deallocate_huge(span); - } else { - rpmalloc_assert(span->size_class == SIZE_CLASS_LARGE, - "Span size class invalid"); - rpmalloc_assert(heap->full_span_count, "Heap span counter corrupted"); - --heap->full_span_count; -#if RPMALLOC_FIRST_CLASS_HEAPS - _rpmalloc_span_double_link_list_remove(&heap->large_huge_span, span); -#endif - uint32_t idx = span->span_count - 1; - _rpmalloc_stat_dec(&heap->span_use[idx].spans_deferred); - _rpmalloc_stat_dec(&heap->span_use[idx].current); - if (!idx && single_span && !*single_span) - *single_span = span; - else - _rpmalloc_heap_cache_insert(heap, span); - } - } - span = next_span; - } -} - -static void _rpmalloc_heap_unmap(heap_t *heap) { - if (!heap->master_heap) { - if ((heap->finalize > 1) && !atomic_load32(&heap->child_count)) { - span_t *span = (span_t *)((uintptr_t)heap & _memory_span_mask); - _rpmalloc_span_unmap(span); - } - } else { - if (atomic_decr32(&heap->master_heap->child_count) == 0) { - _rpmalloc_heap_unmap(heap->master_heap); - } - } -} - -static void _rpmalloc_heap_global_finalize(heap_t *heap) { - if (heap->finalize++ > 1) { - --heap->finalize; - return; - } - - _rpmalloc_heap_finalize(heap); - -#if ENABLE_THREAD_CACHE - for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { - span_cache_t *span_cache; - if (!iclass) - span_cache = &heap->span_cache; - else - span_cache = (span_cache_t *)(heap->span_large_cache + (iclass - 1)); - for (size_t ispan = 0; ispan < span_cache->count; ++ispan) - _rpmalloc_span_unmap(span_cache->span[ispan]); - span_cache->count = 0; - } -#endif - - if (heap->full_span_count) { - --heap->finalize; - return; - } - - for (size_t iclass = 0; iclass < SIZE_CLASS_COUNT; ++iclass) { - if (heap->size_class[iclass].free_list || - heap->size_class[iclass].partial_span) { - --heap->finalize; - return; - } - } - // Heap is now completely free, unmap and remove from heap list - size_t list_idx = (size_t)heap->id % HEAP_ARRAY_SIZE; - heap_t *list_heap = _memory_heaps[list_idx]; - if (list_heap == heap) { - _memory_heaps[list_idx] = heap->next_heap; - } else { - while (list_heap->next_heap != heap) - list_heap = list_heap->next_heap; - 
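/* A minimal standalone sketch (ours, not rpmalloc code) of the list splice
   performed just above: heaps hang off _memory_heaps[id % HEAP_ARRAY_SIZE]
   as a singly linked list through next_heap, so removal walks the chain
   until the predecessor is found. The node type and main() are
   hypothetical. */
#include <assert.h>
#include <stddef.h>

struct node {
  struct node *next;
};

static void list_remove(struct node **head, struct node *n) {
  if (*head == n) {
    *head = n->next;
    return;
  }
  struct node *it = *head;
  while (it->next != n) /* assumes n is present, as the heap list code does */
    it = it->next;
  it->next = n->next;
}

int main(void) {
  struct node c = {NULL}, b = {&c}, a = {&b};
  struct node *head = &a;
  list_remove(&head, &b);
  assert(head == &a && a.next == &c);
  return 0;
}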
list_heap->next_heap = heap->next_heap; - } - - _rpmalloc_heap_unmap(heap); -} - -//! Insert a single span into thread heap cache, releasing to global cache if -//! overflow -static void _rpmalloc_heap_cache_insert(heap_t *heap, span_t *span) { - if (UNEXPECTED(heap->finalize != 0)) { - _rpmalloc_span_unmap(span); - _rpmalloc_heap_global_finalize(heap); - return; - } -#if ENABLE_THREAD_CACHE - size_t span_count = span->span_count; - _rpmalloc_stat_inc(&heap->span_use[span_count - 1].spans_to_cache); - if (span_count == 1) { - span_cache_t *span_cache = &heap->span_cache; - span_cache->span[span_cache->count++] = span; - if (span_cache->count == MAX_THREAD_SPAN_CACHE) { - const size_t remain_count = - MAX_THREAD_SPAN_CACHE - THREAD_SPAN_CACHE_TRANSFER; -#if ENABLE_GLOBAL_CACHE - _rpmalloc_stat_add64(&heap->thread_to_global, - THREAD_SPAN_CACHE_TRANSFER * _memory_span_size); - _rpmalloc_stat_add(&heap->span_use[span_count - 1].spans_to_global, - THREAD_SPAN_CACHE_TRANSFER); - _rpmalloc_global_cache_insert_spans(span_cache->span + remain_count, - span_count, - THREAD_SPAN_CACHE_TRANSFER); -#else - for (size_t ispan = 0; ispan < THREAD_SPAN_CACHE_TRANSFER; ++ispan) - _rpmalloc_span_unmap(span_cache->span[remain_count + ispan]); -#endif - span_cache->count = remain_count; - } - } else { - size_t cache_idx = span_count - 2; - span_large_cache_t *span_cache = heap->span_large_cache + cache_idx; - span_cache->span[span_cache->count++] = span; - const size_t cache_limit = - (MAX_THREAD_SPAN_LARGE_CACHE - (span_count >> 1)); - if (span_cache->count == cache_limit) { - const size_t transfer_limit = 2 + (cache_limit >> 2); - const size_t transfer_count = - (THREAD_SPAN_LARGE_CACHE_TRANSFER <= transfer_limit - ? THREAD_SPAN_LARGE_CACHE_TRANSFER - : transfer_limit); - const size_t remain_count = cache_limit - transfer_count; -#if ENABLE_GLOBAL_CACHE - _rpmalloc_stat_add64(&heap->thread_to_global, - transfer_count * span_count * _memory_span_size); - _rpmalloc_stat_add(&heap->span_use[span_count - 1].spans_to_global, - transfer_count); - _rpmalloc_global_cache_insert_spans(span_cache->span + remain_count, - span_count, transfer_count); -#else - for (size_t ispan = 0; ispan < transfer_count; ++ispan) - _rpmalloc_span_unmap(span_cache->span[remain_count + ispan]); -#endif - span_cache->count = remain_count; - } - } -#else - (void)sizeof(heap); - _rpmalloc_span_unmap(span); -#endif -} - -//! Extract the given number of spans from the different cache levels -static span_t *_rpmalloc_heap_thread_cache_extract(heap_t *heap, - size_t span_count) { - span_t *span = 0; -#if ENABLE_THREAD_CACHE - span_cache_t *span_cache; - if (span_count == 1) - span_cache = &heap->span_cache; - else - span_cache = (span_cache_t *)(heap->span_large_cache + (span_count - 2)); - if (span_cache->count) { - _rpmalloc_stat_inc(&heap->span_use[span_count - 1].spans_from_cache); - return span_cache->span[--span_cache->count]; - } -#endif - return span; -} - -static span_t *_rpmalloc_heap_thread_cache_deferred_extract(heap_t *heap, - size_t span_count) { - span_t *span = 0; - if (span_count == 1) { - _rpmalloc_heap_cache_adopt_deferred(heap, &span); - } else { - _rpmalloc_heap_cache_adopt_deferred(heap, 0); - span = _rpmalloc_heap_thread_cache_extract(heap, span_count); - } - return span; -} - -static span_t *_rpmalloc_heap_reserved_extract(heap_t *heap, - size_t span_count) { - if (heap->spans_reserved >= span_count) - return _rpmalloc_span_map(heap, span_count); - return 0; -} - -//! 
Extract a span from the global cache -static span_t *_rpmalloc_heap_global_cache_extract(heap_t *heap, - size_t span_count) { -#if ENABLE_GLOBAL_CACHE -#if ENABLE_THREAD_CACHE - span_cache_t *span_cache; - size_t wanted_count; - if (span_count == 1) { - span_cache = &heap->span_cache; - wanted_count = THREAD_SPAN_CACHE_TRANSFER; - } else { - span_cache = (span_cache_t *)(heap->span_large_cache + (span_count - 2)); - wanted_count = THREAD_SPAN_LARGE_CACHE_TRANSFER; - } - span_cache->count = _rpmalloc_global_cache_extract_spans( - span_cache->span, span_count, wanted_count); - if (span_cache->count) { - _rpmalloc_stat_add64(&heap->global_to_thread, - span_count * span_cache->count * _memory_span_size); - _rpmalloc_stat_add(&heap->span_use[span_count - 1].spans_from_global, - span_cache->count); - return span_cache->span[--span_cache->count]; - } -#else - span_t *span = 0; - size_t count = _rpmalloc_global_cache_extract_spans(&span, span_count, 1); - if (count) { - _rpmalloc_stat_add64(&heap->global_to_thread, - span_count * count * _memory_span_size); - _rpmalloc_stat_add(&heap->span_use[span_count - 1].spans_from_global, - count); - return span; - } -#endif -#endif - (void)sizeof(heap); - (void)sizeof(span_count); - return 0; -} - -static void _rpmalloc_inc_span_statistics(heap_t *heap, size_t span_count, - uint32_t class_idx) { - (void)sizeof(heap); - (void)sizeof(span_count); - (void)sizeof(class_idx); -#if ENABLE_ADAPTIVE_THREAD_CACHE || ENABLE_STATISTICS - uint32_t idx = (uint32_t)span_count - 1; - uint32_t current_count = - (uint32_t)atomic_incr32(&heap->span_use[idx].current); - if (current_count > (uint32_t)atomic_load32(&heap->span_use[idx].high)) - atomic_store32(&heap->span_use[idx].high, (int32_t)current_count); - _rpmalloc_stat_add_peak(&heap->size_class_use[class_idx].spans_current, 1, - heap->size_class_use[class_idx].spans_peak); -#endif -} - -//! Get a span from one of the cache levels (thread cache, reserved, global -//! cache) or fallback to mapping more memory -static span_t * -_rpmalloc_heap_extract_new_span(heap_t *heap, - heap_size_class_t *heap_size_class, - size_t span_count, uint32_t class_idx) { - span_t *span; -#if ENABLE_THREAD_CACHE - if (heap_size_class && heap_size_class->cache) { - span = heap_size_class->cache; - heap_size_class->cache = - (heap->span_cache.count - ? heap->span_cache.span[--heap->span_cache.count] - : 0); - _rpmalloc_inc_span_statistics(heap, span_count, class_idx); - return span; - } -#endif - (void)sizeof(class_idx); - // Allow 50% overhead to increase cache hits - size_t base_span_count = span_count; - size_t limit_span_count = - (span_count > 2) ? 
(span_count + (span_count >> 1)) : span_count; - if (limit_span_count > LARGE_CLASS_COUNT) - limit_span_count = LARGE_CLASS_COUNT; - do { - span = _rpmalloc_heap_thread_cache_extract(heap, span_count); - if (EXPECTED(span != 0)) { - _rpmalloc_stat_inc(&heap->size_class_use[class_idx].spans_from_cache); - _rpmalloc_inc_span_statistics(heap, span_count, class_idx); - return span; - } - span = _rpmalloc_heap_thread_cache_deferred_extract(heap, span_count); - if (EXPECTED(span != 0)) { - _rpmalloc_stat_inc(&heap->size_class_use[class_idx].spans_from_cache); - _rpmalloc_inc_span_statistics(heap, span_count, class_idx); - return span; - } - span = _rpmalloc_heap_global_cache_extract(heap, span_count); - if (EXPECTED(span != 0)) { - _rpmalloc_stat_inc(&heap->size_class_use[class_idx].spans_from_cache); - _rpmalloc_inc_span_statistics(heap, span_count, class_idx); - return span; - } - span = _rpmalloc_heap_reserved_extract(heap, span_count); - if (EXPECTED(span != 0)) { - _rpmalloc_stat_inc(&heap->size_class_use[class_idx].spans_from_reserved); - _rpmalloc_inc_span_statistics(heap, span_count, class_idx); - return span; - } - ++span_count; - } while (span_count <= limit_span_count); - // Final fallback, map in more virtual memory - span = _rpmalloc_span_map(heap, base_span_count); - _rpmalloc_inc_span_statistics(heap, base_span_count, class_idx); - _rpmalloc_stat_inc(&heap->size_class_use[class_idx].spans_map_calls); - return span; -} - -static void _rpmalloc_heap_initialize(heap_t *heap) { - _rpmalloc_memset_const(heap, 0, sizeof(heap_t)); - // Get a new heap ID - heap->id = 1 + atomic_incr32(&_memory_heap_id); - - // Link heap into the heap ID map - size_t list_idx = (size_t)heap->id % HEAP_ARRAY_SIZE; - heap->next_heap = _memory_heaps[list_idx]; - _memory_heaps[list_idx] = heap; -} - -static void _rpmalloc_heap_orphan(heap_t *heap, int first_class) { - heap->owner_thread = (uintptr_t)-1; -#if RPMALLOC_FIRST_CLASS_HEAPS - heap_t **heap_list = - (first_class ? &_memory_first_class_orphan_heaps : &_memory_orphan_heaps); -#else - (void)sizeof(first_class); - heap_t **heap_list = &_memory_orphan_heaps; -#endif - heap->next_orphan = *heap_list; - *heap_list = heap; -} - -//! Allocate a new heap from newly mapped memory pages -static heap_t *_rpmalloc_heap_allocate_new(void) { - // Map in pages for 16 heaps. If page size is greater than required size for - // this, map a page and use first part for heaps and remaining part for spans - // for allocations.
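/* A minimal standalone sketch (ours, not rpmalloc code) of the orphan list
   push done by _rpmalloc_heap_orphan above and the matching pop done by
   _rpmalloc_heap_extract_orphan below: a LIFO threaded through next_orphan,
   so the most recently released heap is adopted first. The heap type and
   main() are hypothetical. */
#include <assert.h>
#include <stddef.h>

struct heap {
  struct heap *next_orphan;
};

static void orphan_push(struct heap **list, struct heap *h) {
  h->next_orphan = *list;
  *list = h;
}

static struct heap *orphan_pop(struct heap **list) {
  struct heap *h = *list;
  *list = h ? h->next_orphan : NULL;
  return h;
}

int main(void) {
  struct heap a = {NULL}, b = {NULL};
  struct heap *orphans = NULL;
  orphan_push(&orphans, &a);
  orphan_push(&orphans, &b);
  assert(orphan_pop(&orphans) == &b && orphan_pop(&orphans) == &a);
  return 0;
}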
Adds a lot of complexity, but saves a lot of memory on - // systems where page size > 64 spans (4MiB) - size_t heap_size = sizeof(heap_t); - size_t aligned_heap_size = 16 * ((heap_size + 15) / 16); - size_t request_heap_count = 16; - size_t heap_span_count = ((aligned_heap_size * request_heap_count) + - sizeof(span_t) + _memory_span_size - 1) / - _memory_span_size; - size_t block_size = _memory_span_size * heap_span_count; - size_t span_count = heap_span_count; - span_t *span = 0; - // If there are global reserved spans, use these first - if (_memory_global_reserve_count >= heap_span_count) { - span = _rpmalloc_global_get_reserved_spans(heap_span_count); - } - if (!span) { - if (_memory_page_size > block_size) { - span_count = _memory_page_size / _memory_span_size; - block_size = _memory_page_size; - // If using huge pages, make sure to grab enough heaps to avoid - // reallocating a huge page just to serve new heaps - size_t possible_heap_count = - (block_size - sizeof(span_t)) / aligned_heap_size; - if (possible_heap_count >= (request_heap_count * 16)) - request_heap_count *= 16; - else if (possible_heap_count < request_heap_count) - request_heap_count = possible_heap_count; - heap_span_count = ((aligned_heap_size * request_heap_count) + - sizeof(span_t) + _memory_span_size - 1) / - _memory_span_size; - } - - size_t align_offset = 0; - span = (span_t *)_rpmalloc_mmap(block_size, &align_offset); - if (!span) - return 0; - - // Master span will contain the heaps - _rpmalloc_stat_inc(&_master_spans); - _rpmalloc_span_initialize(span, span_count, heap_span_count, align_offset); - } - - size_t remain_size = _memory_span_size - sizeof(span_t); - heap_t *heap = (heap_t *)pointer_offset(span, sizeof(span_t)); - _rpmalloc_heap_initialize(heap); - - // Put extra heaps as orphans - size_t num_heaps = remain_size / aligned_heap_size; - if (num_heaps < request_heap_count) - num_heaps = request_heap_count; - atomic_store32(&heap->child_count, (int32_t)num_heaps - 1); - heap_t *extra_heap = (heap_t *)pointer_offset(heap, aligned_heap_size); - while (num_heaps > 1) { - _rpmalloc_heap_initialize(extra_heap); - extra_heap->master_heap = heap; - _rpmalloc_heap_orphan(extra_heap, 1); - extra_heap = (heap_t *)pointer_offset(extra_heap, aligned_heap_size); - --num_heaps; - } - - if (span_count > heap_span_count) { - // Cap reserved spans - size_t remain_count = span_count - heap_span_count; - size_t reserve_count = - (remain_count > _memory_heap_reserve_count ? _memory_heap_reserve_count - : remain_count); - span_t *remain_span = - (span_t *)pointer_offset(span, heap_span_count * _memory_span_size); - _rpmalloc_heap_set_reserved_spans(heap, span, remain_span, reserve_count); - - if (remain_count > reserve_count) { - // Set to global reserved spans - remain_span = (span_t *)pointer_offset(remain_span, - reserve_count * _memory_span_size); - reserve_count = remain_count - reserve_count; - _rpmalloc_global_set_reserved_spans(span, remain_span, reserve_count); - } - } - - return heap; -} - -static heap_t *_rpmalloc_heap_extract_orphan(heap_t **heap_list) { - heap_t *heap = *heap_list; - *heap_list = (heap ? heap->next_orphan : 0); - return heap; -} - -//! 
Allocate a new heap, potentially reusing a previously orphaned heap -static heap_t *_rpmalloc_heap_allocate(int first_class) { - heap_t *heap = 0; - while (!atomic_cas32_acquire(&_memory_global_lock, 1, 0)) - _rpmalloc_spin(); - if (first_class == 0) - heap = _rpmalloc_heap_extract_orphan(&_memory_orphan_heaps); -#if RPMALLOC_FIRST_CLASS_HEAPS - if (!heap) - heap = _rpmalloc_heap_extract_orphan(&_memory_first_class_orphan_heaps); -#endif - if (!heap) - heap = _rpmalloc_heap_allocate_new(); - atomic_store32_release(&_memory_global_lock, 0); - if (heap) - _rpmalloc_heap_cache_adopt_deferred(heap, 0); - return heap; -} - -static void _rpmalloc_heap_release(void *heapptr, int first_class, - int release_cache) { - heap_t *heap = (heap_t *)heapptr; - if (!heap) - return; - // Release thread cache spans back to global cache - _rpmalloc_heap_cache_adopt_deferred(heap, 0); - if (release_cache || heap->finalize) { -#if ENABLE_THREAD_CACHE - for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { - span_cache_t *span_cache; - if (!iclass) - span_cache = &heap->span_cache; - else - span_cache = (span_cache_t *)(heap->span_large_cache + (iclass - 1)); - if (!span_cache->count) - continue; -#if ENABLE_GLOBAL_CACHE - if (heap->finalize) { - for (size_t ispan = 0; ispan < span_cache->count; ++ispan) - _rpmalloc_span_unmap(span_cache->span[ispan]); - } else { - _rpmalloc_stat_add64(&heap->thread_to_global, span_cache->count * - (iclass + 1) * - _memory_span_size); - _rpmalloc_stat_add(&heap->span_use[iclass].spans_to_global, - span_cache->count); - _rpmalloc_global_cache_insert_spans(span_cache->span, iclass + 1, - span_cache->count); - } -#else - for (size_t ispan = 0; ispan < span_cache->count; ++ispan) - _rpmalloc_span_unmap(span_cache->span[ispan]); -#endif - span_cache->count = 0; - } -#endif - } - - if (get_thread_heap_raw() == heap) - set_thread_heap(0); - -#if ENABLE_STATISTICS - atomic_decr32(&_memory_active_heaps); - rpmalloc_assert(atomic_load32(&_memory_active_heaps) >= 0, - "Still active heaps during finalization"); -#endif - - // If we are forcibly terminating with _exit the state of the - // lock atomic is unknown and it's best to just go ahead and exit - if (get_thread_id() != _rpmalloc_main_thread_id) { - while (!atomic_cas32_acquire(&_memory_global_lock, 1, 0)) - _rpmalloc_spin(); - } - _rpmalloc_heap_orphan(heap, first_class); - atomic_store32_release(&_memory_global_lock, 0); -} - -static void _rpmalloc_heap_release_raw(void *heapptr, int release_cache) { - _rpmalloc_heap_release(heapptr, 0, release_cache); -} - -static void _rpmalloc_heap_release_raw_fc(void *heapptr) { - _rpmalloc_heap_release_raw(heapptr, 1); -} - -static void _rpmalloc_heap_finalize(heap_t *heap) { - if (heap->spans_reserved) { - span_t *span = _rpmalloc_span_map(heap, heap->spans_reserved); - _rpmalloc_span_unmap(span); - heap->spans_reserved = 0; - } - - _rpmalloc_heap_cache_adopt_deferred(heap, 0); - - for (size_t iclass = 0; iclass < SIZE_CLASS_COUNT; ++iclass) { - if (heap->size_class[iclass].cache) - _rpmalloc_span_unmap(heap->size_class[iclass].cache); - heap->size_class[iclass].cache = 0; - span_t *span = heap->size_class[iclass].partial_span; - while (span) { - span_t *next = span->next; - _rpmalloc_span_finalize(heap, iclass, span, - &heap->size_class[iclass].partial_span); - span = next; - } - // If class still has a free list it must be a full span - if (heap->size_class[iclass].free_list) { - span_t *class_span = - (span_t *)((uintptr_t)heap->size_class[iclass].free_list & - _memory_span_mask); 
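/* A minimal standalone sketch (ours, not rpmalloc code) of the mask trick
   used just above: spans are aligned to the span size, so clearing the low
   bits of any block pointer with _memory_span_mask yields the owning span
   header. Assumes the default 64KiB span size; the addresses and main() are
   hypothetical. */
#include <assert.h>
#include <stdint.h>

int main(void) {
  const uintptr_t span_size = 64 * 1024;
  const uintptr_t span_mask = ~(span_size - 1); /* shape of _memory_span_mask */
  uintptr_t span = (uintptr_t)16 << 20; /* hypothetical span start, 64KiB aligned */
  uintptr_t block = span + 5000;        /* a block somewhere inside that span */
  assert((block & span_mask) == span);
  return 0;
}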
- span_t **list = 0; -#if RPMALLOC_FIRST_CLASS_HEAPS - list = &heap->full_span[iclass]; -#endif - --heap->full_span_count; - if (!_rpmalloc_span_finalize(heap, iclass, class_span, list)) { - if (list) - _rpmalloc_span_double_link_list_remove(list, class_span); - _rpmalloc_span_double_link_list_add( - &heap->size_class[iclass].partial_span, class_span); - } - } - } - -#if ENABLE_THREAD_CACHE - for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { - span_cache_t *span_cache; - if (!iclass) - span_cache = &heap->span_cache; - else - span_cache = (span_cache_t *)(heap->span_large_cache + (iclass - 1)); - for (size_t ispan = 0; ispan < span_cache->count; ++ispan) - _rpmalloc_span_unmap(span_cache->span[ispan]); - span_cache->count = 0; - } -#endif - rpmalloc_assert(!atomic_load_ptr(&heap->span_free_deferred), - "Heaps still active during finalization"); -} - -//////////// -/// -/// Allocation entry points -/// -////// - -//! Pop first block from a free list -static void *free_list_pop(void **list) { - void *block = *list; - *list = *((void **)block); - return block; -} - -//! Allocate a small/medium sized memory block from the given heap -static void *_rpmalloc_allocate_from_heap_fallback( - heap_t *heap, heap_size_class_t *heap_size_class, uint32_t class_idx) { - span_t *span = heap_size_class->partial_span; - rpmalloc_assume(heap != 0); - if (EXPECTED(span != 0)) { - rpmalloc_assert(span->block_count == - _memory_size_class[span->size_class].block_count, - "Span block count corrupted"); - rpmalloc_assert(!_rpmalloc_span_is_fully_utilized(span), - "Internal failure"); - void *block; - if (span->free_list) { - // Span local free list is not empty, swap to size class free list - block = free_list_pop(&span->free_list); - heap_size_class->free_list = span->free_list; - span->free_list = 0; - } else { - // If the span did not fully initialize free list, link up another page - // worth of blocks - void *block_start = pointer_offset( - span, SPAN_HEADER_SIZE + - ((size_t)span->free_list_limit * span->block_size)); - span->free_list_limit += free_list_partial_init( - &heap_size_class->free_list, &block, - (void *)((uintptr_t)block_start & ~(_memory_page_size - 1)), - block_start, span->block_count - span->free_list_limit, - span->block_size); - } - rpmalloc_assert(span->free_list_limit <= span->block_count, - "Span block count corrupted"); - span->used_count = span->free_list_limit; - - // Swap in deferred free list if present - if (atomic_load_ptr(&span->free_list_deferred)) - _rpmalloc_span_extract_free_list_deferred(span); - - // If span is still not fully utilized keep it in partial list and early - // return block - if (!_rpmalloc_span_is_fully_utilized(span)) - return block; - - // The span is fully utilized, unlink from partial list and add to fully - // utilized list - _rpmalloc_span_double_link_list_pop_head(&heap_size_class->partial_span, - span); -#if RPMALLOC_FIRST_CLASS_HEAPS - _rpmalloc_span_double_link_list_add(&heap->full_span[class_idx], span); -#endif - ++heap->full_span_count; - return block; - } - - // Find a span in one of the cache levels - span = _rpmalloc_heap_extract_new_span(heap, heap_size_class, 1, class_idx); - if (EXPECTED(span != 0)) { - // Mark span as owned by this heap and set base data, return first block - return _rpmalloc_span_initialize_new(heap, heap_size_class, span, - class_idx); - } - - return 0; -} - -//! 
Allocate a small sized memory block from the given heap -static void *_rpmalloc_allocate_small(heap_t *heap, size_t size) { - rpmalloc_assert(heap, "No thread heap"); - // Small sizes have unique size classes - const uint32_t class_idx = - (uint32_t)((size + (SMALL_GRANULARITY - 1)) >> SMALL_GRANULARITY_SHIFT); - heap_size_class_t *heap_size_class = heap->size_class + class_idx; - _rpmalloc_stat_inc_alloc(heap, class_idx); - if (EXPECTED(heap_size_class->free_list != 0)) - return free_list_pop(&heap_size_class->free_list); - return _rpmalloc_allocate_from_heap_fallback(heap, heap_size_class, - class_idx); -} - -//! Allocate a medium sized memory block from the given heap -static void *_rpmalloc_allocate_medium(heap_t *heap, size_t size) { - rpmalloc_assert(heap, "No thread heap"); - // Calculate the size class index and do a dependent lookup of the final class - // index (in case of merged classes) - const uint32_t base_idx = - (uint32_t)(SMALL_CLASS_COUNT + - ((size - (SMALL_SIZE_LIMIT + 1)) >> MEDIUM_GRANULARITY_SHIFT)); - const uint32_t class_idx = _memory_size_class[base_idx].class_idx; - heap_size_class_t *heap_size_class = heap->size_class + class_idx; - _rpmalloc_stat_inc_alloc(heap, class_idx); - if (EXPECTED(heap_size_class->free_list != 0)) - return free_list_pop(&heap_size_class->free_list); - return _rpmalloc_allocate_from_heap_fallback(heap, heap_size_class, - class_idx); -} - -//! Allocate a large sized memory block from the given heap -static void *_rpmalloc_allocate_large(heap_t *heap, size_t size) { - rpmalloc_assert(heap, "No thread heap"); - // Calculate number of needed max sized spans (including header) - // Since this function is never called if size > LARGE_SIZE_LIMIT - // the span_count is guaranteed to be <= LARGE_CLASS_COUNT - size += SPAN_HEADER_SIZE; - size_t span_count = size >> _memory_span_size_shift; - if (size & (_memory_span_size - 1)) - ++span_count; - - // Find a span in one of the cache levels - span_t *span = - _rpmalloc_heap_extract_new_span(heap, 0, span_count, SIZE_CLASS_LARGE); - if (!span) - return span; - - // Mark span as owned by this heap and set base data - rpmalloc_assert(span->span_count >= span_count, "Internal failure"); - span->size_class = SIZE_CLASS_LARGE; - span->heap = heap; - -#if RPMALLOC_FIRST_CLASS_HEAPS - _rpmalloc_span_double_link_list_add(&heap->large_huge_span, span); -#endif - ++heap->full_span_count; - - return pointer_offset(span, SPAN_HEADER_SIZE); -} - -//! Allocate a huge block by mapping memory pages directly -static void *_rpmalloc_allocate_huge(heap_t *heap, size_t size) { - rpmalloc_assert(heap, "No thread heap"); - _rpmalloc_heap_cache_adopt_deferred(heap, 0); - size += SPAN_HEADER_SIZE; - size_t num_pages = size >> _memory_page_size_shift; - if (size & (_memory_page_size - 1)) - ++num_pages; - size_t align_offset = 0; - span_t *span = - (span_t *)_rpmalloc_mmap(num_pages * _memory_page_size, &align_offset); - if (!span) - return span; - - // Store page count in span_count - span->size_class = SIZE_CLASS_HUGE; - span->span_count = (uint32_t)num_pages; - span->align_offset = (uint32_t)align_offset; - span->heap = heap; - _rpmalloc_stat_add_peak(&_huge_pages_current, num_pages, _huge_pages_peak); - -#if RPMALLOC_FIRST_CLASS_HEAPS - _rpmalloc_span_double_link_list_add(&heap->large_huge_span, span); -#endif - ++heap->full_span_count; - - return pointer_offset(span, SPAN_HEADER_SIZE); -} - -//! 
Allocate a block of the given size -static void *_rpmalloc_allocate(heap_t *heap, size_t size) { - _rpmalloc_stat_add64(&_allocation_counter, 1); - if (EXPECTED(size <= SMALL_SIZE_LIMIT)) - return _rpmalloc_allocate_small(heap, size); - else if (size <= _memory_medium_size_limit) - return _rpmalloc_allocate_medium(heap, size); - else if (size <= LARGE_SIZE_LIMIT) - return _rpmalloc_allocate_large(heap, size); - return _rpmalloc_allocate_huge(heap, size); -} - -static void *_rpmalloc_aligned_allocate(heap_t *heap, size_t alignment, - size_t size) { - if (alignment <= SMALL_GRANULARITY) - return _rpmalloc_allocate(heap, size); - -#if ENABLE_VALIDATE_ARGS - if ((size + alignment) < size) { - errno = EINVAL; - return 0; - } - if (alignment & (alignment - 1)) { - errno = EINVAL; - return 0; - } -#endif - - if ((alignment <= SPAN_HEADER_SIZE) && - ((size + SPAN_HEADER_SIZE) < _memory_medium_size_limit)) { - // If alignment is less than or equal to span header size (which is power of - // two), and size aligned to span header size multiples is less than size + - // alignment, then use natural alignment of blocks to provide alignment - size_t multiple_size = size ? (size + (SPAN_HEADER_SIZE - 1)) & - ~(uintptr_t)(SPAN_HEADER_SIZE - 1) - : SPAN_HEADER_SIZE; - rpmalloc_assert(!(multiple_size % SPAN_HEADER_SIZE), - "Failed alignment calculation"); - if (multiple_size <= (size + alignment)) - return _rpmalloc_allocate(heap, multiple_size); - } - - void *ptr = 0; - size_t align_mask = alignment - 1; - if (alignment <= _memory_page_size) { - ptr = _rpmalloc_allocate(heap, size + alignment); - if ((uintptr_t)ptr & align_mask) { - ptr = (void *)(((uintptr_t)ptr & ~(uintptr_t)align_mask) + alignment); - // Mark as having aligned blocks - span_t *span = (span_t *)((uintptr_t)ptr & _memory_span_mask); - span->flags |= SPAN_FLAG_ALIGNED_BLOCKS; - } - return ptr; - } - - // Fallback to mapping new pages for this request. Since pointers passed - // to rpfree must be able to reach the start of the span by bitmasking of - // the address with the span size, the returned aligned pointer from this - // function must be within a span size of the start of the mapped area. - // In worst case this requires us to loop and map pages until we get a - // suitable memory address.
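As that comment explains, rpfree recovers the owning span by masking the block address with the span size, so a valid aligned allocation must start within one span size of the mapped region; the comment continues below with the resulting alignment limit. A minimal sketch of the mask-based lookup itself, assuming a hypothetical 128 KiB span size:

#include <stdint.h>
#include <stdio.h>

#define SPAN_SIZE ((uintptr_t)0x20000) /* assumed span size, power of two */
#define SPAN_MASK (~(SPAN_SIZE - 1))   /* clears the in-span offset bits */

/* Mirror of the lookup in _rpmalloc_deallocate: the span header sits at
   the span-aligned address at or below the block pointer. */
static uintptr_t span_of(uintptr_t block) { return block & SPAN_MASK; }

int main(void) {
  uintptr_t span = 0x20000;        /* pretend this span was mapped here */
  uintptr_t block = span + 0x1240; /* a block somewhere inside the span */
  printf("0x%lx\n", (unsigned long)span_of(block)); /* prints 0x20000 */
  return 0;
}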
It also means we can never align to span size - // or greater, since the span header will push alignment more than one - // span size away from span start (thus causing pointer mask to give us - // an invalid span start on free) - if (alignment & align_mask) { - errno = EINVAL; - return 0; - } - if (alignment >= _memory_span_size) { - errno = EINVAL; - return 0; - } - - size_t extra_pages = alignment / _memory_page_size; - - // Since each span has a header, we will at least need one extra memory page - size_t num_pages = 1 + (size / _memory_page_size); - if (size & (_memory_page_size - 1)) - ++num_pages; - - if (extra_pages > num_pages) - num_pages = 1 + extra_pages; - - size_t original_pages = num_pages; - size_t limit_pages = (_memory_span_size / _memory_page_size) * 2; - if (limit_pages < (original_pages * 2)) - limit_pages = original_pages * 2; - - size_t mapped_size, align_offset; - span_t *span; - -retry: - align_offset = 0; - mapped_size = num_pages * _memory_page_size; - - span = (span_t *)_rpmalloc_mmap(mapped_size, &align_offset); - if (!span) { - errno = ENOMEM; - return 0; - } - ptr = pointer_offset(span, SPAN_HEADER_SIZE); - - if ((uintptr_t)ptr & align_mask) - ptr = (void *)(((uintptr_t)ptr & ~(uintptr_t)align_mask) + alignment); - - if (((size_t)pointer_diff(ptr, span) >= _memory_span_size) || - (pointer_offset(ptr, size) > pointer_offset(span, mapped_size)) || - (((uintptr_t)ptr & _memory_span_mask) != (uintptr_t)span)) { - _rpmalloc_unmap(span, mapped_size, align_offset, mapped_size); - ++num_pages; - if (num_pages > limit_pages) { - errno = EINVAL; - return 0; - } - goto retry; - } - - // Store page count in span_count - span->size_class = SIZE_CLASS_HUGE; - span->span_count = (uint32_t)num_pages; - span->align_offset = (uint32_t)align_offset; - span->heap = heap; - _rpmalloc_stat_add_peak(&_huge_pages_current, num_pages, _huge_pages_peak); - -#if RPMALLOC_FIRST_CLASS_HEAPS - _rpmalloc_span_double_link_list_add(&heap->large_huge_span, span); -#endif - ++heap->full_span_count; - - _rpmalloc_stat_add64(&_allocation_counter, 1); - - return ptr; -} - -//////////// -/// -/// Deallocation entry points -/// -////// - -//! Deallocate the given small/medium memory block in the current thread local -//! 
heap -static void _rpmalloc_deallocate_direct_small_or_medium(span_t *span, - void *block) { - heap_t *heap = span->heap; - rpmalloc_assert(heap->owner_thread == get_thread_id() || - !heap->owner_thread || heap->finalize, - "Internal failure"); - // Add block to free list - if (UNEXPECTED(_rpmalloc_span_is_fully_utilized(span))) { - span->used_count = span->block_count; -#if RPMALLOC_FIRST_CLASS_HEAPS - _rpmalloc_span_double_link_list_remove(&heap->full_span[span->size_class], - span); -#endif - _rpmalloc_span_double_link_list_add( - &heap->size_class[span->size_class].partial_span, span); - --heap->full_span_count; - } - *((void **)block) = span->free_list; - --span->used_count; - span->free_list = block; - if (UNEXPECTED(span->used_count == span->list_size)) { - // If there are no used blocks it is guaranteed that no other external - // thread is accessing the span - if (span->used_count) { - // Make sure we have synchronized the deferred list and list size by using - // acquire semantics and guarantee that no external thread is accessing - // span concurrently - void *free_list; - do { - free_list = atomic_exchange_ptr_acquire(&span->free_list_deferred, - INVALID_POINTER); - } while (free_list == INVALID_POINTER); - atomic_store_ptr_release(&span->free_list_deferred, free_list); - } - _rpmalloc_span_double_link_list_remove( - &heap->size_class[span->size_class].partial_span, span); - _rpmalloc_span_release_to_cache(heap, span); - } -} - -static void _rpmalloc_deallocate_defer_free_span(heap_t *heap, span_t *span) { - if (span->size_class != SIZE_CLASS_HUGE) - _rpmalloc_stat_inc(&heap->span_use[span->span_count - 1].spans_deferred); - // This list does not need ABA protection, no mutable side state - do { - span->free_list = (void *)atomic_load_ptr(&heap->span_free_deferred); - } while (!atomic_cas_ptr(&heap->span_free_deferred, span, span->free_list)); -} - -//! Put the block in the deferred free list of the owning span -static void _rpmalloc_deallocate_defer_small_or_medium(span_t *span, - void *block) { - // The memory ordering here is a bit tricky, to avoid having to ABA protect - // the deferred free list to avoid desynchronization of list and list size - // we need to have acquire semantics on successful CAS of the pointer to - // guarantee the list_size variable validity + release semantics on pointer - // store - void *free_list; - do { - free_list = - atomic_exchange_ptr_acquire(&span->free_list_deferred, INVALID_POINTER); - } while (free_list == INVALID_POINTER); - *((void **)block) = free_list; - uint32_t free_count = ++span->list_size; - int all_deferred_free = (free_count == span->block_count); - atomic_store_ptr_release(&span->free_list_deferred, block); - if (all_deferred_free) { - // Span was completely freed by this block. Due to the INVALID_POINTER spin - // lock no other thread can reach this state simultaneously on this span. 
- // Safe to move to owner heap deferred cache - _rpmalloc_deallocate_defer_free_span(span->heap, span); - } -} - -static void _rpmalloc_deallocate_small_or_medium(span_t *span, void *p) { - _rpmalloc_stat_inc_free(span->heap, span->size_class); - if (span->flags & SPAN_FLAG_ALIGNED_BLOCKS) { - // Realign pointer to block start - void *blocks_start = pointer_offset(span, SPAN_HEADER_SIZE); - uint32_t block_offset = (uint32_t)pointer_diff(p, blocks_start); - p = pointer_offset(p, -(int32_t)(block_offset % span->block_size)); - } - // Check if block belongs to this heap or if deallocation should be deferred -#if RPMALLOC_FIRST_CLASS_HEAPS - int defer = - (span->heap->owner_thread && - (span->heap->owner_thread != get_thread_id()) && !span->heap->finalize); -#else - int defer = - ((span->heap->owner_thread != get_thread_id()) && !span->heap->finalize); -#endif - if (!defer) - _rpmalloc_deallocate_direct_small_or_medium(span, p); - else - _rpmalloc_deallocate_defer_small_or_medium(span, p); -} - -//! Deallocate the given large memory block to the current heap -static void _rpmalloc_deallocate_large(span_t *span) { - rpmalloc_assert(span->size_class == SIZE_CLASS_LARGE, "Bad span size class"); - rpmalloc_assert(!(span->flags & SPAN_FLAG_MASTER) || - !(span->flags & SPAN_FLAG_SUBSPAN), - "Span flag corrupted"); - rpmalloc_assert((span->flags & SPAN_FLAG_MASTER) || - (span->flags & SPAN_FLAG_SUBSPAN), - "Span flag corrupted"); - // We must always defer (unless finalizing) if from another heap since we - // cannot touch the list or counters of another heap -#if RPMALLOC_FIRST_CLASS_HEAPS - int defer = - (span->heap->owner_thread && - (span->heap->owner_thread != get_thread_id()) && !span->heap->finalize); -#else - int defer = - ((span->heap->owner_thread != get_thread_id()) && !span->heap->finalize); -#endif - if (defer) { - _rpmalloc_deallocate_defer_free_span(span->heap, span); - return; - } - rpmalloc_assert(span->heap->full_span_count, "Heap span counter corrupted"); - --span->heap->full_span_count; -#if RPMALLOC_FIRST_CLASS_HEAPS - _rpmalloc_span_double_link_list_remove(&span->heap->large_huge_span, span); -#endif -#if ENABLE_ADAPTIVE_THREAD_CACHE || ENABLE_STATISTICS - // Decrease counter - size_t idx = span->span_count - 1; - atomic_decr32(&span->heap->span_use[idx].current); -#endif - heap_t *heap = span->heap; - rpmalloc_assert(heap, "No thread heap"); -#if ENABLE_THREAD_CACHE - const int set_as_reserved = - ((span->span_count > 1) && (heap->span_cache.count == 0) && - !heap->finalize && !heap->spans_reserved); -#else - const int set_as_reserved = - ((span->span_count > 1) && !heap->finalize && !heap->spans_reserved); -#endif - if (set_as_reserved) { - heap->span_reserve = span; - heap->spans_reserved = span->span_count; - if (span->flags & SPAN_FLAG_MASTER) { - heap->span_reserve_master = span; - } else { // SPAN_FLAG_SUBSPAN - span_t *master = (span_t *)pointer_offset( - span, - -(intptr_t)((size_t)span->offset_from_master * _memory_span_size)); - heap->span_reserve_master = master; - rpmalloc_assert(master->flags & SPAN_FLAG_MASTER, "Span flag corrupted"); - rpmalloc_assert(atomic_load32(&master->remaining_spans) >= - (int32_t)span->span_count, - "Master span count corrupted"); - } - _rpmalloc_stat_inc(&heap->span_use[idx].spans_to_reserved); - } else { - // Insert into cache list - _rpmalloc_heap_cache_insert(heap, span); - } -} - -//! 
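The comments above describe a small spin protocol on free_list_deferred: a thread swaps in the INVALID_POINTER sentinel with acquire semantics, which grants it exclusive ownership of both the list head and list_size until a release store publishes the new head. A compilable sketch of just that locking pattern in C11 (simplified types, no allocator state):

#include <stdatomic.h>
#include <stdint.h>

#define INVALID_POINTER ((void *)((uintptr_t)-1))

typedef struct {
  _Atomic(void *) free_list_deferred;
  uint32_t list_size; /* only touched while the sentinel is held */
} deferred_list_t;

/* Same protocol as _rpmalloc_deallocate_defer_small_or_medium: take the
   list with exchange-acquire, mutate it exclusively, publish with release. */
static void deferred_push(deferred_list_t *dl, void *block) {
  void *head;
  do {
    head = atomic_exchange_explicit(&dl->free_list_deferred, INVALID_POINTER,
                                    memory_order_acquire);
  } while (head == INVALID_POINTER); /* another thread owns the list: spin */
  *(void **)block = head;            /* link block in front of the old head */
  ++dl->list_size;                   /* safe: we own the list exclusively */
  atomic_store_explicit(&dl->free_list_deferred, block, memory_order_release);
}

int main(void) {
  deferred_list_t dl;
  atomic_init(&dl.free_list_deferred, (void *)0);
  dl.list_size = 0;
  void *slot[1]; /* stands in for a free block; first word = next pointer */
  deferred_push(&dl, slot);
  return (int)dl.list_size - 1; /* 0 on success */
}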
Deallocate the given huge span -static void _rpmalloc_deallocate_huge(span_t *span) { - rpmalloc_assert(span->heap, "No span heap"); -#if RPMALLOC_FIRST_CLASS_HEAPS - int defer = - (span->heap->owner_thread && - (span->heap->owner_thread != get_thread_id()) && !span->heap->finalize); -#else - int defer = - ((span->heap->owner_thread != get_thread_id()) && !span->heap->finalize); -#endif - if (defer) { - _rpmalloc_deallocate_defer_free_span(span->heap, span); - return; - } - rpmalloc_assert(span->heap->full_span_count, "Heap span counter corrupted"); - --span->heap->full_span_count; -#if RPMALLOC_FIRST_CLASS_HEAPS - _rpmalloc_span_double_link_list_remove(&span->heap->large_huge_span, span); -#endif - - // Oversized allocation, page count is stored in span_count - size_t num_pages = span->span_count; - _rpmalloc_unmap(span, num_pages * _memory_page_size, span->align_offset, - num_pages * _memory_page_size); - _rpmalloc_stat_sub(&_huge_pages_current, num_pages); -} - -//! Deallocate the given block -static void _rpmalloc_deallocate(void *p) { - _rpmalloc_stat_add64(&_deallocation_counter, 1); - // Grab the span (always at start of span, using span alignment) - span_t *span = (span_t *)((uintptr_t)p & _memory_span_mask); - if (UNEXPECTED(!span)) - return; - if (EXPECTED(span->size_class < SIZE_CLASS_COUNT)) - _rpmalloc_deallocate_small_or_medium(span, p); - else if (span->size_class == SIZE_CLASS_LARGE) - _rpmalloc_deallocate_large(span); - else - _rpmalloc_deallocate_huge(span); -} - -//////////// -/// -/// Reallocation entry points -/// -////// - -static size_t _rpmalloc_usable_size(void *p); - -//! Reallocate the given block to the given size -static void *_rpmalloc_reallocate(heap_t *heap, void *p, size_t size, - size_t oldsize, unsigned int flags) { - if (p) { - // Grab the span using guaranteed span alignment - span_t *span = (span_t *)((uintptr_t)p & _memory_span_mask); - if (EXPECTED(span->size_class < SIZE_CLASS_COUNT)) { - // Small/medium sized block - rpmalloc_assert(span->span_count == 1, "Span counter corrupted"); - void *blocks_start = pointer_offset(span, SPAN_HEADER_SIZE); - uint32_t block_offset = (uint32_t)pointer_diff(p, blocks_start); - uint32_t block_idx = block_offset / span->block_size; - void *block = - pointer_offset(blocks_start, (size_t)block_idx * span->block_size); - if (!oldsize) - oldsize = - (size_t)((ptrdiff_t)span->block_size - pointer_diff(p, block)); - if ((size_t)span->block_size >= size) { - // Still fits in block, never mind trying to save memory, but preserve - // data if alignment changed - if ((p != block) && !(flags & RPMALLOC_NO_PRESERVE)) - memmove(block, p, oldsize); - return block; - } - } else if (span->size_class == SIZE_CLASS_LARGE) { - // Large block - size_t total_size = size + SPAN_HEADER_SIZE; - size_t num_spans = total_size >> _memory_span_size_shift; - if (total_size & (_memory_span_mask - 1)) - ++num_spans; - size_t current_spans = span->span_count; - void *block = pointer_offset(span, SPAN_HEADER_SIZE); - if (!oldsize) - oldsize = (current_spans * _memory_span_size) - - (size_t)pointer_diff(p, block) - SPAN_HEADER_SIZE; - if ((current_spans >= num_spans) && (total_size >= (oldsize / 2))) { - // Still fits in block, never mind trying to save memory, but preserve - // data if alignment changed - if ((p != block) && !(flags & RPMALLOC_NO_PRESERVE)) - memmove(block, p, oldsize); - return block; - } - } else { - // Oversized block - size_t total_size = size + SPAN_HEADER_SIZE; - size_t num_pages = total_size >> _memory_page_size_shift; - if 
(total_size & (_memory_page_size - 1)) - ++num_pages; - // Page count is stored in span_count - size_t current_pages = span->span_count; - void *block = pointer_offset(span, SPAN_HEADER_SIZE); - if (!oldsize) - oldsize = (current_pages * _memory_page_size) - - (size_t)pointer_diff(p, block) - SPAN_HEADER_SIZE; - if ((current_pages >= num_pages) && (num_pages >= (current_pages / 2))) { - // Still fits in block, never mind trying to save memory, but preserve - // data if alignment changed - if ((p != block) && !(flags & RPMALLOC_NO_PRESERVE)) - memmove(block, p, oldsize); - return block; - } - } - } else { - oldsize = 0; - } - - if (!!(flags & RPMALLOC_GROW_OR_FAIL)) - return 0; - - // Size is greater than block size, need to allocate a new block and - // deallocate the old. Avoid hysteresis by overallocating if increase is small - // (below 37%) - size_t lower_bound = oldsize + (oldsize >> 2) + (oldsize >> 3); - size_t new_size = - (size > lower_bound) ? size : ((size > oldsize) ? lower_bound : size); - void *block = _rpmalloc_allocate(heap, new_size); - if (p && block) { - if (!(flags & RPMALLOC_NO_PRESERVE)) - memcpy(block, p, oldsize < new_size ? oldsize : new_size); - _rpmalloc_deallocate(p); - } - - return block; -} - -static void *_rpmalloc_aligned_reallocate(heap_t *heap, void *ptr, - size_t alignment, size_t size, - size_t oldsize, unsigned int flags) { - if (alignment <= SMALL_GRANULARITY) - return _rpmalloc_reallocate(heap, ptr, size, oldsize, flags); - - int no_alloc = !!(flags & RPMALLOC_GROW_OR_FAIL); - size_t usablesize = (ptr ? _rpmalloc_usable_size(ptr) : 0); - if ((usablesize >= size) && !((uintptr_t)ptr & (alignment - 1))) { - if (no_alloc || (size >= (usablesize / 2))) - return ptr; - } - // Aligned alloc marks span as having aligned blocks - void *block = - (!no_alloc ? _rpmalloc_aligned_allocate(heap, alignment, size) : 0); - if (EXPECTED(block != 0)) { - if (!(flags & RPMALLOC_NO_PRESERVE) && ptr) { - if (!oldsize) - oldsize = usablesize; - memcpy(block, ptr, oldsize < size ? oldsize : size); - } - _rpmalloc_deallocate(ptr); - } - return block; -} - -//////////// -/// -/// Initialization, finalization and utility -/// -////// - -//! Get the usable size of the given block -static size_t _rpmalloc_usable_size(void *p) { - // Grab the span using guaranteed span alignment - span_t *span = (span_t *)((uintptr_t)p & _memory_span_mask); - if (span->size_class < SIZE_CLASS_COUNT) { - // Small/medium block - void *blocks_start = pointer_offset(span, SPAN_HEADER_SIZE); - return span->block_size - - ((size_t)pointer_diff(p, blocks_start) % span->block_size); - } - if (span->size_class == SIZE_CLASS_LARGE) { - // Large block - size_t current_spans = span->span_count; - return (current_spans * _memory_span_size) - (size_t)pointer_diff(p, span); - } - // Oversized block, page count is stored in span_count - size_t current_pages = span->span_count; - return (current_pages * _memory_page_size) - (size_t)pointer_diff(p, span); -} -
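The growth policy in _rpmalloc_reallocate above avoids reallocation hysteresis by overallocating when the requested growth is small: lower_bound = oldsize + oldsize/4 + oldsize/8 = 1.375 * oldsize, so a request growing a block by less than ~37.5% is rounded up to that bound. A compilable sketch of the same arithmetic (function name invented for illustration):

#include <stddef.h>
#include <stdio.h>

/* Mirrors the new_size computation in _rpmalloc_reallocate. */
static size_t grow_size(size_t oldsize, size_t size) {
  size_t lower_bound = oldsize + (oldsize >> 2) + (oldsize >> 3); /* 1.375x */
  return (size > lower_bound) ? size : ((size > oldsize) ? lower_bound : size);
}

int main(void) {
  printf("%zu\n", grow_size(1000, 1100)); /* small growth -> 1375 */
  printf("%zu\n", grow_size(1000, 2000)); /* large growth -> 2000 */
  printf("%zu\n", grow_size(1000, 500));  /* shrink       -> 500  */
  return 0;
}

-//! 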
Adjust and optimize the size class properties for the given class -static void _rpmalloc_adjust_size_class(size_t iclass) { - size_t block_size = _memory_size_class[iclass].block_size; - size_t block_count = (_memory_span_size - SPAN_HEADER_SIZE) / block_size; - - _memory_size_class[iclass].block_count = (uint16_t)block_count; - _memory_size_class[iclass].class_idx = (uint16_t)iclass; - - // Check if previous size classes can be merged - if (iclass >= SMALL_CLASS_COUNT) { - size_t prevclass = iclass; - while (prevclass > 0) { - --prevclass; - // A class can be merged if number of pages and number of blocks are equal - if (_memory_size_class[prevclass].block_count == - _memory_size_class[iclass].block_count) - _rpmalloc_memcpy_const(_memory_size_class + prevclass, - _memory_size_class + iclass, - sizeof(_memory_size_class[iclass])); - else - break; - } - } -} - -//! Initialize the allocator and setup global data -extern inline int rpmalloc_initialize(void) { - if (_rpmalloc_initialized) { - rpmalloc_thread_initialize(); - return 0; - } - return rpmalloc_initialize_config(0); -} - -int rpmalloc_initialize_config(const rpmalloc_config_t *config) { - if (_rpmalloc_initialized) { - rpmalloc_thread_initialize(); - return 0; - } - _rpmalloc_initialized = 1; - - if (config) - memcpy(&_memory_config, config, sizeof(rpmalloc_config_t)); - else - _rpmalloc_memset_const(&_memory_config, 0, sizeof(rpmalloc_config_t)); - - if (!_memory_config.memory_map || !_memory_config.memory_unmap) { - _memory_config.memory_map = _rpmalloc_mmap_os; - _memory_config.memory_unmap = _rpmalloc_unmap_os; - } - -#if PLATFORM_WINDOWS - SYSTEM_INFO system_info; - memset(&system_info, 0, sizeof(system_info)); - GetSystemInfo(&system_info); - _memory_map_granularity = system_info.dwAllocationGranularity; -#else - _memory_map_granularity = (size_t)sysconf(_SC_PAGESIZE); -#endif - -#if RPMALLOC_CONFIGURABLE - _memory_page_size = _memory_config.page_size; -#else - _memory_page_size = 0; -#endif - _memory_huge_pages = 0; - if (!_memory_page_size) { -#if PLATFORM_WINDOWS - _memory_page_size = system_info.dwPageSize; -#else - _memory_page_size = _memory_map_granularity; - if (_memory_config.enable_huge_pages) { -#if defined(__linux__) - size_t huge_page_size = 0; - FILE *meminfo = fopen("/proc/meminfo", "r"); - if (meminfo) { - char line[128]; - while (!huge_page_size && fgets(line, sizeof(line) - 1, meminfo)) { - line[sizeof(line) - 1] = 0; - if (strstr(line, "Hugepagesize:")) - huge_page_size = (size_t)strtol(line + 13, 0, 10) * 1024; - } - fclose(meminfo); - } - if (huge_page_size) { - _memory_huge_pages = 1; - _memory_page_size = huge_page_size; - _memory_map_granularity = huge_page_size; - } -#elif defined(__FreeBSD__) - int rc; - size_t sz = sizeof(rc); - - if (sysctlbyname("vm.pmap.pg_ps_enabled", &rc, &sz, NULL, 0) == 0 && - rc == 1) { - static size_t defsize = 2 * 1024 * 1024; - int nsize = 0; - size_t sizes[4] = {0}; - _memory_huge_pages = 1; - _memory_page_size = defsize; - if ((nsize = getpagesizes(sizes, 4)) >= 2) { - nsize--; - for (size_t csize = sizes[nsize]; nsize >= 0 && csize; - --nsize, csize = sizes[nsize]) { - //! Unlikely, but as a precaution.. 
- rpmalloc_assert(!(csize & (csize - 1)) && !(csize % 1024), - "Invalid page size"); - if (defsize < csize) { - _memory_page_size = csize; - break; - } - } - } - _memory_map_granularity = _memory_page_size; - } -#elif defined(__APPLE__) || defined(__NetBSD__) - _memory_huge_pages = 1; - _memory_page_size = 2 * 1024 * 1024; - _memory_map_granularity = _memory_page_size; -#endif - } -#endif - } else { - if (_memory_config.enable_huge_pages) - _memory_huge_pages = 1; - } - -#if PLATFORM_WINDOWS - if (_memory_config.enable_huge_pages) { - HANDLE token = 0; - size_t large_page_minimum = GetLargePageMinimum(); - if (large_page_minimum) - OpenProcessToken(GetCurrentProcess(), - TOKEN_ADJUST_PRIVILEGES | TOKEN_QUERY, &token); - if (token) { - LUID luid; - if (LookupPrivilegeValue(0, SE_LOCK_MEMORY_NAME, &luid)) { - TOKEN_PRIVILEGES token_privileges; - memset(&token_privileges, 0, sizeof(token_privileges)); - token_privileges.PrivilegeCount = 1; - token_privileges.Privileges[0].Luid = luid; - token_privileges.Privileges[0].Attributes = SE_PRIVILEGE_ENABLED; - if (AdjustTokenPrivileges(token, FALSE, &token_privileges, 0, 0, 0)) { - if (GetLastError() == ERROR_SUCCESS) - _memory_huge_pages = 1; - } - } - CloseHandle(token); - } - if (_memory_huge_pages) { - if (large_page_minimum > _memory_page_size) - _memory_page_size = large_page_minimum; - if (large_page_minimum > _memory_map_granularity) - _memory_map_granularity = large_page_minimum; - } - } -#endif - - size_t min_span_size = 256; - size_t max_page_size; -#if UINTPTR_MAX > 0xFFFFFFFF - max_page_size = 4096ULL * 1024ULL * 1024ULL; -#else - max_page_size = 4 * 1024 * 1024; -#endif - if (_memory_page_size < min_span_size) - _memory_page_size = min_span_size; - if (_memory_page_size > max_page_size) - _memory_page_size = max_page_size; - _memory_page_size_shift = 0; - size_t page_size_bit = _memory_page_size; - while (page_size_bit != 1) { - ++_memory_page_size_shift; - page_size_bit >>= 1; - } - _memory_page_size = ((size_t)1 << _memory_page_size_shift); - -#if RPMALLOC_CONFIGURABLE - if (!_memory_config.span_size) { - _memory_span_size = _memory_default_span_size; - _memory_span_size_shift = _memory_default_span_size_shift; - _memory_span_mask = _memory_default_span_mask; - } else { - size_t span_size = _memory_config.span_size; - if (span_size > (256 * 1024)) - span_size = (256 * 1024); - _memory_span_size = 4096; - _memory_span_size_shift = 12; - while (_memory_span_size < span_size) { - _memory_span_size <<= 1; - ++_memory_span_size_shift; - } - _memory_span_mask = ~(uintptr_t)(_memory_span_size - 1); - } -#endif - - _memory_span_map_count = - (_memory_config.span_map_count ? _memory_config.span_map_count - : DEFAULT_SPAN_MAP_COUNT); - if ((_memory_span_size * _memory_span_map_count) < _memory_page_size) - _memory_span_map_count = (_memory_page_size / _memory_span_size); - if ((_memory_page_size >= _memory_span_size) && - ((_memory_span_map_count * _memory_span_size) % _memory_page_size)) - _memory_span_map_count = (_memory_page_size / _memory_span_size); - _memory_heap_reserve_count = (_memory_span_map_count > DEFAULT_SPAN_MAP_COUNT) - ? 
DEFAULT_SPAN_MAP_COUNT - : _memory_span_map_count; - - _memory_config.page_size = _memory_page_size; - _memory_config.span_size = _memory_span_size; - _memory_config.span_map_count = _memory_span_map_count; - _memory_config.enable_huge_pages = _memory_huge_pages; - -#if ((defined(__APPLE__) || defined(__HAIKU__)) && ENABLE_PRELOAD) || \ - defined(__TINYC__) - if (pthread_key_create(&_memory_thread_heap, _rpmalloc_heap_release_raw_fc)) - return -1; -#endif -#if defined(_WIN32) && (!defined(BUILD_DYNAMIC_LINK) || !BUILD_DYNAMIC_LINK) - fls_key = FlsAlloc(&_rpmalloc_thread_destructor); -#endif - - // Setup all small and medium size classes - size_t iclass = 0; - _memory_size_class[iclass].block_size = SMALL_GRANULARITY; - _rpmalloc_adjust_size_class(iclass); - for (iclass = 1; iclass < SMALL_CLASS_COUNT; ++iclass) { - size_t size = iclass * SMALL_GRANULARITY; - _memory_size_class[iclass].block_size = (uint32_t)size; - _rpmalloc_adjust_size_class(iclass); - } - // At least two blocks per span, then fall back to large allocations - _memory_medium_size_limit = (_memory_span_size - SPAN_HEADER_SIZE) >> 1; - if (_memory_medium_size_limit > MEDIUM_SIZE_LIMIT) - _memory_medium_size_limit = MEDIUM_SIZE_LIMIT; - for (iclass = 0; iclass < MEDIUM_CLASS_COUNT; ++iclass) { - size_t size = SMALL_SIZE_LIMIT + ((iclass + 1) * MEDIUM_GRANULARITY); - if (size > _memory_medium_size_limit) { - _memory_medium_size_limit = - SMALL_SIZE_LIMIT + (iclass * MEDIUM_GRANULARITY); - break; - } - _memory_size_class[SMALL_CLASS_COUNT + iclass].block_size = (uint32_t)size; - _rpmalloc_adjust_size_class(SMALL_CLASS_COUNT + iclass); - } - - _memory_orphan_heaps = 0; -#if RPMALLOC_FIRST_CLASS_HEAPS - _memory_first_class_orphan_heaps = 0; -#endif -#if ENABLE_STATISTICS - atomic_store32(&_memory_active_heaps, 0); - atomic_store32(&_mapped_pages, 0); - _mapped_pages_peak = 0; - atomic_store32(&_master_spans, 0); - atomic_store32(&_mapped_total, 0); - atomic_store32(&_unmapped_total, 0); - atomic_store32(&_mapped_pages_os, 0); - atomic_store32(&_huge_pages_current, 0); - _huge_pages_peak = 0; -#endif - memset(_memory_heaps, 0, sizeof(_memory_heaps)); - atomic_store32_release(&_memory_global_lock, 0); - - rpmalloc_linker_reference(); - - // Initialize this thread - rpmalloc_thread_initialize(); - return 0; -} - -//! 
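rpmalloc_initialize_config above normalizes the configured page size by deriving a shift count and truncating to a power of two, so the rest of the allocator can use shifts and masks instead of division. A standalone sketch of that normalization (same loop shape as the source, helper name invented):

#include <stddef.h>
#include <stdio.h>

/* Derive the page size shift and round the size down to a power of two,
   as done for _memory_page_size and _memory_page_size_shift. */
static size_t round_to_pow2(size_t page_size, size_t *shift_out) {
  size_t shift = 0;
  size_t bit = page_size;
  while (bit != 1) {
    ++shift;
    bit >>= 1;
  }
  *shift_out = shift;
  return (size_t)1 << shift;
}

int main(void) {
  size_t shift;
  printf("%zu\n", round_to_pow2(4096, &shift)); /* 4096, shift 12 */
  printf("%zu\n", round_to_pow2(5000, &shift)); /* 4096: rounded down */
  return 0;
}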
Finalize the allocator -void rpmalloc_finalize(void) { - rpmalloc_thread_finalize(1); - // rpmalloc_dump_statistics(stdout); - - if (_memory_global_reserve) { - atomic_add32(&_memory_global_reserve_master->remaining_spans, - -(int32_t)_memory_global_reserve_count); - _memory_global_reserve_master = 0; - _memory_global_reserve_count = 0; - _memory_global_reserve = 0; - } - atomic_store32_release(&_memory_global_lock, 0); - - // Free all thread caches and fully free spans - for (size_t list_idx = 0; list_idx < HEAP_ARRAY_SIZE; ++list_idx) { - heap_t *heap = _memory_heaps[list_idx]; - while (heap) { - heap_t *next_heap = heap->next_heap; - heap->finalize = 1; - _rpmalloc_heap_global_finalize(heap); - heap = next_heap; - } - } - -#if ENABLE_GLOBAL_CACHE - // Free global caches - for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) - _rpmalloc_global_cache_finalize(&_memory_span_cache[iclass]); -#endif - -#if (defined(__APPLE__) || defined(__HAIKU__)) && ENABLE_PRELOAD - pthread_key_delete(_memory_thread_heap); -#endif -#if defined(_WIN32) && (!defined(BUILD_DYNAMIC_LINK) || !BUILD_DYNAMIC_LINK) - FlsFree(fls_key); - fls_key = 0; -#endif -#if ENABLE_STATISTICS - // If you hit these asserts you probably have memory leaks (perhaps global - // scope data doing dynamic allocations) or double frees in your code - rpmalloc_assert(atomic_load32(&_mapped_pages) == 0, "Memory leak detected"); - rpmalloc_assert(atomic_load32(&_mapped_pages_os) == 0, - "Memory leak detected"); -#endif - - _rpmalloc_initialized = 0; -} - -//! Initialize thread, assign heap -extern inline void rpmalloc_thread_initialize(void) { - if (!get_thread_heap_raw()) { - heap_t *heap = _rpmalloc_heap_allocate(0); - if (heap) { - _rpmalloc_stat_inc(&_memory_active_heaps); - set_thread_heap(heap); -#if defined(_WIN32) && (!defined(BUILD_DYNAMIC_LINK) || !BUILD_DYNAMIC_LINK) - FlsSetValue(fls_key, heap); -#endif - } - } -} - -//! Finalize thread, orphan heap -void rpmalloc_thread_finalize(int release_caches) { - heap_t *heap = get_thread_heap_raw(); - if (heap) - _rpmalloc_heap_release_raw(heap, release_caches); - set_thread_heap(0); -#if defined(_WIN32) && (!defined(BUILD_DYNAMIC_LINK) || !BUILD_DYNAMIC_LINK) - FlsSetValue(fls_key, 0); -#endif -} - -int rpmalloc_is_thread_initialized(void) { - return (get_thread_heap_raw() != 0) ? 
1 : 0; -} - -const rpmalloc_config_t *rpmalloc_config(void) { return &_memory_config; } - -// Extern interface - -extern inline RPMALLOC_ALLOCATOR void *rpmalloc(size_t size) { -#if ENABLE_VALIDATE_ARGS - if (size >= MAX_ALLOC_SIZE) { - errno = EINVAL; - return 0; - } -#endif - heap_t *heap = get_thread_heap(); - return _rpmalloc_allocate(heap, size); -} - -extern inline void rpfree(void *ptr) { _rpmalloc_deallocate(ptr); } - -extern inline RPMALLOC_ALLOCATOR void *rpcalloc(size_t num, size_t size) { - size_t total; -#if ENABLE_VALIDATE_ARGS -#if PLATFORM_WINDOWS - int err = SizeTMult(num, size, &total); - if ((err != S_OK) || (total >= MAX_ALLOC_SIZE)) { - errno = EINVAL; - return 0; - } -#else - int err = __builtin_umull_overflow(num, size, &total); - if (err || (total >= MAX_ALLOC_SIZE)) { - errno = EINVAL; - return 0; - } -#endif -#else - total = num * size; -#endif - heap_t *heap = get_thread_heap(); - void *block = _rpmalloc_allocate(heap, total); - if (block) - memset(block, 0, total); - return block; -} - -extern inline RPMALLOC_ALLOCATOR void *rprealloc(void *ptr, size_t size) { -#if ENABLE_VALIDATE_ARGS - if (size >= MAX_ALLOC_SIZE) { - errno = EINVAL; - return ptr; - } -#endif - heap_t *heap = get_thread_heap(); - return _rpmalloc_reallocate(heap, ptr, size, 0, 0); -} - -extern RPMALLOC_ALLOCATOR void *rpaligned_realloc(void *ptr, size_t alignment, - size_t size, size_t oldsize, - unsigned int flags) { -#if ENABLE_VALIDATE_ARGS - if ((size + alignment < size) || (alignment > _memory_page_size)) { - errno = EINVAL; - return 0; - } -#endif - heap_t *heap = get_thread_heap(); - return _rpmalloc_aligned_reallocate(heap, ptr, alignment, size, oldsize, - flags); -} - -extern RPMALLOC_ALLOCATOR void *rpaligned_alloc(size_t alignment, size_t size) { - heap_t *heap = get_thread_heap(); - return _rpmalloc_aligned_allocate(heap, alignment, size); -} - -extern inline RPMALLOC_ALLOCATOR void * -rpaligned_calloc(size_t alignment, size_t num, size_t size) { - size_t total; -#if ENABLE_VALIDATE_ARGS -#if PLATFORM_WINDOWS - int err = SizeTMult(num, size, &total); - if ((err != S_OK) || (total >= MAX_ALLOC_SIZE)) { - errno = EINVAL; - return 0; - } -#else - int err = __builtin_umull_overflow(num, size, &total); - if (err || (total >= MAX_ALLOC_SIZE)) { - errno = EINVAL; - return 0; - } -#endif -#else - total = num * size; -#endif - void *block = rpaligned_alloc(alignment, total); - if (block) - memset(block, 0, total); - return block; -} - -extern inline RPMALLOC_ALLOCATOR void *rpmemalign(size_t alignment, - size_t size) { - return rpaligned_alloc(alignment, size); -} - -extern inline int rpposix_memalign(void **memptr, size_t alignment, - size_t size) { - if (memptr) - *memptr = rpaligned_alloc(alignment, size); - else - return EINVAL; - return *memptr ? 0 : ENOMEM; -} - -extern inline size_t rpmalloc_usable_size(void *ptr) { - return (ptr ? 
_rpmalloc_usable_size(ptr) : 0); -} - -extern inline void rpmalloc_thread_collect(void) {} - -void rpmalloc_thread_statistics(rpmalloc_thread_statistics_t *stats) { - memset(stats, 0, sizeof(rpmalloc_thread_statistics_t)); - heap_t *heap = get_thread_heap_raw(); - if (!heap) - return; - - for (size_t iclass = 0; iclass < SIZE_CLASS_COUNT; ++iclass) { - size_class_t *size_class = _memory_size_class + iclass; - span_t *span = heap->size_class[iclass].partial_span; - while (span) { - size_t free_count = span->list_size; - size_t block_count = size_class->block_count; - if (span->free_list_limit < block_count) - block_count = span->free_list_limit; - free_count += (block_count - span->used_count); - stats->sizecache += free_count * size_class->block_size; - span = span->next; - } - } - -#if ENABLE_THREAD_CACHE - for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { - span_cache_t *span_cache; - if (!iclass) - span_cache = &heap->span_cache; - else - span_cache = (span_cache_t *)(heap->span_large_cache + (iclass - 1)); - stats->spancache += span_cache->count * (iclass + 1) * _memory_span_size; - } -#endif - - span_t *deferred = (span_t *)atomic_load_ptr(&heap->span_free_deferred); - while (deferred) { - if (deferred->size_class != SIZE_CLASS_HUGE) - stats->spancache += (size_t)deferred->span_count * _memory_span_size; - deferred = (span_t *)deferred->free_list; - } - -#if ENABLE_STATISTICS - stats->thread_to_global = (size_t)atomic_load64(&heap->thread_to_global); - stats->global_to_thread = (size_t)atomic_load64(&heap->global_to_thread); - - for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { - stats->span_use[iclass].current = - (size_t)atomic_load32(&heap->span_use[iclass].current); - stats->span_use[iclass].peak = - (size_t)atomic_load32(&heap->span_use[iclass].high); - stats->span_use[iclass].to_global = - (size_t)atomic_load32(&heap->span_use[iclass].spans_to_global); - stats->span_use[iclass].from_global = - (size_t)atomic_load32(&heap->span_use[iclass].spans_from_global); - stats->span_use[iclass].to_cache = - (size_t)atomic_load32(&heap->span_use[iclass].spans_to_cache); - stats->span_use[iclass].from_cache = - (size_t)atomic_load32(&heap->span_use[iclass].spans_from_cache); - stats->span_use[iclass].to_reserved = - (size_t)atomic_load32(&heap->span_use[iclass].spans_to_reserved); - stats->span_use[iclass].from_reserved = - (size_t)atomic_load32(&heap->span_use[iclass].spans_from_reserved); - stats->span_use[iclass].map_calls = - (size_t)atomic_load32(&heap->span_use[iclass].spans_map_calls); - } - for (size_t iclass = 0; iclass < SIZE_CLASS_COUNT; ++iclass) { - stats->size_use[iclass].alloc_current = - (size_t)atomic_load32(&heap->size_class_use[iclass].alloc_current); - stats->size_use[iclass].alloc_peak = - (size_t)heap->size_class_use[iclass].alloc_peak; - stats->size_use[iclass].alloc_total = - (size_t)atomic_load32(&heap->size_class_use[iclass].alloc_total); - stats->size_use[iclass].free_total = - (size_t)atomic_load32(&heap->size_class_use[iclass].free_total); - stats->size_use[iclass].spans_to_cache = - (size_t)atomic_load32(&heap->size_class_use[iclass].spans_to_cache); - stats->size_use[iclass].spans_from_cache = - (size_t)atomic_load32(&heap->size_class_use[iclass].spans_from_cache); - stats->size_use[iclass].spans_from_reserved = (size_t)atomic_load32( - &heap->size_class_use[iclass].spans_from_reserved); - stats->size_use[iclass].map_calls = - (size_t)atomic_load32(&heap->size_class_use[iclass].spans_map_calls); - } -#endif -} - -void 
rpmalloc_global_statistics(rpmalloc_global_statistics_t *stats) { - memset(stats, 0, sizeof(rpmalloc_global_statistics_t)); -#if ENABLE_STATISTICS - stats->mapped = (size_t)atomic_load32(&_mapped_pages) * _memory_page_size; - stats->mapped_peak = (size_t)_mapped_pages_peak * _memory_page_size; - stats->mapped_total = - (size_t)atomic_load32(&_mapped_total) * _memory_page_size; - stats->unmapped_total = - (size_t)atomic_load32(&_unmapped_total) * _memory_page_size; - stats->huge_alloc = - (size_t)atomic_load32(&_huge_pages_current) * _memory_page_size; - stats->huge_alloc_peak = (size_t)_huge_pages_peak * _memory_page_size; -#endif -#if ENABLE_GLOBAL_CACHE - for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { - global_cache_t *cache = &_memory_span_cache[iclass]; - while (!atomic_cas32_acquire(&cache->lock, 1, 0)) - _rpmalloc_spin(); - uint32_t count = cache->count; -#if ENABLE_UNLIMITED_CACHE - span_t *current_span = cache->overflow; - while (current_span) { - ++count; - current_span = current_span->next; - } -#endif - atomic_store32_release(&cache->lock, 0); - stats->cached += count * (iclass + 1) * _memory_span_size; - } -#endif -} - -#if ENABLE_STATISTICS - -static void _memory_heap_dump_statistics(heap_t *heap, void *file) { - fprintf(file, "Heap %d stats:\n", heap->id); - fprintf(file, "Class CurAlloc PeakAlloc TotAlloc TotFree BlkSize " - "BlkCount SpansCur SpansPeak PeakAllocMiB ToCacheMiB " - "FromCacheMiB FromReserveMiB MmapCalls\n"); - for (size_t iclass = 0; iclass < SIZE_CLASS_COUNT; ++iclass) { - if (!atomic_load32(&heap->size_class_use[iclass].alloc_total)) - continue; - fprintf( - file, - "%3u: %10u %10u %10u %10u %8u %8u %8d %9d %13zu %11zu %12zu %14zu " - "%9u\n", - (uint32_t)iclass, - atomic_load32(&heap->size_class_use[iclass].alloc_current), - heap->size_class_use[iclass].alloc_peak, - atomic_load32(&heap->size_class_use[iclass].alloc_total), - atomic_load32(&heap->size_class_use[iclass].free_total), - _memory_size_class[iclass].block_size, - _memory_size_class[iclass].block_count, - atomic_load32(&heap->size_class_use[iclass].spans_current), - heap->size_class_use[iclass].spans_peak, - ((size_t)heap->size_class_use[iclass].alloc_peak * - (size_t)_memory_size_class[iclass].block_size) / - (size_t)(1024 * 1024), - ((size_t)atomic_load32(&heap->size_class_use[iclass].spans_to_cache) * - _memory_span_size) / - (size_t)(1024 * 1024), - ((size_t)atomic_load32(&heap->size_class_use[iclass].spans_from_cache) * - _memory_span_size) / - (size_t)(1024 * 1024), - ((size_t)atomic_load32( - &heap->size_class_use[iclass].spans_from_reserved) * - _memory_span_size) / - (size_t)(1024 * 1024), - atomic_load32(&heap->size_class_use[iclass].spans_map_calls)); - } - fprintf(file, "Spans Current Peak Deferred PeakMiB Cached ToCacheMiB " - "FromCacheMiB ToReserveMiB FromReserveMiB ToGlobalMiB " - "FromGlobalMiB MmapCalls\n"); - for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { - if (!atomic_load32(&heap->span_use[iclass].high) && - !atomic_load32(&heap->span_use[iclass].spans_map_calls)) - continue; - fprintf( - file, - "%4u: %8d %8u %8u %8zu %7u %11zu %12zu %12zu %14zu %11zu %13zu %10u\n", - (uint32_t)(iclass + 1), atomic_load32(&heap->span_use[iclass].current), - atomic_load32(&heap->span_use[iclass].high), - atomic_load32(&heap->span_use[iclass].spans_deferred), - ((size_t)atomic_load32(&heap->span_use[iclass].high) * - (size_t)_memory_span_size * (iclass + 1)) / - (size_t)(1024 * 1024), -#if ENABLE_THREAD_CACHE - (unsigned int)(!iclass ? 
heap->span_cache.count - : heap->span_large_cache[iclass - 1].count), - ((size_t)atomic_load32(&heap->span_use[iclass].spans_to_cache) * - (iclass + 1) * _memory_span_size) / - (size_t)(1024 * 1024), - ((size_t)atomic_load32(&heap->span_use[iclass].spans_from_cache) * - (iclass + 1) * _memory_span_size) / - (size_t)(1024 * 1024), -#else - 0, (size_t)0, (size_t)0, -#endif - ((size_t)atomic_load32(&heap->span_use[iclass].spans_to_reserved) * - (iclass + 1) * _memory_span_size) / - (size_t)(1024 * 1024), - ((size_t)atomic_load32(&heap->span_use[iclass].spans_from_reserved) * - (iclass + 1) * _memory_span_size) / - (size_t)(1024 * 1024), - ((size_t)atomic_load32(&heap->span_use[iclass].spans_to_global) * - (size_t)_memory_span_size * (iclass + 1)) / - (size_t)(1024 * 1024), - ((size_t)atomic_load32(&heap->span_use[iclass].spans_from_global) * - (size_t)_memory_span_size * (iclass + 1)) / - (size_t)(1024 * 1024), - atomic_load32(&heap->span_use[iclass].spans_map_calls)); - } - fprintf(file, "Full spans: %zu\n", heap->full_span_count); - fprintf(file, "ThreadToGlobalMiB GlobalToThreadMiB\n"); - fprintf( - file, "%17zu %17zu\n", - (size_t)atomic_load64(&heap->thread_to_global) / (size_t)(1024 * 1024), - (size_t)atomic_load64(&heap->global_to_thread) / (size_t)(1024 * 1024)); -} - -#endif - -void rpmalloc_dump_statistics(void *file) { -#if ENABLE_STATISTICS - for (size_t list_idx = 0; list_idx < HEAP_ARRAY_SIZE; ++list_idx) { - heap_t *heap = _memory_heaps[list_idx]; - while (heap) { - int need_dump = 0; - for (size_t iclass = 0; !need_dump && (iclass < SIZE_CLASS_COUNT); - ++iclass) { - if (!atomic_load32(&heap->size_class_use[iclass].alloc_total)) { - rpmalloc_assert( - !atomic_load32(&heap->size_class_use[iclass].free_total), - "Heap statistics counter mismatch"); - rpmalloc_assert( - !atomic_load32(&heap->size_class_use[iclass].spans_map_calls), - "Heap statistics counter mismatch"); - continue; - } - need_dump = 1; - } - for (size_t iclass = 0; !need_dump && (iclass < LARGE_CLASS_COUNT); - ++iclass) { - if (!atomic_load32(&heap->span_use[iclass].high) && - !atomic_load32(&heap->span_use[iclass].spans_map_calls)) - continue; - need_dump = 1; - } - if (need_dump) - _memory_heap_dump_statistics(heap, file); - heap = heap->next_heap; - } - } - fprintf(file, "Global stats:\n"); - size_t huge_current = - (size_t)atomic_load32(&_huge_pages_current) * _memory_page_size; - size_t huge_peak = (size_t)_huge_pages_peak * _memory_page_size; - fprintf(file, "HugeCurrentMiB HugePeakMiB\n"); - fprintf(file, "%14zu %11zu\n", huge_current / (size_t)(1024 * 1024), - huge_peak / (size_t)(1024 * 1024)); - -#if ENABLE_GLOBAL_CACHE - fprintf(file, "GlobalCacheMiB\n"); - for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { - global_cache_t *cache = _memory_span_cache + iclass; - size_t global_cache = (size_t)cache->count * iclass * _memory_span_size; - - size_t global_overflow_cache = 0; - span_t *span = cache->overflow; - while (span) { - global_overflow_cache += iclass * _memory_span_size; - span = span->next; - } - if (global_cache || global_overflow_cache || cache->insert_count || - cache->extract_count) - fprintf(file, - "%4zu: %8zuMiB (%8zuMiB overflow) %14zu insert %14zu extract\n", - iclass + 1, global_cache / (size_t)(1024 * 1024), - global_overflow_cache / (size_t)(1024 * 1024), - cache->insert_count, cache->extract_count); - } -#endif - - size_t mapped = (size_t)atomic_load32(&_mapped_pages) * _memory_page_size; - size_t mapped_os = - (size_t)atomic_load32(&_mapped_pages_os) * 
_memory_page_size; - size_t mapped_peak = (size_t)_mapped_pages_peak * _memory_page_size; - size_t mapped_total = - (size_t)atomic_load32(&_mapped_total) * _memory_page_size; - size_t unmapped_total = - (size_t)atomic_load32(&_unmapped_total) * _memory_page_size; - fprintf( - file, - "MappedMiB MappedOSMiB MappedPeakMiB MappedTotalMiB UnmappedTotalMiB\n"); - fprintf(file, "%9zu %11zu %13zu %14zu %16zu\n", - mapped / (size_t)(1024 * 1024), mapped_os / (size_t)(1024 * 1024), - mapped_peak / (size_t)(1024 * 1024), - mapped_total / (size_t)(1024 * 1024), - unmapped_total / (size_t)(1024 * 1024)); - - fprintf(file, "\n"); -#if 0 - int64_t allocated = atomic_load64(&_allocation_counter); - int64_t deallocated = atomic_load64(&_deallocation_counter); - fprintf(file, "Allocation count: %lli\n", allocated); - fprintf(file, "Deallocation count: %lli\n", deallocated); - fprintf(file, "Current allocations: %lli\n", (allocated - deallocated)); - fprintf(file, "Master spans: %d\n", atomic_load32(&_master_spans)); - fprintf(file, "Dangling master spans: %d\n", atomic_load32(&_unmapped_master_spans)); -#endif -#endif - (void)sizeof(file); -} - -#if RPMALLOC_FIRST_CLASS_HEAPS - -extern inline rpmalloc_heap_t *rpmalloc_heap_acquire(void) { - // Must be a pristine heap from newly mapped memory pages, or else memory - // blocks could already be allocated from the heap which would (wrongly) be - // released when heap is cleared with rpmalloc_heap_free_all(). Also heaps - // guaranteed to be pristine from the dedicated orphan list can be used. - heap_t *heap = _rpmalloc_heap_allocate(1); - rpmalloc_assume(heap != NULL); - heap->owner_thread = 0; - _rpmalloc_stat_inc(&_memory_active_heaps); - return heap; -} - -extern inline void rpmalloc_heap_release(rpmalloc_heap_t *heap) { - if (heap) - _rpmalloc_heap_release(heap, 1, 1); -} - -extern inline RPMALLOC_ALLOCATOR void * -rpmalloc_heap_alloc(rpmalloc_heap_t *heap, size_t size) { -#if ENABLE_VALIDATE_ARGS - if (size >= MAX_ALLOC_SIZE) { - errno = EINVAL; - return 0; - } -#endif - return _rpmalloc_allocate(heap, size); -} - -extern inline RPMALLOC_ALLOCATOR void * -rpmalloc_heap_aligned_alloc(rpmalloc_heap_t *heap, size_t alignment, - size_t size) { -#if ENABLE_VALIDATE_ARGS - if (size >= MAX_ALLOC_SIZE) { - errno = EINVAL; - return 0; - } -#endif - return _rpmalloc_aligned_allocate(heap, alignment, size); -} - -extern inline RPMALLOC_ALLOCATOR void * -rpmalloc_heap_calloc(rpmalloc_heap_t *heap, size_t num, size_t size) { - return rpmalloc_heap_aligned_calloc(heap, 0, num, size); -} - -extern inline RPMALLOC_ALLOCATOR void * -rpmalloc_heap_aligned_calloc(rpmalloc_heap_t *heap, size_t alignment, - size_t num, size_t size) { - size_t total; -#if ENABLE_VALIDATE_ARGS -#if PLATFORM_WINDOWS - int err = SizeTMult(num, size, &total); - if ((err != S_OK) || (total >= MAX_ALLOC_SIZE)) { - errno = EINVAL; - return 0; - } -#else - int err = __builtin_umull_overflow(num, size, &total); - if (err || (total >= MAX_ALLOC_SIZE)) { - errno = EINVAL; - return 0; - } -#endif -#else - total = num * size; -#endif - void *block = _rpmalloc_aligned_allocate(heap, alignment, total); - if (block) - memset(block, 0, total); - return block; -} - -extern inline RPMALLOC_ALLOCATOR void * -rpmalloc_heap_realloc(rpmalloc_heap_t *heap, void *ptr, size_t size, - unsigned int flags) { -#if ENABLE_VALIDATE_ARGS - if (size >= MAX_ALLOC_SIZE) { - errno = EINVAL; - return ptr; - } -#endif - return _rpmalloc_reallocate(heap, ptr, size, 0, flags); -} - -extern inline RPMALLOC_ALLOCATOR void * 
-rpmalloc_heap_aligned_realloc(rpmalloc_heap_t *heap, void *ptr, - size_t alignment, size_t size, - unsigned int flags) { -#if ENABLE_VALIDATE_ARGS - if ((size + alignment < size) || (alignment > _memory_page_size)) { - errno = EINVAL; - return 0; - } -#endif - return _rpmalloc_aligned_reallocate(heap, ptr, alignment, size, 0, flags); -} - -extern inline void rpmalloc_heap_free(rpmalloc_heap_t *heap, void *ptr) { - (void)sizeof(heap); - _rpmalloc_deallocate(ptr); -} - -extern inline void rpmalloc_heap_free_all(rpmalloc_heap_t *heap) { - span_t *span; - span_t *next_span; - - _rpmalloc_heap_cache_adopt_deferred(heap, 0); - - for (size_t iclass = 0; iclass < SIZE_CLASS_COUNT; ++iclass) { - span = heap->size_class[iclass].partial_span; - while (span) { - next_span = span->next; - _rpmalloc_heap_cache_insert(heap, span); - span = next_span; - } - heap->size_class[iclass].partial_span = 0; - span = heap->full_span[iclass]; - while (span) { - next_span = span->next; - _rpmalloc_heap_cache_insert(heap, span); - span = next_span; - } - - span = heap->size_class[iclass].cache; - if (span) - _rpmalloc_heap_cache_insert(heap, span); - heap->size_class[iclass].cache = 0; - } - memset(heap->size_class, 0, sizeof(heap->size_class)); - memset(heap->full_span, 0, sizeof(heap->full_span)); - - span = heap->large_huge_span; - while (span) { - next_span = span->next; - if (UNEXPECTED(span->size_class == SIZE_CLASS_HUGE)) - _rpmalloc_deallocate_huge(span); - else - _rpmalloc_heap_cache_insert(heap, span); - span = next_span; - } - heap->large_huge_span = 0; - heap->full_span_count = 0; - -#if ENABLE_THREAD_CACHE - for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { - span_cache_t *span_cache; - if (!iclass) - span_cache = &heap->span_cache; - else - span_cache = (span_cache_t *)(heap->span_large_cache + (iclass - 1)); - if (!span_cache->count) - continue; -#if ENABLE_GLOBAL_CACHE - _rpmalloc_stat_add64(&heap->thread_to_global, - span_cache->count * (iclass + 1) * _memory_span_size); - _rpmalloc_stat_add(&heap->span_use[iclass].spans_to_global, - span_cache->count); - _rpmalloc_global_cache_insert_spans(span_cache->span, iclass + 1, - span_cache->count); -#else - for (size_t ispan = 0; ispan < span_cache->count; ++ispan) - _rpmalloc_span_unmap(span_cache->span[ispan]); -#endif - span_cache->count = 0; - } -#endif - -#if ENABLE_STATISTICS - for (size_t iclass = 0; iclass < SIZE_CLASS_COUNT; ++iclass) { - atomic_store32(&heap->size_class_use[iclass].alloc_current, 0); - atomic_store32(&heap->size_class_use[iclass].spans_current, 0); - } - for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { - atomic_store32(&heap->span_use[iclass].current, 0); - } -#endif -} - -extern inline void rpmalloc_heap_thread_set_current(rpmalloc_heap_t *heap) { - heap_t *prev_heap = get_thread_heap_raw(); - if (prev_heap != heap) { - set_thread_heap(heap); - if (prev_heap) - rpmalloc_heap_release(prev_heap); - } -} - -extern inline rpmalloc_heap_t *rpmalloc_get_heap_for_ptr(void *ptr) { - // Grab the span, and then the heap from the span - span_t *span = (span_t *)((uintptr_t)ptr & _memory_span_mask); - if (span) { - return span->heap; - } - return 0; -} - -#endif - -#if ENABLE_PRELOAD || ENABLE_OVERRIDE - -#include "malloc.c" - -#endif - -void rpmalloc_linker_reference(void) { (void)sizeof(_rpmalloc_initialized); } +//===---------------------- rpmalloc.c ------------------*- C -*-=============// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This library provides a cross-platform lock free thread caching malloc +// implementation in C11. +// +//===----------------------------------------------------------------------===// + +#include "rpmalloc.h" + +//////////// +/// +/// Build time configurable limits +/// +////// + +#if defined(__clang__) +#pragma clang diagnostic ignored "-Wunused-macros" +#pragma clang diagnostic ignored "-Wunused-function" +#if __has_warning("-Wreserved-identifier") +#pragma clang diagnostic ignored "-Wreserved-identifier" +#endif +#if __has_warning("-Wstatic-in-inline") +#pragma clang diagnostic ignored "-Wstatic-in-inline" +#endif +#elif defined(__GNUC__) +#pragma GCC diagnostic ignored "-Wunused-macros" +#pragma GCC diagnostic ignored "-Wunused-function" +#endif + +#if !defined(__has_builtin) +#define __has_builtin(b) 0 +#endif + +#if defined(__GNUC__) || defined(__clang__) + +#if __has_builtin(__builtin_memcpy_inline) +#define _rpmalloc_memcpy_const(x, y, s) __builtin_memcpy_inline(x, y, s) +#else +#define _rpmalloc_memcpy_const(x, y, s) \ + do { \ + _Static_assert(__builtin_choose_expr(__builtin_constant_p(s), 1, 0), \ + "len must be a constant integer"); \ + memcpy(x, y, s); \ + } while (0) +#endif + +#if __has_builtin(__builtin_memset_inline) +#define _rpmalloc_memset_const(x, y, s) __builtin_memset_inline(x, y, s) +#else +#define _rpmalloc_memset_const(x, y, s) \ + do { \ + _Static_assert(__builtin_choose_expr(__builtin_constant_p(s), 1, 0), \ + "len must be a constant integer"); \ + memset(x, y, s); \ + } while (0) +#endif +#else +#define _rpmalloc_memcpy_const(x, y, s) memcpy(x, y, s) +#define _rpmalloc_memset_const(x, y, s) memset(x, y, s) +#endif + +#if __has_builtin(__builtin_assume) +#define rpmalloc_assume(cond) __builtin_assume(cond) +#elif defined(__GNUC__) +#define rpmalloc_assume(cond) \ + do { \ + if (!__builtin_expect(cond, 0)) \ + __builtin_unreachable(); \ + } while (0) +#elif defined(_MSC_VER) +#define rpmalloc_assume(cond) __assume(cond) +#else +#define rpmalloc_assume(cond) 0 +#endif + +#ifndef HEAP_ARRAY_SIZE +//! Size of heap hashmap +#define HEAP_ARRAY_SIZE 47 +#endif +#ifndef ENABLE_THREAD_CACHE +//! Enable per-thread cache +#define ENABLE_THREAD_CACHE 1 +#endif +#ifndef ENABLE_GLOBAL_CACHE +//! Enable global cache shared between all threads, requires thread cache +#define ENABLE_GLOBAL_CACHE 1 +#endif +#ifndef ENABLE_VALIDATE_ARGS +//! Enable validation of args to public entry points +#define ENABLE_VALIDATE_ARGS 0 +#endif +#ifndef ENABLE_STATISTICS +//! Enable statistics collection +#define ENABLE_STATISTICS 0 +#endif +#ifndef ENABLE_ASSERTS +//! Enable asserts +#define ENABLE_ASSERTS 0 +#endif +#ifndef ENABLE_OVERRIDE +//! Override standard library malloc/free and new/delete entry points +#define ENABLE_OVERRIDE 0 +#endif +#ifndef ENABLE_PRELOAD +//! Support preloading +#define ENABLE_PRELOAD 0 +#endif +#ifndef DISABLE_UNMAP +//! Disable unmapping memory pages (also enables unlimited cache) +#define DISABLE_UNMAP 0 +#endif +#ifndef ENABLE_UNLIMITED_CACHE +//! Enable unlimited global cache (no unmapping until finalization) +#define ENABLE_UNLIMITED_CACHE 0 +#endif +#ifndef ENABLE_ADAPTIVE_THREAD_CACHE +//! Enable adaptive thread cache size based on use heuristics +#define ENABLE_ADAPTIVE_THREAD_CACHE 0 +#endif +#ifndef DEFAULT_SPAN_MAP_COUNT +//! 
Default number of spans to map in call to map more virtual memory (default +//! values yield 4MiB here) +#define DEFAULT_SPAN_MAP_COUNT 64 +#endif +#ifndef GLOBAL_CACHE_MULTIPLIER +//! Multiplier for global cache +#define GLOBAL_CACHE_MULTIPLIER 8 +#endif + +#if DISABLE_UNMAP && !ENABLE_GLOBAL_CACHE +#error Must use global cache if unmap is disabled +#endif + +#if DISABLE_UNMAP +#undef ENABLE_UNLIMITED_CACHE +#define ENABLE_UNLIMITED_CACHE 1 +#endif + +#if !ENABLE_GLOBAL_CACHE +#undef ENABLE_UNLIMITED_CACHE +#define ENABLE_UNLIMITED_CACHE 0 +#endif + +#if !ENABLE_THREAD_CACHE +#undef ENABLE_ADAPTIVE_THREAD_CACHE +#define ENABLE_ADAPTIVE_THREAD_CACHE 0 +#endif + +#if defined(_WIN32) || defined(__WIN32__) || defined(_WIN64) +#define PLATFORM_WINDOWS 1 +#define PLATFORM_POSIX 0 +#else +#define PLATFORM_WINDOWS 0 +#define PLATFORM_POSIX 1 +#endif + +/// Platform and arch specifics +#if defined(_MSC_VER) && !defined(__clang__) +#pragma warning(disable : 5105) +#ifndef FORCEINLINE +#define FORCEINLINE inline __forceinline +#endif +#define _Static_assert static_assert +#else +#ifndef FORCEINLINE +#define FORCEINLINE inline __attribute__((__always_inline__)) +#endif +#endif +#if PLATFORM_WINDOWS +#ifndef WIN32_LEAN_AND_MEAN +#define WIN32_LEAN_AND_MEAN +#endif +#include <windows.h> +#if ENABLE_VALIDATE_ARGS +#include <intsafe.h> +#endif +#else +#include <unistd.h> +#include <stdio.h> +#include <stdlib.h> +#include <time.h> +#if defined(__linux__) || defined(__ANDROID__) +#include <sys/prctl.h> +#if !defined(PR_SET_VMA) +#define PR_SET_VMA 0x53564d41 +#define PR_SET_VMA_ANON_NAME 0 +#endif +#endif +#if defined(__APPLE__) +#include <TargetConditionals.h> +#if !TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR +#include <mach/mach_vm.h> +#include <mach/vm_statistics.h> +#endif +#include <pthread.h> +#endif +#if defined(__HAIKU__) || defined(__TINYC__) +#include <pthread.h> +#endif +#endif + +#include <stdint.h> +#include <string.h> +#include <errno.h> + +#if defined(_WIN32) && (!defined(BUILD_DYNAMIC_LINK) || !BUILD_DYNAMIC_LINK) +#include <fibersapi.h> +static DWORD fls_key; +#endif + +#if PLATFORM_POSIX +#include <sys/mman.h> +#include <sched.h> +#ifdef __FreeBSD__ +#include <sys/sysctl.h> +#define MAP_HUGETLB MAP_ALIGNED_SUPER +#ifndef PROT_MAX +#define PROT_MAX(f) 0 +#endif +#else +#define PROT_MAX(f) 0 +#endif +#ifdef __sun +extern int madvise(caddr_t, size_t, int); +#endif +#ifndef MAP_UNINITIALIZED +#define MAP_UNINITIALIZED 0 +#endif +#endif +#include <errno.h> + +#if ENABLE_ASSERTS +#undef NDEBUG +#if defined(_MSC_VER) && !defined(_DEBUG) +#define _DEBUG +#endif +#include <assert.h> +#define RPMALLOC_TOSTRING_M(x) #x +#define RPMALLOC_TOSTRING(x) RPMALLOC_TOSTRING_M(x) +#define rpmalloc_assert(truth, message) \ + do { \ + if (!(truth)) { \ + if (_memory_config.error_callback) { \ + _memory_config.error_callback(message " (" RPMALLOC_TOSTRING( \ + truth) ") at " __FILE__ ":" RPMALLOC_TOSTRING(__LINE__)); \ + } else { \ + assert((truth) && message); \ + } \ + } \ + } while (0) +#else +#define rpmalloc_assert(truth, message) \ + do { \ + } while (0) +#endif +#if ENABLE_STATISTICS +#include <stdio.h> +#endif + +////// +/// +/// Atomic access abstraction (since MSVC does not do C11 yet) +/// +////// + +#if defined(_MSC_VER) && !defined(__clang__) + +typedef volatile long atomic32_t; +typedef volatile long long atomic64_t; +typedef volatile void *atomicptr_t; + +static FORCEINLINE int32_t atomic_load32(atomic32_t *src) { return *src; } +static FORCEINLINE void atomic_store32(atomic32_t *dst, int32_t val) { + *dst = val; +} +static FORCEINLINE int32_t atomic_incr32(atomic32_t *val) { + return (int32_t)InterlockedIncrement(val); +} +static FORCEINLINE int32_t atomic_decr32(atomic32_t *val) { + return (int32_t)InterlockedDecrement(val); +} +static FORCEINLINE int32_t atomic_add32(atomic32_t *val, int32_t add) { + return (int32_t)InterlockedExchangeAdd(val, add) + add; +} +static FORCEINLINE int atomic_cas32_acquire(atomic32_t *dst, int32_t val, + int32_t ref) { + return (InterlockedCompareExchange(dst, val, ref) == ref) ? 1 : 0; +} +static FORCEINLINE void atomic_store32_release(atomic32_t *dst, int32_t val) { + *dst = val; +} +static FORCEINLINE int64_t atomic_load64(atomic64_t *src) { return *src; } +static FORCEINLINE int64_t atomic_add64(atomic64_t *val, int64_t add) { + return (int64_t)InterlockedExchangeAdd64(val, add) + add; +} +static FORCEINLINE void *atomic_load_ptr(atomicptr_t *src) { + return (void *)*src; +} +static FORCEINLINE void atomic_store_ptr(atomicptr_t *dst, void *val) { + *dst = val; +} +static FORCEINLINE void atomic_store_ptr_release(atomicptr_t *dst, void *val) { + *dst = val; +} +static FORCEINLINE void *atomic_exchange_ptr_acquire(atomicptr_t *dst, + void *val) { + return (void *)InterlockedExchangePointer((void *volatile *)dst, val); +} +static FORCEINLINE int atomic_cas_ptr(atomicptr_t *dst, void *val, void *ref) { + return (InterlockedCompareExchangePointer((void *volatile *)dst, val, ref) == + ref) + ? 1 + : 0; +} + +#define EXPECTED(x) (x) +#define UNEXPECTED(x) (x) + +#else + +#include <stdatomic.h> + +typedef volatile _Atomic(int32_t) atomic32_t; +typedef volatile _Atomic(int64_t) atomic64_t; +typedef volatile _Atomic(void *) atomicptr_t; + +static FORCEINLINE int32_t atomic_load32(atomic32_t *src) { + return atomic_load_explicit(src, memory_order_relaxed); +} +static FORCEINLINE void atomic_store32(atomic32_t *dst, int32_t val) { + atomic_store_explicit(dst, val, memory_order_relaxed); +} +static FORCEINLINE int32_t atomic_incr32(atomic32_t *val) { + return atomic_fetch_add_explicit(val, 1, memory_order_relaxed) + 1; +} +static FORCEINLINE int32_t atomic_decr32(atomic32_t *val) { + return atomic_fetch_add_explicit(val, -1, memory_order_relaxed) - 1; +} +static FORCEINLINE int32_t atomic_add32(atomic32_t *val, int32_t add) { + return atomic_fetch_add_explicit(val, add, memory_order_relaxed) + add; +} +static FORCEINLINE int atomic_cas32_acquire(atomic32_t *dst, int32_t val, + int32_t ref) { + return atomic_compare_exchange_weak_explicit( + dst, &ref, val, memory_order_acquire, memory_order_relaxed); +} +static FORCEINLINE void atomic_store32_release(atomic32_t *dst, int32_t val) { + atomic_store_explicit(dst, val, memory_order_release); +} +static FORCEINLINE int64_t atomic_load64(atomic64_t *val) { + return atomic_load_explicit(val, memory_order_relaxed); +} +static FORCEINLINE int64_t atomic_add64(atomic64_t *val, int64_t add) { + return atomic_fetch_add_explicit(val, add, memory_order_relaxed) + add; +} +static FORCEINLINE void *atomic_load_ptr(atomicptr_t *src) { + return atomic_load_explicit(src, memory_order_relaxed); +} +static FORCEINLINE void atomic_store_ptr(atomicptr_t *dst, void *val) { + atomic_store_explicit(dst, val, memory_order_relaxed); +} +static FORCEINLINE void atomic_store_ptr_release(atomicptr_t *dst, void *val) { + atomic_store_explicit(dst, val, memory_order_release); +} +static FORCEINLINE void *atomic_exchange_ptr_acquire(atomicptr_t *dst, + void *val) { + return atomic_exchange_explicit(dst, val, memory_order_acquire); +} +static FORCEINLINE int atomic_cas_ptr(atomicptr_t *dst, void *val, void *ref) { + return atomic_compare_exchange_weak_explicit( + dst, &ref, val, memory_order_relaxed, memory_order_relaxed); +} + +#define EXPECTED(x) __builtin_expect((x), 1) +#define UNEXPECTED(x) __builtin_expect((x), 
0) + +#endif + +//////////// +/// +/// Statistics related functions (evaluate to nothing when statistics not +/// enabled) +/// +////// + +#if ENABLE_STATISTICS +#define _rpmalloc_stat_inc(counter) atomic_incr32(counter) +#define _rpmalloc_stat_dec(counter) atomic_decr32(counter) +#define _rpmalloc_stat_add(counter, value) \ + atomic_add32(counter, (int32_t)(value)) +#define _rpmalloc_stat_add64(counter, value) \ + atomic_add64(counter, (int64_t)(value)) +#define _rpmalloc_stat_add_peak(counter, value, peak) \ + do { \ + int32_t _cur_count = atomic_add32(counter, (int32_t)(value)); \ + if (_cur_count > (peak)) \ + peak = _cur_count; \ + } while (0) +#define _rpmalloc_stat_sub(counter, value) \ + atomic_add32(counter, -(int32_t)(value)) +#define _rpmalloc_stat_inc_alloc(heap, class_idx) \ + do { \ + int32_t alloc_current = \ + atomic_incr32(&heap->size_class_use[class_idx].alloc_current); \ + if (alloc_current > heap->size_class_use[class_idx].alloc_peak) \ + heap->size_class_use[class_idx].alloc_peak = alloc_current; \ + atomic_incr32(&heap->size_class_use[class_idx].alloc_total); \ + } while (0) +#define _rpmalloc_stat_inc_free(heap, class_idx) \ + do { \ + atomic_decr32(&heap->size_class_use[class_idx].alloc_current); \ + atomic_incr32(&heap->size_class_use[class_idx].free_total); \ + } while (0) +#else +#define _rpmalloc_stat_inc(counter) \ + do { \ + } while (0) +#define _rpmalloc_stat_dec(counter) \ + do { \ + } while (0) +#define _rpmalloc_stat_add(counter, value) \ + do { \ + } while (0) +#define _rpmalloc_stat_add64(counter, value) \ + do { \ + } while (0) +#define _rpmalloc_stat_add_peak(counter, value, peak) \ + do { \ + } while (0) +#define _rpmalloc_stat_sub(counter, value) \ + do { \ + } while (0) +#define _rpmalloc_stat_inc_alloc(heap, class_idx) \ + do { \ + } while (0) +#define _rpmalloc_stat_inc_free(heap, class_idx) \ + do { \ + } while (0) +#endif + +/// +/// Preconfigured limits and sizes +/// + +//! Granularity of a small allocation block (must be power of two) +#define SMALL_GRANULARITY 16 +//! Small granularity shift count +#define SMALL_GRANULARITY_SHIFT 4 +//! Number of small block size classes +#define SMALL_CLASS_COUNT 65 +//! Maximum size of a small block +#define SMALL_SIZE_LIMIT (SMALL_GRANULARITY * (SMALL_CLASS_COUNT - 1)) +//! Granularity of a medium allocation block +#define MEDIUM_GRANULARITY 512 +//! Medium granularity shift count +#define MEDIUM_GRANULARITY_SHIFT 9 +//! Number of medium block size classes +#define MEDIUM_CLASS_COUNT 61 +//! Total number of small + medium size classes +#define SIZE_CLASS_COUNT (SMALL_CLASS_COUNT + MEDIUM_CLASS_COUNT) +//! Number of large block size classes +#define LARGE_CLASS_COUNT 63 +//! Maximum size of a medium block +#define MEDIUM_SIZE_LIMIT \ + (SMALL_SIZE_LIMIT + (MEDIUM_GRANULARITY * MEDIUM_CLASS_COUNT)) +//! Maximum size of a large block +#define LARGE_SIZE_LIMIT \ + ((LARGE_CLASS_COUNT * _memory_span_size) - SPAN_HEADER_SIZE) +//! Size of a span header (must be a multiple of SMALL_GRANULARITY and a power +//! of two) +#define SPAN_HEADER_SIZE 128 +//! Number of spans in thread cache +#define MAX_THREAD_SPAN_CACHE 400 +//! Number of spans to transfer between thread and global cache +#define THREAD_SPAN_CACHE_TRANSFER 64 +//! Number of spans in thread cache for large spans (must be greater than +//! LARGE_CLASS_COUNT / 2) +#define MAX_THREAD_SPAN_LARGE_CACHE 100 +//! 
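Worked example (editor's note): with the defaults above, SMALL_SIZE_LIMIT
+//! works out to 16 * (65 - 1) = 1024 bytes, MEDIUM_SIZE_LIMIT to
+//! 1024 + (512 * 61) = 32256 bytes, and, with the default 64KiB span size,
+//! LARGE_SIZE_LIMIT to (63 * 65536) - 128 = 4128640 bytes.
+
+//! 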
Number of spans to transfer between thread and global cache for large spans
+#define THREAD_SPAN_LARGE_CACHE_TRANSFER 6
+
+_Static_assert((SMALL_GRANULARITY & (SMALL_GRANULARITY - 1)) == 0,
+               "Small granularity must be power of two");
+_Static_assert((SPAN_HEADER_SIZE & (SPAN_HEADER_SIZE - 1)) == 0,
+               "Span header size must be power of two");
+
+#if ENABLE_VALIDATE_ARGS
+//! Maximum allocation size to avoid integer overflow
+#undef MAX_ALLOC_SIZE
+#define MAX_ALLOC_SIZE (((size_t) - 1) - _memory_span_size)
+#endif
+
+#define pointer_offset(ptr, ofs) (void *)((char *)(ptr) + (ptrdiff_t)(ofs))
+#define pointer_diff(first, second) \
+  (ptrdiff_t)((const char *)(first) - (const char *)(second))
+
+#define INVALID_POINTER ((void *)((uintptr_t) - 1))
+
+#define SIZE_CLASS_LARGE SIZE_CLASS_COUNT
+#define SIZE_CLASS_HUGE ((uint32_t) - 1)
+
+////////////
+///
+/// Data types
+///
+//////
+
+//! A memory heap, per thread
+typedef struct heap_t heap_t;
+//! Span of memory pages
+typedef struct span_t span_t;
+//! Span list
+typedef struct span_list_t span_list_t;
+//! Span active data
+typedef struct span_active_t span_active_t;
+//! Size class definition
+typedef struct size_class_t size_class_t;
+//! Global cache
+typedef struct global_cache_t global_cache_t;
+
+//! Flag indicating span is the first (master) span of a split superspan
+#define SPAN_FLAG_MASTER 1U
+//! Flag indicating span is a secondary (sub) span of a split superspan
+#define SPAN_FLAG_SUBSPAN 2U
+//! Flag indicating span has blocks with increased alignment
+#define SPAN_FLAG_ALIGNED_BLOCKS 4U
+//! Flag indicating an unmapped master span
+#define SPAN_FLAG_UNMAPPED_MASTER 8U
+
+#if ENABLE_ADAPTIVE_THREAD_CACHE || ENABLE_STATISTICS
+struct span_use_t {
+  //! Current number of spans used (actually used, not in cache)
+  atomic32_t current;
+  //! High water mark of spans used
+  atomic32_t high;
+#if ENABLE_STATISTICS
+  //! Number of spans in deferred list
+  atomic32_t spans_deferred;
+  //! Number of spans transitioned to global cache
+  atomic32_t spans_to_global;
+  //! Number of spans transitioned from global cache
+  atomic32_t spans_from_global;
+  //! Number of spans transitioned to thread cache
+  atomic32_t spans_to_cache;
+  //! Number of spans transitioned from thread cache
+  atomic32_t spans_from_cache;
+  //! Number of spans transitioned to reserved state
+  atomic32_t spans_to_reserved;
+  //! Number of spans transitioned from reserved state
+  atomic32_t spans_from_reserved;
+  //! Number of raw memory map calls
+  atomic32_t spans_map_calls;
+#endif
+};
+typedef struct span_use_t span_use_t;
+#endif
+
+#if ENABLE_STATISTICS
+struct size_class_use_t {
+  //! Current number of allocations
+  atomic32_t alloc_current;
+  //! Peak number of allocations
+  int32_t alloc_peak;
+  //! Total number of allocations
+  atomic32_t alloc_total;
+  //! Total number of frees
+  atomic32_t free_total;
+  //! Number of spans in use
+  atomic32_t spans_current;
+  //! Peak number of spans in use
+  int32_t spans_peak;
+  //! Number of spans transitioned to cache
+  atomic32_t spans_to_cache;
+  //! Number of spans transitioned from cache
+  atomic32_t spans_from_cache;
+  //! Number of spans transitioned from reserved state
+  atomic32_t spans_from_reserved;
+  //! 
Number of spans mapped
+  atomic32_t spans_map_calls;
+  int32_t unused;
+};
+typedef struct size_class_use_t size_class_use_t;
+#endif
+
+// A span can either represent a single span of memory pages with size declared
+// by span_map_count configuration variable, or a set of spans in a continuous
+// region, a super span. Any reference to the term "span" usually refers to
+// both a single span or a super span. A super span can further be divided into
+// multiple spans (or, likewise, super spans), where the first (super)span is
+// the master and subsequent (super)spans are subspans. The master span keeps
+// track of how many subspans are still alive and mapped in virtual memory, and
+// once all subspans and the master have been unmapped, the entire superspan
+// region is released and unmapped (on Windows for example, the entire
+// superspan range has to be released in the same call to release the virtual
+// memory range, but individual subranges can be decommitted individually to
+// reduce physical memory use).
+struct span_t {
+  //! Free list
+  void *free_list;
+  //! Total block count of size class
+  uint32_t block_count;
+  //! Size class
+  uint32_t size_class;
+  //! Index of last block initialized in free list
+  uint32_t free_list_limit;
+  //! Number of used blocks remaining when in partial state
+  uint32_t used_count;
+  //! Deferred free list
+  atomicptr_t free_list_deferred;
+  //! Size of deferred free list, or list of spans when part of a cache list
+  uint32_t list_size;
+  //! Size of a block
+  uint32_t block_size;
+  //! Flags and counters
+  uint32_t flags;
+  //! Number of spans
+  uint32_t span_count;
+  //! Total span counter for master spans
+  uint32_t total_spans;
+  //! Offset from master span for subspans
+  uint32_t offset_from_master;
+  //! Remaining span counter, for master spans
+  atomic32_t remaining_spans;
+  //! Alignment offset
+  uint32_t align_offset;
+  //! Owning heap
+  heap_t *heap;
+  //! Next span
+  span_t *next;
+  //! Previous span
+  span_t *prev;
+};
+_Static_assert(sizeof(span_t) <= SPAN_HEADER_SIZE, "span size mismatch");
+
+struct span_cache_t {
+  size_t count;
+  span_t *span[MAX_THREAD_SPAN_CACHE];
+};
+typedef struct span_cache_t span_cache_t;
+
+struct span_large_cache_t {
+  size_t count;
+  span_t *span[MAX_THREAD_SPAN_LARGE_CACHE];
+};
+typedef struct span_large_cache_t span_large_cache_t;
+
+struct heap_size_class_t {
+  //! Free list of active span
+  void *free_list;
+  //! Double linked list of partially used spans with free blocks.
+  //  Previous span pointer in head points to tail span of list.
+  span_t *partial_span;
+  //! Early level cache of fully free spans
+  span_t *cache;
+};
+typedef struct heap_size_class_t heap_size_class_t;
+
+// Control structure for a heap, either a thread heap or a first class heap if
+// enabled
+struct heap_t {
+  //! Owning thread ID
+  uintptr_t owner_thread;
+  //! Free lists for each size class
+  heap_size_class_t size_class[SIZE_CLASS_COUNT];
+#if ENABLE_THREAD_CACHE
+  //! Arrays of fully freed spans, single span
+  span_cache_t span_cache;
+#endif
+  //! List of deferred free spans (single linked list)
+  atomicptr_t span_free_deferred;
+  //! Number of full spans
+  size_t full_span_count;
+  //! Mapped but unused spans
+  span_t *span_reserve;
+  //! Master span for mapped but unused spans
+  span_t *span_reserve_master;
+  //! Number of mapped but unused spans
+  uint32_t spans_reserved;
+  //! Child count
+  atomic32_t child_count;
+  //! Next heap in id list
+  heap_t *next_heap;
+  //! 
Next heap in orphan list + heap_t *next_orphan; + //! Heap ID + int32_t id; + //! Finalization state flag + int finalize; + //! Master heap owning the memory pages + heap_t *master_heap; +#if ENABLE_THREAD_CACHE + //! Arrays of fully freed spans, large spans with > 1 span count + span_large_cache_t span_large_cache[LARGE_CLASS_COUNT - 1]; +#endif +#if RPMALLOC_FIRST_CLASS_HEAPS + //! Double linked list of fully utilized spans with free blocks for each size + //! class. + // Previous span pointer in head points to tail span of list. + span_t *full_span[SIZE_CLASS_COUNT]; + //! Double linked list of large and huge spans allocated by this heap + span_t *large_huge_span; +#endif +#if ENABLE_ADAPTIVE_THREAD_CACHE || ENABLE_STATISTICS + //! Current and high water mark of spans used per span count + span_use_t span_use[LARGE_CLASS_COUNT]; +#endif +#if ENABLE_STATISTICS + //! Allocation stats per size class + size_class_use_t size_class_use[SIZE_CLASS_COUNT + 1]; + //! Number of bytes transitioned thread -> global + atomic64_t thread_to_global; + //! Number of bytes transitioned global -> thread + atomic64_t global_to_thread; +#endif +}; + +// Size class for defining a block size bucket +struct size_class_t { + //! Size of blocks in this class + uint32_t block_size; + //! Number of blocks in each chunk + uint16_t block_count; + //! Class index this class is merged with + uint16_t class_idx; +}; +_Static_assert(sizeof(size_class_t) == 8, "Size class size mismatch"); + +struct global_cache_t { + //! Cache lock + atomic32_t lock; + //! Cache count + uint32_t count; +#if ENABLE_STATISTICS + //! Insert count + size_t insert_count; + //! Extract count + size_t extract_count; +#endif + //! Cached spans + span_t *span[GLOBAL_CACHE_MULTIPLIER * MAX_THREAD_SPAN_CACHE]; + //! Unlimited cache overflow + span_t *overflow; +}; + +//////////// +/// +/// Global data +/// +////// + +//! Default span size (64KiB) +#define _memory_default_span_size (64 * 1024) +#define _memory_default_span_size_shift 16 +#define _memory_default_span_mask (~((uintptr_t)(_memory_span_size - 1))) + +//! Initialized flag +static int _rpmalloc_initialized; +//! Main thread ID +static uintptr_t _rpmalloc_main_thread_id; +//! Configuration +static rpmalloc_config_t _memory_config; +//! Memory page size +static size_t _memory_page_size; +//! Shift to divide by page size +static size_t _memory_page_size_shift; +//! Granularity at which memory pages are mapped by OS +static size_t _memory_map_granularity; +#if RPMALLOC_CONFIGURABLE +//! Size of a span of memory pages +static size_t _memory_span_size; +//! Shift to divide by span size +static size_t _memory_span_size_shift; +//! Mask to get to start of a memory span +static uintptr_t _memory_span_mask; +#else +//! Hardwired span size +#define _memory_span_size _memory_default_span_size +#define _memory_span_size_shift _memory_default_span_size_shift +#define _memory_span_mask _memory_default_span_mask +#endif +//! Number of spans to map in each map call +static size_t _memory_span_map_count; +//! Number of spans to keep reserved in each heap +static size_t _memory_heap_reserve_count; +//! Global size classes +static size_class_t _memory_size_class[SIZE_CLASS_COUNT]; +//! Run-time size limit of medium blocks +static size_t _memory_medium_size_limit; +//! Heap ID counter +static atomic32_t _memory_heap_id; +//! Huge page support +static int _memory_huge_pages; +#if ENABLE_GLOBAL_CACHE +//! Global span cache +static global_cache_t _memory_span_cache[LARGE_CLASS_COUNT]; +#endif +//! 
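Note: _memory_span_cache[n] holds spans with span_count == n + 1; the cache
+//! functions below select a bucket as _memory_span_cache[span_count - 1],
+//! and each bucket is guarded by its own lock field (see global_cache_t).
+
+//! 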
Global reserved spans +static span_t *_memory_global_reserve; +//! Global reserved count +static size_t _memory_global_reserve_count; +//! Global reserved master +static span_t *_memory_global_reserve_master; +//! All heaps +static heap_t *_memory_heaps[HEAP_ARRAY_SIZE]; +//! Used to restrict access to mapping memory for huge pages +static atomic32_t _memory_global_lock; +//! Orphaned heaps +static heap_t *_memory_orphan_heaps; +#if RPMALLOC_FIRST_CLASS_HEAPS +//! Orphaned heaps (first class heaps) +static heap_t *_memory_first_class_orphan_heaps; +#endif +#if ENABLE_STATISTICS +//! Allocations counter +static atomic64_t _allocation_counter; +//! Deallocations counter +static atomic64_t _deallocation_counter; +//! Active heap count +static atomic32_t _memory_active_heaps; +//! Number of currently mapped memory pages +static atomic32_t _mapped_pages; +//! Peak number of concurrently mapped memory pages +static int32_t _mapped_pages_peak; +//! Number of mapped master spans +static atomic32_t _master_spans; +//! Number of unmapped dangling master spans +static atomic32_t _unmapped_master_spans; +//! Running counter of total number of mapped memory pages since start +static atomic32_t _mapped_total; +//! Running counter of total number of unmapped memory pages since start +static atomic32_t _unmapped_total; +//! Number of currently mapped memory pages in OS calls +static atomic32_t _mapped_pages_os; +//! Number of currently allocated pages in huge allocations +static atomic32_t _huge_pages_current; +//! Peak number of currently allocated pages in huge allocations +static int32_t _huge_pages_peak; +#endif + +//////////// +/// +/// Thread local heap and ID +/// +////// + +//! Current thread heap +#if ((defined(__APPLE__) || defined(__HAIKU__)) && ENABLE_PRELOAD) || \ + defined(__TINYC__) +static pthread_key_t _memory_thread_heap; +#else +#ifdef _MSC_VER +#define _Thread_local __declspec(thread) +#define TLS_MODEL +#else +#ifndef __HAIKU__ +#define TLS_MODEL __attribute__((tls_model("initial-exec"))) +#else +#define TLS_MODEL +#endif +#if !defined(__clang__) && defined(__GNUC__) +#define _Thread_local __thread +#endif +#endif +static _Thread_local heap_t *_memory_thread_heap TLS_MODEL; +#endif + +static inline heap_t *get_thread_heap_raw(void) { +#if (defined(__APPLE__) || defined(__HAIKU__)) && ENABLE_PRELOAD + return pthread_getspecific(_memory_thread_heap); +#else + return _memory_thread_heap; +#endif +} + +//! Get the current thread heap +static inline heap_t *get_thread_heap(void) { + heap_t *heap = get_thread_heap_raw(); +#if ENABLE_PRELOAD + if (EXPECTED(heap != 0)) + return heap; + rpmalloc_initialize(); + return get_thread_heap_raw(); +#else + return heap; +#endif +} + +//! 
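Illustrative usage (editor's sketch, not a function from this file):
+//! allocation entry points first resolve the calling thread's heap, e.g.
+//!   heap_t *heap = get_thread_heap();
+//! and then allocate from heap->size_class[class_idx] for the size class
+//! that fits the request.
+
+//! 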
Fast thread ID
+static inline uintptr_t get_thread_id(void) {
+#if defined(_WIN32)
+  return (uintptr_t)((void *)NtCurrentTeb());
+#elif (defined(__GNUC__) || defined(__clang__)) && !defined(__CYGWIN__)
+  uintptr_t tid;
+#if defined(__i386__)
+  __asm__("movl %%gs:0, %0" : "=r"(tid) : :);
+#elif defined(__x86_64__)
+#if defined(__MACH__)
+  __asm__("movq %%gs:0, %0" : "=r"(tid) : :);
+#else
+  __asm__("movq %%fs:0, %0" : "=r"(tid) : :);
+#endif
+#elif defined(__arm__)
+  __asm__ volatile("mrc p15, 0, %0, c13, c0, 3" : "=r"(tid));
+#elif defined(__aarch64__)
+#if defined(__MACH__)
+  // tpidr_el0 likely unused, always return 0 on iOS
+  __asm__ volatile("mrs %0, tpidrro_el0" : "=r"(tid));
+#else
+  __asm__ volatile("mrs %0, tpidr_el0" : "=r"(tid));
+#endif
+#else
+#error This platform needs implementation of get_thread_id()
+#endif
+  return tid;
+#else
+#error This platform needs implementation of get_thread_id()
+#endif
+}
+
+//! Set the current thread heap
+static void set_thread_heap(heap_t *heap) {
+#if ((defined(__APPLE__) || defined(__HAIKU__)) && ENABLE_PRELOAD) || \
+    defined(__TINYC__)
+  pthread_setspecific(_memory_thread_heap, heap);
+#else
+  _memory_thread_heap = heap;
+#endif
+  if (heap)
+    heap->owner_thread = get_thread_id();
+}
+
+//! Set main thread ID
+extern void rpmalloc_set_main_thread(void);
+
+void rpmalloc_set_main_thread(void) {
+  _rpmalloc_main_thread_id = get_thread_id();
+}
+
+static void _rpmalloc_spin(void) {
+#if defined(_MSC_VER)
+#if defined(_M_ARM64)
+  __yield();
+#else
+  _mm_pause();
+#endif
+#elif defined(__x86_64__) || defined(__i386__)
+  __asm__ volatile("pause" ::: "memory");
+#elif defined(__aarch64__) || (defined(__arm__) && __ARM_ARCH >= 7)
+  __asm__ volatile("yield" ::: "memory");
+#elif defined(__powerpc__) || defined(__powerpc64__)
+  // Not known to be exercised on these architectures, but kept as a precaution
+  __asm__ volatile("or 27,27,27");
+#elif defined(__sparc__)
+  __asm__ volatile("rd %ccr, %g0 \n\trd %ccr, %g0 \n\trd %ccr, %g0");
+#else
+  struct timespec ts = {0};
+  nanosleep(&ts, 0);
+#endif
+}
+
+#if defined(_WIN32) && (!defined(BUILD_DYNAMIC_LINK) || !BUILD_DYNAMIC_LINK)
+static void NTAPI _rpmalloc_thread_destructor(void *value) {
+#if ENABLE_OVERRIDE
+  // If this is called on the main thread, it means rpmalloc_finalize
+  // has not been called and shutdown is forced (through _exit) or unclean
+  if (get_thread_id() == _rpmalloc_main_thread_id)
+    return;
+#endif
+  if (value)
+    rpmalloc_thread_finalize(1);
+}
+#endif
+
+////////////
+///
+/// Low level memory map/unmap
+///
+//////
+
+static void _rpmalloc_set_name(void *address, size_t size) {
+#if defined(__linux__) || defined(__ANDROID__)
+  const char *name = _memory_huge_pages ? _memory_config.huge_page_name
+                                        : _memory_config.page_name;
+  if (address == MAP_FAILED || !name)
+    return;
+  // If the kernel does not support CONFIG_ANON_VMA_NAME, or if the call fails
+  // (e.g. invalid name), it is basically a no-op.
+  (void)prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME, (uintptr_t)address, size,
+              (uintptr_t)name);
+#else
+  (void)sizeof(size);
+  (void)sizeof(address);
+#endif
+}
+
+//! 
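Editor's note: _memory_config.memory_map/memory_unmap normally point at the
+//! default _rpmalloc_mmap_os/_rpmalloc_unmap_os implementations below, but an
+//! embedder may supply its own hooks with the same signatures, e.g.
+//! (hypothetical names):
+//!   static void *my_map(size_t size, size_t *offset);
+//!   static void my_unmap(void *address, size_t size, size_t offset,
+//!                        size_t release);
+
+//! 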
Map more virtual memory
+// size is number of bytes to map
+// offset receives the offset in bytes from start of mapped region
+// returns address to start of mapped region to use
+static void *_rpmalloc_mmap(size_t size, size_t *offset) {
+  rpmalloc_assert(!(size % _memory_page_size), "Invalid mmap size");
+  rpmalloc_assert(size >= _memory_page_size, "Invalid mmap size");
+  void *address = _memory_config.memory_map(size, offset);
+  if (EXPECTED(address != 0)) {
+    _rpmalloc_stat_add_peak(&_mapped_pages, (size >> _memory_page_size_shift),
+                            _mapped_pages_peak);
+    _rpmalloc_stat_add(&_mapped_total, (size >> _memory_page_size_shift));
+  }
+  return address;
+}
+
+//! Unmap virtual memory
+// address is the memory address to unmap, as returned from _memory_map
+// size is the number of bytes to unmap, which might be less than the full
+// region for a partial unmap
+// offset is the offset in bytes to the actual mapped region, as set by
+// _memory_map
+// release is set to 0 for a partial unmap, or to the size of the entire range
+// for a full unmap
+static void _rpmalloc_unmap(void *address, size_t size, size_t offset,
+                            size_t release) {
+  rpmalloc_assert(!release || (release >= size), "Invalid unmap size");
+  rpmalloc_assert(!release || (release >= _memory_page_size),
+                  "Invalid unmap size");
+  if (release) {
+    rpmalloc_assert(!(release % _memory_page_size), "Invalid unmap size");
+    _rpmalloc_stat_sub(&_mapped_pages, (release >> _memory_page_size_shift));
+    _rpmalloc_stat_add(&_unmapped_total, (release >> _memory_page_size_shift));
+  }
+  _memory_config.memory_unmap(address, size, offset, release);
+}
+
+//! Default implementation to map new pages to virtual memory
+static void *_rpmalloc_mmap_os(size_t size, size_t *offset) {
+  // Either size is a heap (a single page) or a (multiple) span - we only need
+  // to align spans, and only if larger than map granularity
+  size_t padding = ((size >= _memory_span_size) &&
+                    (_memory_span_size > _memory_map_granularity))
+                       ? _memory_span_size
+                       : 0;
+  rpmalloc_assert(size >= _memory_page_size, "Invalid mmap size");
+#if PLATFORM_WINDOWS
+  // Ok to MEM_COMMIT - according to MSDN, "actual physical pages are not
+  // allocated unless/until the virtual addresses are actually accessed"
+  void *ptr = VirtualAlloc(0, size + padding,
+                           (_memory_huge_pages ? MEM_LARGE_PAGES : 0) |
+                               MEM_RESERVE | MEM_COMMIT,
+                           PAGE_READWRITE);
+  if (!ptr) {
+    if (_memory_config.map_fail_callback) {
+      if (_memory_config.map_fail_callback(size + padding))
+        return _rpmalloc_mmap_os(size, offset);
+    } else {
+      rpmalloc_assert(ptr, "Failed to map virtual memory block");
+    }
+    return 0;
+  }
+#else
+  int flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_UNINITIALIZED;
+#if defined(__APPLE__) && !TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR
+  int fd = (int)VM_MAKE_TAG(240U);
+  if (_memory_huge_pages)
+    fd |= VM_FLAGS_SUPERPAGE_SIZE_2MB;
+  void *ptr = mmap(0, size + padding, PROT_READ | PROT_WRITE, flags, fd, 0);
+#elif defined(MAP_HUGETLB)
+  void *ptr = mmap(0, size + padding,
+                   PROT_READ | PROT_WRITE | PROT_MAX(PROT_READ | PROT_WRITE),
+                   (_memory_huge_pages ? 
MAP_HUGETLB : 0) | flags, -1, 0);
+#if defined(MADV_HUGEPAGE)
+  // In some configurations, huge page allocations might fail, so we fall back
+  // to normal allocations and promote the region to a transparent huge page
+  if ((ptr == MAP_FAILED || !ptr) && _memory_huge_pages) {
+    ptr = mmap(0, size + padding, PROT_READ | PROT_WRITE, flags, -1, 0);
+    if (ptr && ptr != MAP_FAILED) {
+      int prm = madvise(ptr, size + padding, MADV_HUGEPAGE);
+      (void)prm;
+      rpmalloc_assert((prm == 0), "Failed to promote the page to THP");
+    }
+  }
+#endif
+  _rpmalloc_set_name(ptr, size + padding);
+#elif defined(MAP_ALIGNED)
+  const size_t align =
+      (sizeof(size_t) * 8) - (size_t)(__builtin_clzl(size - 1));
+  void *ptr =
+      mmap(0, size + padding, PROT_READ | PROT_WRITE,
+           (_memory_huge_pages ? MAP_ALIGNED(align) : 0) | flags, -1, 0);
+#elif defined(MAP_ALIGN)
+  caddr_t base = (_memory_huge_pages ? (caddr_t)(4 << 20) : 0);
+  void *ptr = mmap(base, size + padding, PROT_READ | PROT_WRITE,
+                   (_memory_huge_pages ? MAP_ALIGN : 0) | flags, -1, 0);
+#else
+  void *ptr = mmap(0, size + padding, PROT_READ | PROT_WRITE, flags, -1, 0);
+#endif
+  if ((ptr == MAP_FAILED) || !ptr) {
+    if (_memory_config.map_fail_callback) {
+      if (_memory_config.map_fail_callback(size + padding))
+        return _rpmalloc_mmap_os(size, offset);
+    } else if (errno != ENOMEM) {
+      rpmalloc_assert((ptr != MAP_FAILED) && ptr,
+                      "Failed to map virtual memory block");
+    }
+    return 0;
+  }
+#endif
+  _rpmalloc_stat_add(&_mapped_pages_os,
+                     (int32_t)((size + padding) >> _memory_page_size_shift));
+  if (padding) {
+    size_t final_padding = padding - ((uintptr_t)ptr & ~_memory_span_mask);
+    rpmalloc_assert(final_padding <= _memory_span_size,
+                    "Internal failure in padding");
+    rpmalloc_assert(final_padding <= padding, "Internal failure in padding");
+    rpmalloc_assert(!(final_padding % 8), "Internal failure in padding");
+    ptr = pointer_offset(ptr, final_padding);
+    *offset = final_padding >> 3;
+  }
+  rpmalloc_assert((size < _memory_span_size) ||
+                      !((uintptr_t)ptr & ~_memory_span_mask),
+                  "Internal failure in padding");
+  return ptr;
+}
+
+//! Default implementation to unmap pages from virtual memory
+static void _rpmalloc_unmap_os(void *address, size_t size, size_t offset,
+                               size_t release) {
+  rpmalloc_assert(release || (offset == 0), "Invalid unmap size");
+  rpmalloc_assert(!release || (release >= _memory_page_size),
+                  "Invalid unmap size");
+  rpmalloc_assert(size >= _memory_page_size, "Invalid unmap size");
+  if (release && offset) {
+    offset <<= 3;
+    address = pointer_offset(address, -(int32_t)offset);
+    if ((release >= _memory_span_size) &&
+        (_memory_span_size > _memory_map_granularity)) {
+      // Padding is always one span size
+      release += _memory_span_size;
+    }
+  }
+#if !DISABLE_UNMAP
+#if PLATFORM_WINDOWS
+  if (!VirtualFree(address, release ? 0 : size,
+                   release ? 
MEM_RELEASE : MEM_DECOMMIT)) { + rpmalloc_assert(0, "Failed to unmap virtual memory block"); + } +#else + if (release) { + if (munmap(address, release)) { + rpmalloc_assert(0, "Failed to unmap virtual memory block"); + } + } else { +#if defined(MADV_FREE_REUSABLE) + int ret; + while ((ret = madvise(address, size, MADV_FREE_REUSABLE)) == -1 && + (errno == EAGAIN)) + errno = 0; + if ((ret == -1) && (errno != 0)) { +#elif defined(MADV_DONTNEED) + if (madvise(address, size, MADV_DONTNEED)) { +#elif defined(MADV_PAGEOUT) + if (madvise(address, size, MADV_PAGEOUT)) { +#elif defined(MADV_FREE) + if (madvise(address, size, MADV_FREE)) { +#else + if (posix_madvise(address, size, POSIX_MADV_DONTNEED)) { +#endif + rpmalloc_assert(0, "Failed to madvise virtual memory block as free"); + } + } +#endif +#endif + if (release) + _rpmalloc_stat_sub(&_mapped_pages_os, release >> _memory_page_size_shift); +} + +static void _rpmalloc_span_mark_as_subspan_unless_master(span_t *master, + span_t *subspan, + size_t span_count); + +//! Use global reserved spans to fulfill a memory map request (reserve size must +//! be checked by caller) +static span_t *_rpmalloc_global_get_reserved_spans(size_t span_count) { + span_t *span = _memory_global_reserve; + _rpmalloc_span_mark_as_subspan_unless_master(_memory_global_reserve_master, + span, span_count); + _memory_global_reserve_count -= span_count; + if (_memory_global_reserve_count) + _memory_global_reserve = + (span_t *)pointer_offset(span, span_count << _memory_span_size_shift); + else + _memory_global_reserve = 0; + return span; +} + +//! Store the given spans as global reserve (must only be called from within new +//! heap allocation, not thread safe) +static void _rpmalloc_global_set_reserved_spans(span_t *master, span_t *reserve, + size_t reserve_span_count) { + _memory_global_reserve_master = master; + _memory_global_reserve_count = reserve_span_count; + _memory_global_reserve = reserve; +} + +//////////// +/// +/// Span linked list management +/// +////// + +//! Add a span to double linked list at the head +static void _rpmalloc_span_double_link_list_add(span_t **head, span_t *span) { + if (*head) + (*head)->prev = span; + span->next = *head; + *head = span; +} + +//! Pop head span from double linked list +static void _rpmalloc_span_double_link_list_pop_head(span_t **head, + span_t *span) { + rpmalloc_assert(*head == span, "Linked list corrupted"); + span = *head; + *head = span->next; +} + +//! Remove a span from double linked list +static void _rpmalloc_span_double_link_list_remove(span_t **head, + span_t *span) { + rpmalloc_assert(*head, "Linked list corrupted"); + if (*head == span) { + *head = span->next; + } else { + span_t *next_span = span->next; + span_t *prev_span = span->prev; + prev_span->next = next_span; + if (EXPECTED(next_span != 0)) + next_span->prev = prev_span; + } +} + +//////////// +/// +/// Span control +/// +////// + +static void _rpmalloc_heap_cache_insert(heap_t *heap, span_t *span); + +static void _rpmalloc_heap_finalize(heap_t *heap); + +static void _rpmalloc_heap_set_reserved_spans(heap_t *heap, span_t *master, + span_t *reserve, + size_t reserve_span_count); + +//! Declare the span to be a subspan and store distance from master span and +//! 
span count +static void _rpmalloc_span_mark_as_subspan_unless_master(span_t *master, + span_t *subspan, + size_t span_count) { + rpmalloc_assert((subspan != master) || (subspan->flags & SPAN_FLAG_MASTER), + "Span master pointer and/or flag mismatch"); + if (subspan != master) { + subspan->flags = SPAN_FLAG_SUBSPAN; + subspan->offset_from_master = + (uint32_t)((uintptr_t)pointer_diff(subspan, master) >> + _memory_span_size_shift); + subspan->align_offset = 0; + } + subspan->span_count = (uint32_t)span_count; +} + +//! Use reserved spans to fulfill a memory map request (reserve size must be +//! checked by caller) +static span_t *_rpmalloc_span_map_from_reserve(heap_t *heap, + size_t span_count) { + // Update the heap span reserve + span_t *span = heap->span_reserve; + heap->span_reserve = + (span_t *)pointer_offset(span, span_count * _memory_span_size); + heap->spans_reserved -= (uint32_t)span_count; + + _rpmalloc_span_mark_as_subspan_unless_master(heap->span_reserve_master, span, + span_count); + if (span_count <= LARGE_CLASS_COUNT) + _rpmalloc_stat_inc(&heap->span_use[span_count - 1].spans_from_reserved); + + return span; +} + +//! Get the aligned number of spans to map in based on wanted count, configured +//! mapping granularity and the page size +static size_t _rpmalloc_span_align_count(size_t span_count) { + size_t request_count = (span_count > _memory_span_map_count) + ? span_count + : _memory_span_map_count; + if ((_memory_page_size > _memory_span_size) && + ((request_count * _memory_span_size) % _memory_page_size)) + request_count += + _memory_span_map_count - (request_count % _memory_span_map_count); + return request_count; +} + +//! Setup a newly mapped span +static void _rpmalloc_span_initialize(span_t *span, size_t total_span_count, + size_t span_count, size_t align_offset) { + span->total_spans = (uint32_t)total_span_count; + span->span_count = (uint32_t)span_count; + span->align_offset = (uint32_t)align_offset; + span->flags = SPAN_FLAG_MASTER; + atomic_store32(&span->remaining_spans, (int32_t)total_span_count); +} + +static void _rpmalloc_span_unmap(span_t *span); + +//! Map an aligned set of spans, taking configured mapping granularity and the +//! page size into account +static span_t *_rpmalloc_span_map_aligned_count(heap_t *heap, + size_t span_count) { + // If we already have some, but not enough, reserved spans, release those to + // heap cache and map a new full set of spans. 
Otherwise we would waste memory
+  // if page size > span size (huge pages)
+  size_t aligned_span_count = _rpmalloc_span_align_count(span_count);
+  size_t align_offset = 0;
+  span_t *span = (span_t *)_rpmalloc_mmap(
+      aligned_span_count * _memory_span_size, &align_offset);
+  if (!span)
+    return 0;
+  _rpmalloc_span_initialize(span, aligned_span_count, span_count, align_offset);
+  _rpmalloc_stat_inc(&_master_spans);
+  if (span_count <= LARGE_CLASS_COUNT)
+    _rpmalloc_stat_inc(&heap->span_use[span_count - 1].spans_map_calls);
+  if (aligned_span_count > span_count) {
+    span_t *reserved_spans =
+        (span_t *)pointer_offset(span, span_count * _memory_span_size);
+    size_t reserved_count = aligned_span_count - span_count;
+    if (heap->spans_reserved) {
+      _rpmalloc_span_mark_as_subspan_unless_master(
+          heap->span_reserve_master, heap->span_reserve, heap->spans_reserved);
+      _rpmalloc_heap_cache_insert(heap, heap->span_reserve);
+    }
+    if (reserved_count > _memory_heap_reserve_count) {
+      // If huge pages or eager span map count, the global reserve spin lock is
+      // held by caller, _rpmalloc_span_map
+      rpmalloc_assert(atomic_load32(&_memory_global_lock) == 1,
+                      "Global spin lock not held as expected");
+      size_t remain_count = reserved_count - _memory_heap_reserve_count;
+      reserved_count = _memory_heap_reserve_count;
+      span_t *remain_span = (span_t *)pointer_offset(
+          reserved_spans, reserved_count * _memory_span_size);
+      if (_memory_global_reserve) {
+        _rpmalloc_span_mark_as_subspan_unless_master(
+            _memory_global_reserve_master, _memory_global_reserve,
+            _memory_global_reserve_count);
+        _rpmalloc_span_unmap(_memory_global_reserve);
+      }
+      _rpmalloc_global_set_reserved_spans(span, remain_span, remain_count);
+    }
+    _rpmalloc_heap_set_reserved_spans(heap, span, reserved_spans,
+                                      reserved_count);
+  }
+  return span;
+}
+
+//! Map in memory pages for the given number of spans (or use previously
+//! reserved pages)
+static span_t *_rpmalloc_span_map(heap_t *heap, size_t span_count) {
+  if (span_count <= heap->spans_reserved)
+    return _rpmalloc_span_map_from_reserve(heap, span_count);
+  span_t *span = 0;
+  int use_global_reserve =
+      (_memory_page_size > _memory_span_size) ||
+      (_memory_span_map_count > _memory_heap_reserve_count);
+  if (use_global_reserve) {
+    // If huge pages, make sure only one thread maps more memory to avoid bloat
+    while (!atomic_cas32_acquire(&_memory_global_lock, 1, 0))
+      _rpmalloc_spin();
+    if (_memory_global_reserve_count >= span_count) {
+      size_t reserve_count =
+          (!heap->spans_reserved ? _memory_heap_reserve_count : span_count);
+      if (_memory_global_reserve_count < reserve_count)
+        reserve_count = _memory_global_reserve_count;
+      span = _rpmalloc_global_get_reserved_spans(reserve_count);
+      if (span) {
+        if (reserve_count > span_count) {
+          span_t *reserved_span = (span_t *)pointer_offset(
+              span, span_count << _memory_span_size_shift);
+          _rpmalloc_heap_set_reserved_spans(heap, _memory_global_reserve_master,
+                                            reserved_span,
+                                            reserve_count - span_count);
+        }
+        // Already marked as subspan in _rpmalloc_global_get_reserved_spans
+        span->span_count = (uint32_t)span_count;
+      }
+    }
+  }
+  if (!span)
+    span = _rpmalloc_span_map_aligned_count(heap, span_count);
+  if (use_global_reserve)
+    atomic_store32_release(&_memory_global_lock, 0);
+  return span;
+}
+
+//! Unmap memory pages for the given number of spans (or mark as unused if no
+//! 
partial unmappings) +static void _rpmalloc_span_unmap(span_t *span) { + rpmalloc_assert((span->flags & SPAN_FLAG_MASTER) || + (span->flags & SPAN_FLAG_SUBSPAN), + "Span flag corrupted"); + rpmalloc_assert(!(span->flags & SPAN_FLAG_MASTER) || + !(span->flags & SPAN_FLAG_SUBSPAN), + "Span flag corrupted"); + + int is_master = !!(span->flags & SPAN_FLAG_MASTER); + span_t *master = + is_master ? span + : ((span_t *)pointer_offset( + span, -(intptr_t)((uintptr_t)span->offset_from_master * + _memory_span_size))); + rpmalloc_assert(is_master || (span->flags & SPAN_FLAG_SUBSPAN), + "Span flag corrupted"); + rpmalloc_assert(master->flags & SPAN_FLAG_MASTER, "Span flag corrupted"); + + size_t span_count = span->span_count; + if (!is_master) { + // Directly unmap subspans (unless huge pages, in which case we defer and + // unmap entire page range with master) + rpmalloc_assert(span->align_offset == 0, "Span align offset corrupted"); + if (_memory_span_size >= _memory_page_size) + _rpmalloc_unmap(span, span_count * _memory_span_size, 0, 0); + } else { + // Special double flag to denote an unmapped master + // It must be kept in memory since span header must be used + span->flags |= + SPAN_FLAG_MASTER | SPAN_FLAG_SUBSPAN | SPAN_FLAG_UNMAPPED_MASTER; + _rpmalloc_stat_add(&_unmapped_master_spans, 1); + } + + if (atomic_add32(&master->remaining_spans, -(int32_t)span_count) <= 0) { + // Everything unmapped, unmap the master span with release flag to unmap the + // entire range of the super span + rpmalloc_assert(!!(master->flags & SPAN_FLAG_MASTER) && + !!(master->flags & SPAN_FLAG_SUBSPAN), + "Span flag corrupted"); + size_t unmap_count = master->span_count; + if (_memory_span_size < _memory_page_size) + unmap_count = master->total_spans; + _rpmalloc_stat_sub(&_master_spans, 1); + _rpmalloc_stat_sub(&_unmapped_master_spans, 1); + _rpmalloc_unmap(master, unmap_count * _memory_span_size, + master->align_offset, + (size_t)master->total_spans * _memory_span_size); + } +} + +//! Move the span (used for small or medium allocations) to the heap thread +//! cache +static void _rpmalloc_span_release_to_cache(heap_t *heap, span_t *span) { + rpmalloc_assert(heap == span->heap, "Span heap pointer corrupted"); + rpmalloc_assert(span->size_class < SIZE_CLASS_COUNT, + "Invalid span size class"); + rpmalloc_assert(span->span_count == 1, "Invalid span count"); +#if ENABLE_ADAPTIVE_THREAD_CACHE || ENABLE_STATISTICS + atomic_decr32(&heap->span_use[0].current); +#endif + _rpmalloc_stat_dec(&heap->size_class_use[span->size_class].spans_current); + if (!heap->finalize) { + _rpmalloc_stat_inc(&heap->span_use[0].spans_to_cache); + _rpmalloc_stat_inc(&heap->size_class_use[span->size_class].spans_to_cache); + if (heap->size_class[span->size_class].cache) + _rpmalloc_heap_cache_insert(heap, + heap->size_class[span->size_class].cache); + heap->size_class[span->size_class].cache = span; + } else { + _rpmalloc_span_unmap(span); + } +} + +//! Initialize a (partial) free list up to next system memory page, while +//! 
reserving the first block as allocated, returning number of blocks in list +static uint32_t free_list_partial_init(void **list, void **first_block, + void *page_start, void *block_start, + uint32_t block_count, + uint32_t block_size) { + rpmalloc_assert(block_count, "Internal failure"); + *first_block = block_start; + if (block_count > 1) { + void *free_block = pointer_offset(block_start, block_size); + void *block_end = + pointer_offset(block_start, (size_t)block_size * block_count); + // If block size is less than half a memory page, bound init to next memory + // page boundary + if (block_size < (_memory_page_size >> 1)) { + void *page_end = pointer_offset(page_start, _memory_page_size); + if (page_end < block_end) + block_end = page_end; + } + *list = free_block; + block_count = 2; + void *next_block = pointer_offset(free_block, block_size); + while (next_block < block_end) { + *((void **)free_block) = next_block; + free_block = next_block; + ++block_count; + next_block = pointer_offset(next_block, block_size); + } + *((void **)free_block) = 0; + } else { + *list = 0; + } + return block_count; +} + +//! Initialize an unused span (from cache or mapped) to be new active span, +//! putting the initial free list in heap class free list +static void *_rpmalloc_span_initialize_new(heap_t *heap, + heap_size_class_t *heap_size_class, + span_t *span, uint32_t class_idx) { + rpmalloc_assert(span->span_count == 1, "Internal failure"); + size_class_t *size_class = _memory_size_class + class_idx; + span->size_class = class_idx; + span->heap = heap; + span->flags &= ~SPAN_FLAG_ALIGNED_BLOCKS; + span->block_size = size_class->block_size; + span->block_count = size_class->block_count; + span->free_list = 0; + span->list_size = 0; + atomic_store_ptr_release(&span->free_list_deferred, 0); + + // Setup free list. 
Only initialize one system page worth of free blocks in
+  // list
+  void *block;
+  span->free_list_limit =
+      free_list_partial_init(&heap_size_class->free_list, &block, span,
+                             pointer_offset(span, SPAN_HEADER_SIZE),
+                             size_class->block_count, size_class->block_size);
+  // Link span as partial if blocks remain to be initialized as free list,
+  // or full if fully initialized
+  if (span->free_list_limit < span->block_count) {
+    _rpmalloc_span_double_link_list_add(&heap_size_class->partial_span, span);
+    span->used_count = span->free_list_limit;
+  } else {
+#if RPMALLOC_FIRST_CLASS_HEAPS
+    _rpmalloc_span_double_link_list_add(&heap->full_span[class_idx], span);
+#endif
+    ++heap->full_span_count;
+    span->used_count = span->block_count;
+  }
+  return block;
+}
+
+static void _rpmalloc_span_extract_free_list_deferred(span_t *span) {
+  // We need acquire semantics on the CAS operation since we are interested
+  // in the list size. Refer to _rpmalloc_deallocate_defer_small_or_medium
+  // for further comments on this dependency.
+  do {
+    span->free_list =
+        atomic_exchange_ptr_acquire(&span->free_list_deferred, INVALID_POINTER);
+  } while (span->free_list == INVALID_POINTER);
+  span->used_count -= span->list_size;
+  span->list_size = 0;
+  atomic_store_ptr_release(&span->free_list_deferred, 0);
+}
+
+static int _rpmalloc_span_is_fully_utilized(span_t *span) {
+  rpmalloc_assert(span->free_list_limit <= span->block_count,
+                  "Span free list corrupted");
+  return !span->free_list && (span->free_list_limit >= span->block_count);
+}
+
+static int _rpmalloc_span_finalize(heap_t *heap, size_t iclass, span_t *span,
+                                   span_t **list_head) {
+  void *free_list = heap->size_class[iclass].free_list;
+  span_t *class_span = (span_t *)((uintptr_t)free_list & _memory_span_mask);
+  if (span == class_span) {
+    // Adopt the heap class free list back into the span free list
+    void *block = span->free_list;
+    void *last_block = 0;
+    while (block) {
+      last_block = block;
+      block = *((void **)block);
+    }
+    uint32_t free_count = 0;
+    block = free_list;
+    while (block) {
+      ++free_count;
+      block = *((void **)block);
+    }
+    if (last_block) {
+      *((void **)last_block) = free_list;
+    } else {
+      span->free_list = free_list;
+    }
+    heap->size_class[iclass].free_list = 0;
+    span->used_count -= free_count;
+  }
+  // If this assert triggers you have memory leaks
+  rpmalloc_assert(span->list_size == span->used_count, "Memory leak detected");
+  if (span->list_size == span->used_count) {
+    _rpmalloc_stat_dec(&heap->span_use[0].current);
+    _rpmalloc_stat_dec(&heap->size_class_use[iclass].spans_current);
+    // This function only used for spans in double linked lists
+    if (list_head)
+      _rpmalloc_span_double_link_list_remove(list_head, span);
+    _rpmalloc_span_unmap(span);
+    return 1;
+  }
+  return 0;
+}
+
+////////////
+///
+/// Global cache
+///
+//////
+
+#if ENABLE_GLOBAL_CACHE
+
+//! Finalize a global cache
+static void _rpmalloc_global_cache_finalize(global_cache_t *cache) {
+  while (!atomic_cas32_acquire(&cache->lock, 1, 0))
+    _rpmalloc_spin();
+
+  for (size_t ispan = 0; ispan < cache->count; ++ispan)
+    _rpmalloc_span_unmap(cache->span[ispan]);
+  cache->count = 0;
+
+  while (cache->overflow) {
+    span_t *span = cache->overflow;
+    cache->overflow = span->next;
+    _rpmalloc_span_unmap(span);
+  }
+
+  atomic_store32_release(&cache->lock, 0);
+}
+
+static void _rpmalloc_global_cache_insert_spans(span_t **span,
+                                                size_t span_count,
+                                                size_t count) {
+  const size_t cache_limit =
+      (span_count == 1) ? 
GLOBAL_CACHE_MULTIPLIER * MAX_THREAD_SPAN_CACHE
+                        : GLOBAL_CACHE_MULTIPLIER *
+                              (MAX_THREAD_SPAN_LARGE_CACHE - (span_count >> 1));
+
+  global_cache_t *cache = &_memory_span_cache[span_count - 1];
+
+  size_t insert_count = count;
+  while (!atomic_cas32_acquire(&cache->lock, 1, 0))
+    _rpmalloc_spin();
+
+#if ENABLE_STATISTICS
+  cache->insert_count += count;
+#endif
+  if ((cache->count + insert_count) > cache_limit)
+    insert_count = cache_limit - cache->count;
+
+  memcpy(cache->span + cache->count, span, sizeof(span_t *) * insert_count);
+  cache->count += (uint32_t)insert_count;
+
+#if ENABLE_UNLIMITED_CACHE
+  while (insert_count < count) {
+#else
+  // Enable unlimited cache if huge pages, or we will leak since it is unlikely
+  // that an entire huge page will be unmapped, and we're unable to partially
+  // decommit a huge page
+  while ((_memory_page_size > _memory_span_size) && (insert_count < count)) {
+#endif
+    span_t *current_span = span[insert_count++];
+    current_span->next = cache->overflow;
+    cache->overflow = current_span;
+  }
+  atomic_store32_release(&cache->lock, 0);
+
+  span_t *keep = 0;
+  for (size_t ispan = insert_count; ispan < count; ++ispan) {
+    span_t *current_span = span[ispan];
+    // Keep master spans that have remaining subspans to avoid dangling them
+    if ((current_span->flags & SPAN_FLAG_MASTER) &&
+        (atomic_load32(&current_span->remaining_spans) >
+         (int32_t)current_span->span_count)) {
+      current_span->next = keep;
+      keep = current_span;
+    } else {
+      _rpmalloc_span_unmap(current_span);
+    }
+  }
+
+  if (keep) {
+    while (!atomic_cas32_acquire(&cache->lock, 1, 0))
+      _rpmalloc_spin();
+
+    size_t islot = 0;
+    while (keep) {
+      for (; islot < cache->count; ++islot) {
+        span_t *current_span = cache->span[islot];
+        if (!(current_span->flags & SPAN_FLAG_MASTER) ||
+            ((current_span->flags & SPAN_FLAG_MASTER) &&
+             (atomic_load32(&current_span->remaining_spans) <=
+              (int32_t)current_span->span_count))) {
+          _rpmalloc_span_unmap(current_span);
+          cache->span[islot] = keep;
+          break;
+        }
+      }
+      if (islot == cache->count)
+        break;
+      keep = keep->next;
+    }
+
+    if (keep) {
+      span_t *tail = keep;
+      while (tail->next)
+        tail = tail->next;
+      tail->next = cache->overflow;
+      cache->overflow = keep;
+    }
+
+    atomic_store32_release(&cache->lock, 0);
+  }
+}
+
+static size_t _rpmalloc_global_cache_extract_spans(span_t **span,
+                                                   size_t span_count,
+                                                   size_t count) {
+  global_cache_t *cache = &_memory_span_cache[span_count - 1];
+
+  size_t extract_count = 0;
+  while (!atomic_cas32_acquire(&cache->lock, 1, 0))
+    _rpmalloc_spin();
+
+#if ENABLE_STATISTICS
+  cache->extract_count += count;
+#endif
+  size_t want = count - extract_count;
+  if (want > cache->count)
+    want = cache->count;
+
+  memcpy(span + extract_count, cache->span + (cache->count - want),
+         sizeof(span_t *) * want);
+  cache->count -= (uint32_t)want;
+  extract_count += want;
+
+  while ((extract_count < count) && cache->overflow) {
+    span_t *current_span = cache->overflow;
+    span[extract_count++] = current_span;
+    cache->overflow = current_span->next;
+  }
+
+#if ENABLE_ASSERTS
+  for (size_t ispan = 0; ispan < extract_count; ++ispan) {
+    rpmalloc_assert(span[ispan]->span_count == span_count,
+                    "Global cache span count mismatch");
+  }
+#endif
+
+  atomic_store32_release(&cache->lock, 0);
+
+  return extract_count;
+}
+
+#endif
+
+////////////
+///
+/// Heap control
+///
+//////
+
+static void _rpmalloc_deallocate_huge(span_t *);
+
+//! 
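Editor's note: the heap layer below exchanges spans with the global cache in
+//! fixed batches (THREAD_SPAN_CACHE_TRANSFER spans for single-span caches,
+//! THREAD_SPAN_LARGE_CACHE_TRANSFER for large spans), so one global cache
+//! access refills or drains several spans at a time.
+
+//! 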
Store the given spans as reserve in the given heap +static void _rpmalloc_heap_set_reserved_spans(heap_t *heap, span_t *master, + span_t *reserve, + size_t reserve_span_count) { + heap->span_reserve_master = master; + heap->span_reserve = reserve; + heap->spans_reserved = (uint32_t)reserve_span_count; +} + +//! Adopt the deferred span cache list, optionally extracting the first single +//! span for immediate re-use +static void _rpmalloc_heap_cache_adopt_deferred(heap_t *heap, + span_t **single_span) { + span_t *span = (span_t *)((void *)atomic_exchange_ptr_acquire( + &heap->span_free_deferred, 0)); + while (span) { + span_t *next_span = (span_t *)span->free_list; + rpmalloc_assert(span->heap == heap, "Span heap pointer corrupted"); + if (EXPECTED(span->size_class < SIZE_CLASS_COUNT)) { + rpmalloc_assert(heap->full_span_count, "Heap span counter corrupted"); + --heap->full_span_count; + _rpmalloc_stat_dec(&heap->span_use[0].spans_deferred); +#if RPMALLOC_FIRST_CLASS_HEAPS + _rpmalloc_span_double_link_list_remove(&heap->full_span[span->size_class], + span); +#endif + _rpmalloc_stat_dec(&heap->span_use[0].current); + _rpmalloc_stat_dec(&heap->size_class_use[span->size_class].spans_current); + if (single_span && !*single_span) + *single_span = span; + else + _rpmalloc_heap_cache_insert(heap, span); + } else { + if (span->size_class == SIZE_CLASS_HUGE) { + _rpmalloc_deallocate_huge(span); + } else { + rpmalloc_assert(span->size_class == SIZE_CLASS_LARGE, + "Span size class invalid"); + rpmalloc_assert(heap->full_span_count, "Heap span counter corrupted"); + --heap->full_span_count; +#if RPMALLOC_FIRST_CLASS_HEAPS + _rpmalloc_span_double_link_list_remove(&heap->large_huge_span, span); +#endif + uint32_t idx = span->span_count - 1; + _rpmalloc_stat_dec(&heap->span_use[idx].spans_deferred); + _rpmalloc_stat_dec(&heap->span_use[idx].current); + if (!idx && single_span && !*single_span) + *single_span = span; + else + _rpmalloc_heap_cache_insert(heap, span); + } + } + span = next_span; + } +} + +static void _rpmalloc_heap_unmap(heap_t *heap) { + if (!heap->master_heap) { + if ((heap->finalize > 1) && !atomic_load32(&heap->child_count)) { + span_t *span = (span_t *)((uintptr_t)heap & _memory_span_mask); + _rpmalloc_span_unmap(span); + } + } else { + if (atomic_decr32(&heap->master_heap->child_count) == 0) { + _rpmalloc_heap_unmap(heap->master_heap); + } + } +} + +static void _rpmalloc_heap_global_finalize(heap_t *heap) { + if (heap->finalize++ > 1) { + --heap->finalize; + return; + } + + _rpmalloc_heap_finalize(heap); + +#if ENABLE_THREAD_CACHE + for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { + span_cache_t *span_cache; + if (!iclass) + span_cache = &heap->span_cache; + else + span_cache = (span_cache_t *)(heap->span_large_cache + (iclass - 1)); + for (size_t ispan = 0; ispan < span_cache->count; ++ispan) + _rpmalloc_span_unmap(span_cache->span[ispan]); + span_cache->count = 0; + } +#endif + + if (heap->full_span_count) { + --heap->finalize; + return; + } + + for (size_t iclass = 0; iclass < SIZE_CLASS_COUNT; ++iclass) { + if (heap->size_class[iclass].free_list || + heap->size_class[iclass].partial_span) { + --heap->finalize; + return; + } + } + // Heap is now completely free, unmap and remove from heap list + size_t list_idx = (size_t)heap->id % HEAP_ARRAY_SIZE; + heap_t *list_heap = _memory_heaps[list_idx]; + if (list_heap == heap) { + _memory_heaps[list_idx] = heap->next_heap; + } else { + while (list_heap->next_heap != heap) + list_heap = list_heap->next_heap; + 
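// Unlink this heap from the chain in its heap ID map bucket
+    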
list_heap->next_heap = heap->next_heap; + } + + _rpmalloc_heap_unmap(heap); +} + +//! Insert a single span into thread heap cache, releasing to global cache if +//! overflow +static void _rpmalloc_heap_cache_insert(heap_t *heap, span_t *span) { + if (UNEXPECTED(heap->finalize != 0)) { + _rpmalloc_span_unmap(span); + _rpmalloc_heap_global_finalize(heap); + return; + } +#if ENABLE_THREAD_CACHE + size_t span_count = span->span_count; + _rpmalloc_stat_inc(&heap->span_use[span_count - 1].spans_to_cache); + if (span_count == 1) { + span_cache_t *span_cache = &heap->span_cache; + span_cache->span[span_cache->count++] = span; + if (span_cache->count == MAX_THREAD_SPAN_CACHE) { + const size_t remain_count = + MAX_THREAD_SPAN_CACHE - THREAD_SPAN_CACHE_TRANSFER; +#if ENABLE_GLOBAL_CACHE + _rpmalloc_stat_add64(&heap->thread_to_global, + THREAD_SPAN_CACHE_TRANSFER * _memory_span_size); + _rpmalloc_stat_add(&heap->span_use[span_count - 1].spans_to_global, + THREAD_SPAN_CACHE_TRANSFER); + _rpmalloc_global_cache_insert_spans(span_cache->span + remain_count, + span_count, + THREAD_SPAN_CACHE_TRANSFER); +#else + for (size_t ispan = 0; ispan < THREAD_SPAN_CACHE_TRANSFER; ++ispan) + _rpmalloc_span_unmap(span_cache->span[remain_count + ispan]); +#endif + span_cache->count = remain_count; + } + } else { + size_t cache_idx = span_count - 2; + span_large_cache_t *span_cache = heap->span_large_cache + cache_idx; + span_cache->span[span_cache->count++] = span; + const size_t cache_limit = + (MAX_THREAD_SPAN_LARGE_CACHE - (span_count >> 1)); + if (span_cache->count == cache_limit) { + const size_t transfer_limit = 2 + (cache_limit >> 2); + const size_t transfer_count = + (THREAD_SPAN_LARGE_CACHE_TRANSFER <= transfer_limit + ? THREAD_SPAN_LARGE_CACHE_TRANSFER + : transfer_limit); + const size_t remain_count = cache_limit - transfer_count; +#if ENABLE_GLOBAL_CACHE + _rpmalloc_stat_add64(&heap->thread_to_global, + transfer_count * span_count * _memory_span_size); + _rpmalloc_stat_add(&heap->span_use[span_count - 1].spans_to_global, + transfer_count); + _rpmalloc_global_cache_insert_spans(span_cache->span + remain_count, + span_count, transfer_count); +#else + for (size_t ispan = 0; ispan < transfer_count; ++ispan) + _rpmalloc_span_unmap(span_cache->span[remain_count + ispan]); +#endif + span_cache->count = remain_count; + } + } +#else + (void)sizeof(heap); + _rpmalloc_span_unmap(span); +#endif +} + +//! Extract the given number of spans from the different cache levels +static span_t *_rpmalloc_heap_thread_cache_extract(heap_t *heap, + size_t span_count) { + span_t *span = 0; +#if ENABLE_THREAD_CACHE + span_cache_t *span_cache; + if (span_count == 1) + span_cache = &heap->span_cache; + else + span_cache = (span_cache_t *)(heap->span_large_cache + (span_count - 2)); + if (span_cache->count) { + _rpmalloc_stat_inc(&heap->span_use[span_count - 1].spans_from_cache); + return span_cache->span[--span_cache->count]; + } +#endif + return span; +} + +static span_t *_rpmalloc_heap_thread_cache_deferred_extract(heap_t *heap, + size_t span_count) { + span_t *span = 0; + if (span_count == 1) { + _rpmalloc_heap_cache_adopt_deferred(heap, &span); + } else { + _rpmalloc_heap_cache_adopt_deferred(heap, 0); + span = _rpmalloc_heap_thread_cache_extract(heap, span_count); + } + return span; +} + +static span_t *_rpmalloc_heap_reserved_extract(heap_t *heap, + size_t span_count) { + if (heap->spans_reserved >= span_count) + return _rpmalloc_span_map(heap, span_count); + return 0; +} + +//! 
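Editor's note on lookup order: _rpmalloc_heap_extract_new_span further below
+//! tries the thread cache, then the deferred free list, then the global
+//! cache (via the function that follows), then reserved spans, and only then
+//! maps new memory, allowing up to 50% extra spans to improve cache hits.
+
+//! 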
Extract a span from the global cache +static span_t *_rpmalloc_heap_global_cache_extract(heap_t *heap, + size_t span_count) { +#if ENABLE_GLOBAL_CACHE +#if ENABLE_THREAD_CACHE + span_cache_t *span_cache; + size_t wanted_count; + if (span_count == 1) { + span_cache = &heap->span_cache; + wanted_count = THREAD_SPAN_CACHE_TRANSFER; + } else { + span_cache = (span_cache_t *)(heap->span_large_cache + (span_count - 2)); + wanted_count = THREAD_SPAN_LARGE_CACHE_TRANSFER; + } + span_cache->count = _rpmalloc_global_cache_extract_spans( + span_cache->span, span_count, wanted_count); + if (span_cache->count) { + _rpmalloc_stat_add64(&heap->global_to_thread, + span_count * span_cache->count * _memory_span_size); + _rpmalloc_stat_add(&heap->span_use[span_count - 1].spans_from_global, + span_cache->count); + return span_cache->span[--span_cache->count]; + } +#else + span_t *span = 0; + size_t count = _rpmalloc_global_cache_extract_spans(&span, span_count, 1); + if (count) { + _rpmalloc_stat_add64(&heap->global_to_thread, + span_count * count * _memory_span_size); + _rpmalloc_stat_add(&heap->span_use[span_count - 1].spans_from_global, + count); + return span; + } +#endif +#endif + (void)sizeof(heap); + (void)sizeof(span_count); + return 0; +} + +static void _rpmalloc_inc_span_statistics(heap_t *heap, size_t span_count, + uint32_t class_idx) { + (void)sizeof(heap); + (void)sizeof(span_count); + (void)sizeof(class_idx); +#if ENABLE_ADAPTIVE_THREAD_CACHE || ENABLE_STATISTICS + uint32_t idx = (uint32_t)span_count - 1; + uint32_t current_count = + (uint32_t)atomic_incr32(&heap->span_use[idx].current); + if (current_count > (uint32_t)atomic_load32(&heap->span_use[idx].high)) + atomic_store32(&heap->span_use[idx].high, (int32_t)current_count); + _rpmalloc_stat_add_peak(&heap->size_class_use[class_idx].spans_current, 1, + heap->size_class_use[class_idx].spans_peak); +#endif +} + +//! Get a span from one of the cache levels (thread cache, reserved, global +//! cache) or fallback to mapping more memory +static span_t * +_rpmalloc_heap_extract_new_span(heap_t *heap, + heap_size_class_t *heap_size_class, + size_t span_count, uint32_t class_idx) { + span_t *span; +#if ENABLE_THREAD_CACHE + if (heap_size_class && heap_size_class->cache) { + span = heap_size_class->cache; + heap_size_class->cache = + (heap->span_cache.count + ? heap->span_cache.span[--heap->span_cache.count] + : 0); + _rpmalloc_inc_span_statistics(heap, span_count, class_idx); + return span; + } +#endif + (void)sizeof(class_idx); + // Allow 50% overhead to increase cache hits + size_t base_span_count = span_count; + size_t limit_span_count = + (span_count > 2) ? 
(span_count + (span_count >> 1)) : span_count;
+  if (limit_span_count > LARGE_CLASS_COUNT)
+    limit_span_count = LARGE_CLASS_COUNT;
+  do {
+    span = _rpmalloc_heap_thread_cache_extract(heap, span_count);
+    if (EXPECTED(span != 0)) {
+      _rpmalloc_stat_inc(&heap->size_class_use[class_idx].spans_from_cache);
+      _rpmalloc_inc_span_statistics(heap, span_count, class_idx);
+      return span;
+    }
+    span = _rpmalloc_heap_thread_cache_deferred_extract(heap, span_count);
+    if (EXPECTED(span != 0)) {
+      _rpmalloc_stat_inc(&heap->size_class_use[class_idx].spans_from_cache);
+      _rpmalloc_inc_span_statistics(heap, span_count, class_idx);
+      return span;
+    }
+    span = _rpmalloc_heap_global_cache_extract(heap, span_count);
+    if (EXPECTED(span != 0)) {
+      _rpmalloc_stat_inc(&heap->size_class_use[class_idx].spans_from_cache);
+      _rpmalloc_inc_span_statistics(heap, span_count, class_idx);
+      return span;
+    }
+    span = _rpmalloc_heap_reserved_extract(heap, span_count);
+    if (EXPECTED(span != 0)) {
+      _rpmalloc_stat_inc(&heap->size_class_use[class_idx].spans_from_reserved);
+      _rpmalloc_inc_span_statistics(heap, span_count, class_idx);
+      return span;
+    }
+    ++span_count;
+  } while (span_count <= limit_span_count);
+  // Final fallback, map in more virtual memory
+  span = _rpmalloc_span_map(heap, base_span_count);
+  _rpmalloc_inc_span_statistics(heap, base_span_count, class_idx);
+  _rpmalloc_stat_inc(&heap->size_class_use[class_idx].spans_map_calls);
+  return span;
+}
+
+static void _rpmalloc_heap_initialize(heap_t *heap) {
+  _rpmalloc_memset_const(heap, 0, sizeof(heap_t));
+  // Get a new heap ID
+  heap->id = 1 + atomic_incr32(&_memory_heap_id);
+
+  // Link in heap in heap ID map
+  size_t list_idx = (size_t)heap->id % HEAP_ARRAY_SIZE;
+  heap->next_heap = _memory_heaps[list_idx];
+  _memory_heaps[list_idx] = heap;
+}
+
+static void _rpmalloc_heap_orphan(heap_t *heap, int first_class) {
+  heap->owner_thread = (uintptr_t)-1;
+#if RPMALLOC_FIRST_CLASS_HEAPS
+  heap_t **heap_list =
+      (first_class ? &_memory_first_class_orphan_heaps : &_memory_orphan_heaps);
+#else
+  (void)sizeof(first_class);
+  heap_t **heap_list = &_memory_orphan_heaps;
+#endif
+  heap->next_orphan = *heap_list;
+  *heap_list = heap;
+}
+
+//! Allocate a new heap from newly mapped memory pages
+static heap_t *_rpmalloc_heap_allocate_new(void) {
+  // Map in pages for 16 heaps. If page size is greater than required size for
+  // this, map a page and use first part for heaps and remaining part for spans
+  // for allocations. 
Adds a lot of complexity, but saves a lot of memory on + // systems where page size > 64 spans (4MiB) + size_t heap_size = sizeof(heap_t); + size_t aligned_heap_size = 16 * ((heap_size + 15) / 16); + size_t request_heap_count = 16; + size_t heap_span_count = ((aligned_heap_size * request_heap_count) + + sizeof(span_t) + _memory_span_size - 1) / + _memory_span_size; + size_t block_size = _memory_span_size * heap_span_count; + size_t span_count = heap_span_count; + span_t *span = 0; + // If there are global reserved spans, use these first + if (_memory_global_reserve_count >= heap_span_count) { + span = _rpmalloc_global_get_reserved_spans(heap_span_count); + } + if (!span) { + if (_memory_page_size > block_size) { + span_count = _memory_page_size / _memory_span_size; + block_size = _memory_page_size; + // If using huge pages, make sure to grab enough heaps to avoid + // reallocating a huge page just to serve new heaps + size_t possible_heap_count = + (block_size - sizeof(span_t)) / aligned_heap_size; + if (possible_heap_count >= (request_heap_count * 16)) + request_heap_count *= 16; + else if (possible_heap_count < request_heap_count) + request_heap_count = possible_heap_count; + heap_span_count = ((aligned_heap_size * request_heap_count) + + sizeof(span_t) + _memory_span_size - 1) / + _memory_span_size; + } + + size_t align_offset = 0; + span = (span_t *)_rpmalloc_mmap(block_size, &align_offset); + if (!span) + return 0; + + // Master span will contain the heaps + _rpmalloc_stat_inc(&_master_spans); + _rpmalloc_span_initialize(span, span_count, heap_span_count, align_offset); + } + + size_t remain_size = _memory_span_size - sizeof(span_t); + heap_t *heap = (heap_t *)pointer_offset(span, sizeof(span_t)); + _rpmalloc_heap_initialize(heap); + + // Put extra heaps as orphans + size_t num_heaps = remain_size / aligned_heap_size; + if (num_heaps < request_heap_count) + num_heaps = request_heap_count; + atomic_store32(&heap->child_count, (int32_t)num_heaps - 1); + heap_t *extra_heap = (heap_t *)pointer_offset(heap, aligned_heap_size); + while (num_heaps > 1) { + _rpmalloc_heap_initialize(extra_heap); + extra_heap->master_heap = heap; + _rpmalloc_heap_orphan(extra_heap, 1); + extra_heap = (heap_t *)pointer_offset(extra_heap, aligned_heap_size); + --num_heaps; + } + + if (span_count > heap_span_count) { + // Cap reserved spans + size_t remain_count = span_count - heap_span_count; + size_t reserve_count = + (remain_count > _memory_heap_reserve_count ? _memory_heap_reserve_count + : remain_count); + span_t *remain_span = + (span_t *)pointer_offset(span, heap_span_count * _memory_span_size); + _rpmalloc_heap_set_reserved_spans(heap, span, remain_span, reserve_count); + + if (remain_count > reserve_count) { + // Set to global reserved spans + remain_span = (span_t *)pointer_offset(remain_span, + reserve_count * _memory_span_size); + reserve_count = remain_count - reserve_count; + _rpmalloc_global_set_reserved_spans(span, remain_span, reserve_count); + } + } + + return heap; +} + +static heap_t *_rpmalloc_heap_extract_orphan(heap_t **heap_list) { + heap_t *heap = *heap_list; + *heap_list = (heap ? heap->next_orphan : 0); + return heap; +} + +//! 
Allocate a new heap, potentially reusing a previously orphaned heap +static heap_t *_rpmalloc_heap_allocate(int first_class) { + heap_t *heap = 0; + while (!atomic_cas32_acquire(&_memory_global_lock, 1, 0)) + _rpmalloc_spin(); + if (first_class == 0) + heap = _rpmalloc_heap_extract_orphan(&_memory_orphan_heaps); +#if RPMALLOC_FIRST_CLASS_HEAPS + if (!heap) + heap = _rpmalloc_heap_extract_orphan(&_memory_first_class_orphan_heaps); +#endif + if (!heap) + heap = _rpmalloc_heap_allocate_new(); + atomic_store32_release(&_memory_global_lock, 0); + if (heap) + _rpmalloc_heap_cache_adopt_deferred(heap, 0); + return heap; +} + +static void _rpmalloc_heap_release(void *heapptr, int first_class, + int release_cache) { + heap_t *heap = (heap_t *)heapptr; + if (!heap) + return; + // Release thread cache spans back to global cache + _rpmalloc_heap_cache_adopt_deferred(heap, 0); + if (release_cache || heap->finalize) { +#if ENABLE_THREAD_CACHE + for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { + span_cache_t *span_cache; + if (!iclass) + span_cache = &heap->span_cache; + else + span_cache = (span_cache_t *)(heap->span_large_cache + (iclass - 1)); + if (!span_cache->count) + continue; +#if ENABLE_GLOBAL_CACHE + if (heap->finalize) { + for (size_t ispan = 0; ispan < span_cache->count; ++ispan) + _rpmalloc_span_unmap(span_cache->span[ispan]); + } else { + _rpmalloc_stat_add64(&heap->thread_to_global, span_cache->count * + (iclass + 1) * + _memory_span_size); + _rpmalloc_stat_add(&heap->span_use[iclass].spans_to_global, + span_cache->count); + _rpmalloc_global_cache_insert_spans(span_cache->span, iclass + 1, + span_cache->count); + } +#else + for (size_t ispan = 0; ispan < span_cache->count; ++ispan) + _rpmalloc_span_unmap(span_cache->span[ispan]); +#endif + span_cache->count = 0; + } +#endif + } + + if (get_thread_heap_raw() == heap) + set_thread_heap(0); + +#if ENABLE_STATISTICS + atomic_decr32(&_memory_active_heaps); + rpmalloc_assert(atomic_load32(&_memory_active_heaps) >= 0, + "Still active heaps during finalization"); +#endif + + // If we are forcibly terminating with _exit the state of the + // lock atomic is unknown and it's best to just go ahead and exit + if (get_thread_id() != _rpmalloc_main_thread_id) { + while (!atomic_cas32_acquire(&_memory_global_lock, 1, 0)) + _rpmalloc_spin(); + } + _rpmalloc_heap_orphan(heap, first_class); + atomic_store32_release(&_memory_global_lock, 0); +} + +static void _rpmalloc_heap_release_raw(void *heapptr, int release_cache) { + _rpmalloc_heap_release(heapptr, 0, release_cache); +} + +static void _rpmalloc_heap_release_raw_fc(void *heapptr) { + _rpmalloc_heap_release_raw(heapptr, 1); +} + +static void _rpmalloc_heap_finalize(heap_t *heap) { + if (heap->spans_reserved) { + span_t *span = _rpmalloc_span_map(heap, heap->spans_reserved); + _rpmalloc_span_unmap(span); + heap->spans_reserved = 0; + } + + _rpmalloc_heap_cache_adopt_deferred(heap, 0); + + for (size_t iclass = 0; iclass < SIZE_CLASS_COUNT; ++iclass) { + if (heap->size_class[iclass].cache) + _rpmalloc_span_unmap(heap->size_class[iclass].cache); + heap->size_class[iclass].cache = 0; + span_t *span = heap->size_class[iclass].partial_span; + while (span) { + span_t *next = span->next; + _rpmalloc_span_finalize(heap, iclass, span, + &heap->size_class[iclass].partial_span); + span = next; + } + // If class still has a free list it must be a full span + if (heap->size_class[iclass].free_list) { + span_t *class_span = + (span_t *)((uintptr_t)heap->size_class[iclass].free_list & + _memory_span_mask); 
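+      // The mask step above is how any block pointer finds its owning span;
+      // illustrative with the default 64KiB span size and alignment: a free
+      // list entry at 0x7f2a00012340 masked with ~0xFFFF yields the span_t
+      // header at 0x7f2a00010000. rpfree relies on the same masking.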
+ span_t **list = 0; +#if RPMALLOC_FIRST_CLASS_HEAPS + list = &heap->full_span[iclass]; +#endif + --heap->full_span_count; + if (!_rpmalloc_span_finalize(heap, iclass, class_span, list)) { + if (list) + _rpmalloc_span_double_link_list_remove(list, class_span); + _rpmalloc_span_double_link_list_add( + &heap->size_class[iclass].partial_span, class_span); + } + } + } + +#if ENABLE_THREAD_CACHE + for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { + span_cache_t *span_cache; + if (!iclass) + span_cache = &heap->span_cache; + else + span_cache = (span_cache_t *)(heap->span_large_cache + (iclass - 1)); + for (size_t ispan = 0; ispan < span_cache->count; ++ispan) + _rpmalloc_span_unmap(span_cache->span[ispan]); + span_cache->count = 0; + } +#endif + rpmalloc_assert(!atomic_load_ptr(&heap->span_free_deferred), + "Heaps still active during finalization"); +} + +//////////// +/// +/// Allocation entry points +/// +////// + +//! Pop first block from a free list +static void *free_list_pop(void **list) { + void *block = *list; + *list = *((void **)block); + return block; +} + +//! Allocate a small/medium sized memory block from the given heap +static void *_rpmalloc_allocate_from_heap_fallback( + heap_t *heap, heap_size_class_t *heap_size_class, uint32_t class_idx) { + span_t *span = heap_size_class->partial_span; + rpmalloc_assume(heap != 0); + if (EXPECTED(span != 0)) { + rpmalloc_assert(span->block_count == + _memory_size_class[span->size_class].block_count, + "Span block count corrupted"); + rpmalloc_assert(!_rpmalloc_span_is_fully_utilized(span), + "Internal failure"); + void *block; + if (span->free_list) { + // Span local free list is not empty, swap to size class free list + block = free_list_pop(&span->free_list); + heap_size_class->free_list = span->free_list; + span->free_list = 0; + } else { + // If the span did not fully initialize free list, link up another page + // worth of blocks + void *block_start = pointer_offset( + span, SPAN_HEADER_SIZE + + ((size_t)span->free_list_limit * span->block_size)); + span->free_list_limit += free_list_partial_init( + &heap_size_class->free_list, &block, + (void *)((uintptr_t)block_start & ~(_memory_page_size - 1)), + block_start, span->block_count - span->free_list_limit, + span->block_size); + } + rpmalloc_assert(span->free_list_limit <= span->block_count, + "Span block count corrupted"); + span->used_count = span->free_list_limit; + + // Swap in deferred free list if present + if (atomic_load_ptr(&span->free_list_deferred)) + _rpmalloc_span_extract_free_list_deferred(span); + + // If span is still not fully utilized keep it in partial list and early + // return block + if (!_rpmalloc_span_is_fully_utilized(span)) + return block; + + // The span is fully utilized, unlink from partial list and add to fully + // utilized list + _rpmalloc_span_double_link_list_pop_head(&heap_size_class->partial_span, + span); +#if RPMALLOC_FIRST_CLASS_HEAPS + _rpmalloc_span_double_link_list_add(&heap->full_span[class_idx], span); +#endif + ++heap->full_span_count; + return block; + } + + // Find a span in one of the cache levels + span = _rpmalloc_heap_extract_new_span(heap, heap_size_class, 1, class_idx); + if (EXPECTED(span != 0)) { + // Mark span as owned by this heap and set base data, return first block + return _rpmalloc_span_initialize_new(heap, heap_size_class, span, + class_idx); + } + + return 0; +} + +//! 
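+// A minimal sketch (illustrative only, compiled out) of the intrusive free
+// list used above: every free block stores the pointer to the next free
+// block in its own first bytes, so free_list_pop is one load and one store.
+#if 0
+static void free_list_pop_example(void) {
+  void *blocks[2] = {&blocks[1], 0}; // block 0 links to block 1, which is last
+  void *list = &blocks[0];
+  void *first = free_list_pop(&list); // first == &blocks[0], list == &blocks[1]
+  (void)first;
+}
+#endif
+//!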
Allocate a small sized memory block from the given heap +static void *_rpmalloc_allocate_small(heap_t *heap, size_t size) { + rpmalloc_assert(heap, "No thread heap"); + // Small sizes have unique size classes + const uint32_t class_idx = + (uint32_t)((size + (SMALL_GRANULARITY - 1)) >> SMALL_GRANULARITY_SHIFT); + heap_size_class_t *heap_size_class = heap->size_class + class_idx; + _rpmalloc_stat_inc_alloc(heap, class_idx); + if (EXPECTED(heap_size_class->free_list != 0)) + return free_list_pop(&heap_size_class->free_list); + return _rpmalloc_allocate_from_heap_fallback(heap, heap_size_class, + class_idx); +} + +//! Allocate a medium sized memory block from the given heap +static void *_rpmalloc_allocate_medium(heap_t *heap, size_t size) { + rpmalloc_assert(heap, "No thread heap"); + // Calculate the size class index and do a dependent lookup of the final class + // index (in case of merged classes) + const uint32_t base_idx = + (uint32_t)(SMALL_CLASS_COUNT + + ((size - (SMALL_SIZE_LIMIT + 1)) >> MEDIUM_GRANULARITY_SHIFT)); + const uint32_t class_idx = _memory_size_class[base_idx].class_idx; + heap_size_class_t *heap_size_class = heap->size_class + class_idx; + _rpmalloc_stat_inc_alloc(heap, class_idx); + if (EXPECTED(heap_size_class->free_list != 0)) + return free_list_pop(&heap_size_class->free_list); + return _rpmalloc_allocate_from_heap_fallback(heap, heap_size_class, + class_idx); +} + +//! Allocate a large sized memory block from the given heap +static void *_rpmalloc_allocate_large(heap_t *heap, size_t size) { + rpmalloc_assert(heap, "No thread heap"); + // Calculate number of needed max sized spans (including header) + // Since this function is never called if size > LARGE_SIZE_LIMIT + // the span_count is guaranteed to be <= LARGE_CLASS_COUNT + size += SPAN_HEADER_SIZE; + size_t span_count = size >> _memory_span_size_shift; + if (size & (_memory_span_size - 1)) + ++span_count; + + // Find a span in one of the cache levels + span_t *span = + _rpmalloc_heap_extract_new_span(heap, 0, span_count, SIZE_CLASS_LARGE); + if (!span) + return span; + + // Mark span as owned by this heap and set base data + rpmalloc_assert(span->span_count >= span_count, "Internal failure"); + span->size_class = SIZE_CLASS_LARGE; + span->heap = heap; + +#if RPMALLOC_FIRST_CLASS_HEAPS + _rpmalloc_span_double_link_list_add(&heap->large_huge_span, span); +#endif + ++heap->full_span_count; + + return pointer_offset(span, SPAN_HEADER_SIZE); +} + +//! Allocate a huge block by mapping memory pages directly +static void *_rpmalloc_allocate_huge(heap_t *heap, size_t size) { + rpmalloc_assert(heap, "No thread heap"); + _rpmalloc_heap_cache_adopt_deferred(heap, 0); + size += SPAN_HEADER_SIZE; + size_t num_pages = size >> _memory_page_size_shift; + if (size & (_memory_page_size - 1)) + ++num_pages; + size_t align_offset = 0; + span_t *span = + (span_t *)_rpmalloc_mmap(num_pages * _memory_page_size, &align_offset); + if (!span) + return span; + + // Store page count in span_count + span->size_class = SIZE_CLASS_HUGE; + span->span_count = (uint32_t)num_pages; + span->align_offset = (uint32_t)align_offset; + span->heap = heap; + _rpmalloc_stat_add_peak(&_huge_pages_current, num_pages, _huge_pages_peak); + +#if RPMALLOC_FIRST_CLASS_HEAPS + _rpmalloc_span_double_link_list_add(&heap->large_huge_span, span); +#endif + ++heap->full_span_count; + + return pointer_offset(span, SPAN_HEADER_SIZE); +} + +//! 
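+// Worked example for the small size class selection above (illustrative,
+// assuming the default SMALL_GRANULARITY of 16): a 24-byte request in
+// _rpmalloc_allocate_small computes class_idx = (24 + 15) >> 4 = 2 and is
+// served from the 32-byte class, falling back to
+// _rpmalloc_allocate_from_heap_fallback when that free list is empty.
+//!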
Allocate a block of the given size
+static void *_rpmalloc_allocate(heap_t *heap, size_t size) {
+  _rpmalloc_stat_add64(&_allocation_counter, 1);
+  if (EXPECTED(size <= SMALL_SIZE_LIMIT))
+    return _rpmalloc_allocate_small(heap, size);
+  else if (size <= _memory_medium_size_limit)
+    return _rpmalloc_allocate_medium(heap, size);
+  else if (size <= LARGE_SIZE_LIMIT)
+    return _rpmalloc_allocate_large(heap, size);
+  return _rpmalloc_allocate_huge(heap, size);
+}
+
+static void *_rpmalloc_aligned_allocate(heap_t *heap, size_t alignment,
+                                        size_t size) {
+  if (alignment <= SMALL_GRANULARITY)
+    return _rpmalloc_allocate(heap, size);
+
+#if ENABLE_VALIDATE_ARGS
+  if ((size + alignment) < size) {
+    errno = EINVAL;
+    return 0;
+  }
+  if (alignment & (alignment - 1)) {
+    errno = EINVAL;
+    return 0;
+  }
+#endif
+
+  if ((alignment <= SPAN_HEADER_SIZE) &&
+      ((size + SPAN_HEADER_SIZE) < _memory_medium_size_limit)) {
+    // If alignment is less than or equal to span header size (which is a
+    // power of two), and size aligned up to a span header size multiple is
+    // less than size + alignment, then use the natural alignment of blocks
+    // to provide the requested alignment
+    size_t multiple_size = size ? (size + (SPAN_HEADER_SIZE - 1)) &
+                                      ~(uintptr_t)(SPAN_HEADER_SIZE - 1)
+                                : SPAN_HEADER_SIZE;
+    rpmalloc_assert(!(multiple_size % SPAN_HEADER_SIZE),
+                    "Failed alignment calculation");
+    if (multiple_size <= (size + alignment))
+      return _rpmalloc_allocate(heap, multiple_size);
+  }
+
+  void *ptr = 0;
+  size_t align_mask = alignment - 1;
+  if (alignment <= _memory_page_size) {
+    ptr = _rpmalloc_allocate(heap, size + alignment);
+    if ((uintptr_t)ptr & align_mask) {
+      ptr = (void *)(((uintptr_t)ptr & ~(uintptr_t)align_mask) + alignment);
+      // Mark as having aligned blocks
+      span_t *span = (span_t *)((uintptr_t)ptr & _memory_span_mask);
+      span->flags |= SPAN_FLAG_ALIGNED_BLOCKS;
+    }
+    return ptr;
+  }
+
+  // Fallback to mapping new pages for this request. Since pointers passed
+  // to rpfree must be able to reach the start of the span by bitmasking of
+  // the address with the span size, the returned aligned pointer from this
+  // function must be within a span size of the start of the mapped area.
+  // In the worst case this requires us to loop and map pages until we get a
+  // suitable memory address.
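+  // (Illustrative: with 4KiB pages, a 16KiB-aligned request of 10000 bytes
+  // starts at num_pages = 4; every placement that fails the checks below
+  // unmaps the block and retries with one more page, giving up once
+  // num_pages exceeds limit_pages.)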
It also means we can never align to span size + // or greater, since the span header will push alignment more than one + // span size away from span start (thus causing pointer mask to give us + // an invalid span start on free) + if (alignment & align_mask) { + errno = EINVAL; + return 0; + } + if (alignment >= _memory_span_size) { + errno = EINVAL; + return 0; + } + + size_t extra_pages = alignment / _memory_page_size; + + // Since each span has a header, we will at least need one extra memory page + size_t num_pages = 1 + (size / _memory_page_size); + if (size & (_memory_page_size - 1)) + ++num_pages; + + if (extra_pages > num_pages) + num_pages = 1 + extra_pages; + + size_t original_pages = num_pages; + size_t limit_pages = (_memory_span_size / _memory_page_size) * 2; + if (limit_pages < (original_pages * 2)) + limit_pages = original_pages * 2; + + size_t mapped_size, align_offset; + span_t *span; + +retry: + align_offset = 0; + mapped_size = num_pages * _memory_page_size; + + span = (span_t *)_rpmalloc_mmap(mapped_size, &align_offset); + if (!span) { + errno = ENOMEM; + return 0; + } + ptr = pointer_offset(span, SPAN_HEADER_SIZE); + + if ((uintptr_t)ptr & align_mask) + ptr = (void *)(((uintptr_t)ptr & ~(uintptr_t)align_mask) + alignment); + + if (((size_t)pointer_diff(ptr, span) >= _memory_span_size) || + (pointer_offset(ptr, size) > pointer_offset(span, mapped_size)) || + (((uintptr_t)ptr & _memory_span_mask) != (uintptr_t)span)) { + _rpmalloc_unmap(span, mapped_size, align_offset, mapped_size); + ++num_pages; + if (num_pages > limit_pages) { + errno = EINVAL; + return 0; + } + goto retry; + } + + // Store page count in span_count + span->size_class = SIZE_CLASS_HUGE; + span->span_count = (uint32_t)num_pages; + span->align_offset = (uint32_t)align_offset; + span->heap = heap; + _rpmalloc_stat_add_peak(&_huge_pages_current, num_pages, _huge_pages_peak); + +#if RPMALLOC_FIRST_CLASS_HEAPS + _rpmalloc_span_double_link_list_add(&heap->large_huge_span, span); +#endif + ++heap->full_span_count; + + _rpmalloc_stat_add64(&_allocation_counter, 1); + + return ptr; +} + +//////////// +/// +/// Deallocation entry points +/// +////// + +//! Deallocate the given small/medium memory block in the current thread local +//! 
heap +static void _rpmalloc_deallocate_direct_small_or_medium(span_t *span, + void *block) { + heap_t *heap = span->heap; + rpmalloc_assert(heap->owner_thread == get_thread_id() || + !heap->owner_thread || heap->finalize, + "Internal failure"); + // Add block to free list + if (UNEXPECTED(_rpmalloc_span_is_fully_utilized(span))) { + span->used_count = span->block_count; +#if RPMALLOC_FIRST_CLASS_HEAPS + _rpmalloc_span_double_link_list_remove(&heap->full_span[span->size_class], + span); +#endif + _rpmalloc_span_double_link_list_add( + &heap->size_class[span->size_class].partial_span, span); + --heap->full_span_count; + } + *((void **)block) = span->free_list; + --span->used_count; + span->free_list = block; + if (UNEXPECTED(span->used_count == span->list_size)) { + // If there are no used blocks it is guaranteed that no other external + // thread is accessing the span + if (span->used_count) { + // Make sure we have synchronized the deferred list and list size by using + // acquire semantics and guarantee that no external thread is accessing + // span concurrently + void *free_list; + do { + free_list = atomic_exchange_ptr_acquire(&span->free_list_deferred, + INVALID_POINTER); + } while (free_list == INVALID_POINTER); + atomic_store_ptr_release(&span->free_list_deferred, free_list); + } + _rpmalloc_span_double_link_list_remove( + &heap->size_class[span->size_class].partial_span, span); + _rpmalloc_span_release_to_cache(heap, span); + } +} + +static void _rpmalloc_deallocate_defer_free_span(heap_t *heap, span_t *span) { + if (span->size_class != SIZE_CLASS_HUGE) + _rpmalloc_stat_inc(&heap->span_use[span->span_count - 1].spans_deferred); + // This list does not need ABA protection, no mutable side state + do { + span->free_list = (void *)atomic_load_ptr(&heap->span_free_deferred); + } while (!atomic_cas_ptr(&heap->span_free_deferred, span, span->free_list)); +} + +//! Put the block in the deferred free list of the owning span +static void _rpmalloc_deallocate_defer_small_or_medium(span_t *span, + void *block) { + // The memory ordering here is a bit tricky, to avoid having to ABA protect + // the deferred free list to avoid desynchronization of list and list size + // we need to have acquire semantics on successful CAS of the pointer to + // guarantee the list_size variable validity + release semantics on pointer + // store + void *free_list; + do { + free_list = + atomic_exchange_ptr_acquire(&span->free_list_deferred, INVALID_POINTER); + } while (free_list == INVALID_POINTER); + *((void **)block) = free_list; + uint32_t free_count = ++span->list_size; + int all_deferred_free = (free_count == span->block_count); + atomic_store_ptr_release(&span->free_list_deferred, block); + if (all_deferred_free) { + // Span was completely freed by this block. Due to the INVALID_POINTER spin + // lock no other thread can reach this state simultaneously on this span. 
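+    // (Illustrative: for a span with block_count == 512, only the thread
+    // whose deferred free raises list_size to 512 sees all_deferred_free
+    // set, so exactly one thread hands the span back to its owner heap.)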
+ // Safe to move to owner heap deferred cache + _rpmalloc_deallocate_defer_free_span(span->heap, span); + } +} + +static void _rpmalloc_deallocate_small_or_medium(span_t *span, void *p) { + _rpmalloc_stat_inc_free(span->heap, span->size_class); + if (span->flags & SPAN_FLAG_ALIGNED_BLOCKS) { + // Realign pointer to block start + void *blocks_start = pointer_offset(span, SPAN_HEADER_SIZE); + uint32_t block_offset = (uint32_t)pointer_diff(p, blocks_start); + p = pointer_offset(p, -(int32_t)(block_offset % span->block_size)); + } + // Check if block belongs to this heap or if deallocation should be deferred +#if RPMALLOC_FIRST_CLASS_HEAPS + int defer = + (span->heap->owner_thread && + (span->heap->owner_thread != get_thread_id()) && !span->heap->finalize); +#else + int defer = + ((span->heap->owner_thread != get_thread_id()) && !span->heap->finalize); +#endif + if (!defer) + _rpmalloc_deallocate_direct_small_or_medium(span, p); + else + _rpmalloc_deallocate_defer_small_or_medium(span, p); +} + +//! Deallocate the given large memory block to the current heap +static void _rpmalloc_deallocate_large(span_t *span) { + rpmalloc_assert(span->size_class == SIZE_CLASS_LARGE, "Bad span size class"); + rpmalloc_assert(!(span->flags & SPAN_FLAG_MASTER) || + !(span->flags & SPAN_FLAG_SUBSPAN), + "Span flag corrupted"); + rpmalloc_assert((span->flags & SPAN_FLAG_MASTER) || + (span->flags & SPAN_FLAG_SUBSPAN), + "Span flag corrupted"); + // We must always defer (unless finalizing) if from another heap since we + // cannot touch the list or counters of another heap +#if RPMALLOC_FIRST_CLASS_HEAPS + int defer = + (span->heap->owner_thread && + (span->heap->owner_thread != get_thread_id()) && !span->heap->finalize); +#else + int defer = + ((span->heap->owner_thread != get_thread_id()) && !span->heap->finalize); +#endif + if (defer) { + _rpmalloc_deallocate_defer_free_span(span->heap, span); + return; + } + rpmalloc_assert(span->heap->full_span_count, "Heap span counter corrupted"); + --span->heap->full_span_count; +#if RPMALLOC_FIRST_CLASS_HEAPS + _rpmalloc_span_double_link_list_remove(&span->heap->large_huge_span, span); +#endif +#if ENABLE_ADAPTIVE_THREAD_CACHE || ENABLE_STATISTICS + // Decrease counter + size_t idx = span->span_count - 1; + atomic_decr32(&span->heap->span_use[idx].current); +#endif + heap_t *heap = span->heap; + rpmalloc_assert(heap, "No thread heap"); +#if ENABLE_THREAD_CACHE + const int set_as_reserved = + ((span->span_count > 1) && (heap->span_cache.count == 0) && + !heap->finalize && !heap->spans_reserved); +#else + const int set_as_reserved = + ((span->span_count > 1) && !heap->finalize && !heap->spans_reserved); +#endif + if (set_as_reserved) { + heap->span_reserve = span; + heap->spans_reserved = span->span_count; + if (span->flags & SPAN_FLAG_MASTER) { + heap->span_reserve_master = span; + } else { // SPAN_FLAG_SUBSPAN + span_t *master = (span_t *)pointer_offset( + span, + -(intptr_t)((size_t)span->offset_from_master * _memory_span_size)); + heap->span_reserve_master = master; + rpmalloc_assert(master->flags & SPAN_FLAG_MASTER, "Span flag corrupted"); + rpmalloc_assert(atomic_load32(&master->remaining_spans) >= + (int32_t)span->span_count, + "Master span count corrupted"); + } + _rpmalloc_stat_inc(&heap->span_use[idx].spans_to_reserved); + } else { + // Insert into cache list + _rpmalloc_heap_cache_insert(heap, span); + } +} + +//! 
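+// Worked example for the subspan-to-master step above (illustrative,
+// assuming the default 64KiB span size): a subspan with
+// offset_from_master == 3 starts 3 * 65536 bytes after its master, so
+// pointer_offset(span, -196608) lands exactly on the master span header.
+//!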
Deallocate the given huge span
+static void _rpmalloc_deallocate_huge(span_t *span) {
+  rpmalloc_assert(span->heap, "No span heap");
+#if RPMALLOC_FIRST_CLASS_HEAPS
+  int defer =
+      (span->heap->owner_thread &&
+       (span->heap->owner_thread != get_thread_id()) && !span->heap->finalize);
+#else
+  int defer =
+      ((span->heap->owner_thread != get_thread_id()) && !span->heap->finalize);
+#endif
+  if (defer) {
+    _rpmalloc_deallocate_defer_free_span(span->heap, span);
+    return;
+  }
+  rpmalloc_assert(span->heap->full_span_count, "Heap span counter corrupted");
+  --span->heap->full_span_count;
+#if RPMALLOC_FIRST_CLASS_HEAPS
+  _rpmalloc_span_double_link_list_remove(&span->heap->large_huge_span, span);
+#endif
+
+  // Oversized allocation, page count is stored in span_count
+  size_t num_pages = span->span_count;
+  _rpmalloc_unmap(span, num_pages * _memory_page_size, span->align_offset,
+                  num_pages * _memory_page_size);
+  _rpmalloc_stat_sub(&_huge_pages_current, num_pages);
+}
+
+//! Deallocate the given block
+static void _rpmalloc_deallocate(void *p) {
+  _rpmalloc_stat_add64(&_deallocation_counter, 1);
+  // Grab the span (always at start of span, using span alignment)
+  span_t *span = (span_t *)((uintptr_t)p & _memory_span_mask);
+  if (UNEXPECTED(!span))
+    return;
+  if (EXPECTED(span->size_class < SIZE_CLASS_COUNT))
+    _rpmalloc_deallocate_small_or_medium(span, p);
+  else if (span->size_class == SIZE_CLASS_LARGE)
+    _rpmalloc_deallocate_large(span);
+  else
+    _rpmalloc_deallocate_huge(span);
+}
+
+////////////
+///
+/// Reallocation entry points
+///
+//////
+
+static size_t _rpmalloc_usable_size(void *p);
+
+//! Reallocate the given block to the given size
+static void *_rpmalloc_reallocate(heap_t *heap, void *p, size_t size,
+                                  size_t oldsize, unsigned int flags) {
+  if (p) {
+    // Grab the span using guaranteed span alignment
+    span_t *span = (span_t *)((uintptr_t)p & _memory_span_mask);
+    if (EXPECTED(span->size_class < SIZE_CLASS_COUNT)) {
+      // Small/medium sized block
+      rpmalloc_assert(span->span_count == 1, "Span counter corrupted");
+      void *blocks_start = pointer_offset(span, SPAN_HEADER_SIZE);
+      uint32_t block_offset = (uint32_t)pointer_diff(p, blocks_start);
+      uint32_t block_idx = block_offset / span->block_size;
+      void *block =
+          pointer_offset(blocks_start, (size_t)block_idx * span->block_size);
+      if (!oldsize)
+        oldsize =
+            (size_t)((ptrdiff_t)span->block_size - pointer_diff(p, block));
+      if ((size_t)span->block_size >= size) {
+        // Still fits in block, never mind trying to save memory, but preserve
+        // data if alignment changed
+        if ((p != block) && !(flags & RPMALLOC_NO_PRESERVE))
+          memmove(block, p, oldsize);
+        return block;
+      }
+    } else if (span->size_class == SIZE_CLASS_LARGE) {
+      // Large block
+      size_t total_size = size + SPAN_HEADER_SIZE;
+      size_t num_spans = total_size >> _memory_span_size_shift;
+      if (total_size & (_memory_span_size - 1))
+        ++num_spans;
+      size_t current_spans = span->span_count;
+      void *block = pointer_offset(span, SPAN_HEADER_SIZE);
+      if (!oldsize)
+        oldsize = (current_spans * _memory_span_size) -
+                  (size_t)pointer_diff(p, block) - SPAN_HEADER_SIZE;
+      if ((current_spans >= num_spans) && (total_size >= (oldsize / 2))) {
+        // Still fits in block, never mind trying to save memory, but preserve
+        // data if alignment changed
+        if ((p != block) && !(flags & RPMALLOC_NO_PRESERVE))
+          memmove(block, p, oldsize);
+        return block;
+      }
+    } else {
+      // Oversized block
+      size_t total_size = size + SPAN_HEADER_SIZE;
+      size_t num_pages = total_size >> _memory_page_size_shift;
+      if (total_size & (_memory_page_size - 1))
+        ++num_pages;
+      // Page count is stored in span_count
+      size_t current_pages = span->span_count;
+      void *block = pointer_offset(span, SPAN_HEADER_SIZE);
+      if (!oldsize)
+        oldsize = (current_pages * _memory_page_size) -
+                  (size_t)pointer_diff(p, block) - SPAN_HEADER_SIZE;
+      if ((current_pages >= num_pages) && (num_pages >= (current_pages / 2))) {
+        // Still fits in block, never mind trying to save memory, but preserve
+        // data if alignment changed
+        if ((p != block) && !(flags & RPMALLOC_NO_PRESERVE))
+          memmove(block, p, oldsize);
+        return block;
+      }
+    }
+  } else {
+    oldsize = 0;
+  }
+
+  if (!!(flags & RPMALLOC_GROW_OR_FAIL))
+    return 0;
+
+  // Size is greater than block size, need to allocate a new block and
+  // deallocate the old. Avoid hysteresis by overallocating if the increase
+  // is small (below 37.5%)
+  size_t lower_bound = oldsize + (oldsize >> 2) + (oldsize >> 3);
+  size_t new_size =
+      (size > lower_bound) ? size : ((size > oldsize) ? lower_bound : size);
+  void *block = _rpmalloc_allocate(heap, new_size);
+  if (p && block) {
+    if (!(flags & RPMALLOC_NO_PRESERVE))
+      memcpy(block, p, oldsize < new_size ? oldsize : new_size);
+    _rpmalloc_deallocate(p);
+  }
+
+  return block;
+}
+
+static void *_rpmalloc_aligned_reallocate(heap_t *heap, void *ptr,
+                                          size_t alignment, size_t size,
+                                          size_t oldsize, unsigned int flags) {
+  if (alignment <= SMALL_GRANULARITY)
+    return _rpmalloc_reallocate(heap, ptr, size, oldsize, flags);
+
+  int no_alloc = !!(flags & RPMALLOC_GROW_OR_FAIL);
+  size_t usablesize = (ptr ? _rpmalloc_usable_size(ptr) : 0);
+  if ((usablesize >= size) && !((uintptr_t)ptr & (alignment - 1))) {
+    if (no_alloc || (size >= (usablesize / 2)))
+      return ptr;
+  }
+  // Aligned alloc marks span as having aligned blocks
+  void *block =
+      (!no_alloc ? _rpmalloc_aligned_allocate(heap, alignment, size) : 0);
+  if (EXPECTED(block != 0)) {
+    if (!(flags & RPMALLOC_NO_PRESERVE) && ptr) {
+      if (!oldsize)
+        oldsize = usablesize;
+      memcpy(block, ptr, oldsize < size ? oldsize : size);
+    }
+    _rpmalloc_deallocate(ptr);
+  }
+  return block;
+}
+
+////////////
+///
+/// Initialization, finalization and utility
+///
+//////
+
+//! Get the usable size of the given block
+static size_t _rpmalloc_usable_size(void *p) {
+  // Grab the span using guaranteed span alignment
+  span_t *span = (span_t *)((uintptr_t)p & _memory_span_mask);
+  if (span->size_class < SIZE_CLASS_COUNT) {
+    // Small/medium block
+    void *blocks_start = pointer_offset(span, SPAN_HEADER_SIZE);
+    return span->block_size -
+           ((size_t)pointer_diff(p, blocks_start) % span->block_size);
+  }
+  if (span->size_class == SIZE_CLASS_LARGE) {
+    // Large block
+    size_t current_spans = span->span_count;
+    return (current_spans * _memory_span_size) - (size_t)pointer_diff(p, span);
+  }
+  // Oversized block, page count is stored in span_count
+  size_t current_pages = span->span_count;
+  return (current_pages * _memory_page_size) - (size_t)pointer_diff(p, span);
+}
+
+//!
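+// Worked example of the reallocation growth policy above (illustrative):
+// growing a 1000-byte block to 1100 bytes computes
+// lower_bound = 1000 + 250 + 125 = 1375 and allocates 1375 bytes, so the
+// next small growth can usually be absorbed without another copy.
+//!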
Adjust and optimize the size class properties for the given class +static void _rpmalloc_adjust_size_class(size_t iclass) { + size_t block_size = _memory_size_class[iclass].block_size; + size_t block_count = (_memory_span_size - SPAN_HEADER_SIZE) / block_size; + + _memory_size_class[iclass].block_count = (uint16_t)block_count; + _memory_size_class[iclass].class_idx = (uint16_t)iclass; + + // Check if previous size classes can be merged + if (iclass >= SMALL_CLASS_COUNT) { + size_t prevclass = iclass; + while (prevclass > 0) { + --prevclass; + // A class can be merged if number of pages and number of blocks are equal + if (_memory_size_class[prevclass].block_count == + _memory_size_class[iclass].block_count) + _rpmalloc_memcpy_const(_memory_size_class + prevclass, + _memory_size_class + iclass, + sizeof(_memory_size_class[iclass])); + else + break; + } + } +} + +//! Initialize the allocator and setup global data +extern inline int rpmalloc_initialize(void) { + if (_rpmalloc_initialized) { + rpmalloc_thread_initialize(); + return 0; + } + return rpmalloc_initialize_config(0); +} + +int rpmalloc_initialize_config(const rpmalloc_config_t *config) { + if (_rpmalloc_initialized) { + rpmalloc_thread_initialize(); + return 0; + } + _rpmalloc_initialized = 1; + + if (config) + memcpy(&_memory_config, config, sizeof(rpmalloc_config_t)); + else + _rpmalloc_memset_const(&_memory_config, 0, sizeof(rpmalloc_config_t)); + + if (!_memory_config.memory_map || !_memory_config.memory_unmap) { + _memory_config.memory_map = _rpmalloc_mmap_os; + _memory_config.memory_unmap = _rpmalloc_unmap_os; + } + +#if PLATFORM_WINDOWS + SYSTEM_INFO system_info; + memset(&system_info, 0, sizeof(system_info)); + GetSystemInfo(&system_info); + _memory_map_granularity = system_info.dwAllocationGranularity; +#else + _memory_map_granularity = (size_t)sysconf(_SC_PAGESIZE); +#endif + +#if RPMALLOC_CONFIGURABLE + _memory_page_size = _memory_config.page_size; +#else + _memory_page_size = 0; +#endif + _memory_huge_pages = 0; + if (!_memory_page_size) { +#if PLATFORM_WINDOWS + _memory_page_size = system_info.dwPageSize; +#else + _memory_page_size = _memory_map_granularity; + if (_memory_config.enable_huge_pages) { +#if defined(__linux__) + size_t huge_page_size = 0; + FILE *meminfo = fopen("/proc/meminfo", "r"); + if (meminfo) { + char line[128]; + while (!huge_page_size && fgets(line, sizeof(line) - 1, meminfo)) { + line[sizeof(line) - 1] = 0; + if (strstr(line, "Hugepagesize:")) + huge_page_size = (size_t)strtol(line + 13, 0, 10) * 1024; + } + fclose(meminfo); + } + if (huge_page_size) { + _memory_huge_pages = 1; + _memory_page_size = huge_page_size; + _memory_map_granularity = huge_page_size; + } +#elif defined(__FreeBSD__) + int rc; + size_t sz = sizeof(rc); + + if (sysctlbyname("vm.pmap.pg_ps_enabled", &rc, &sz, NULL, 0) == 0 && + rc == 1) { + static size_t defsize = 2 * 1024 * 1024; + int nsize = 0; + size_t sizes[4] = {0}; + _memory_huge_pages = 1; + _memory_page_size = defsize; + if ((nsize = getpagesizes(sizes, 4)) >= 2) { + nsize--; + for (size_t csize = sizes[nsize]; nsize >= 0 && csize; + --nsize, csize = sizes[nsize]) { + //! Unlikely, but as a precaution.. 
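+          // (Any valid superpage size is a power of two and a whole number
+          // of KiB; e.g. 2MiB == 1 << 21 passes both checks below.)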
+ rpmalloc_assert(!(csize & (csize - 1)) && !(csize % 1024), + "Invalid page size"); + if (defsize < csize) { + _memory_page_size = csize; + break; + } + } + } + _memory_map_granularity = _memory_page_size; + } +#elif defined(__APPLE__) || defined(__NetBSD__) + _memory_huge_pages = 1; + _memory_page_size = 2 * 1024 * 1024; + _memory_map_granularity = _memory_page_size; +#endif + } +#endif + } else { + if (_memory_config.enable_huge_pages) + _memory_huge_pages = 1; + } + +#if PLATFORM_WINDOWS + if (_memory_config.enable_huge_pages) { + HANDLE token = 0; + size_t large_page_minimum = GetLargePageMinimum(); + if (large_page_minimum) + OpenProcessToken(GetCurrentProcess(), + TOKEN_ADJUST_PRIVILEGES | TOKEN_QUERY, &token); + if (token) { + LUID luid; + if (LookupPrivilegeValue(0, SE_LOCK_MEMORY_NAME, &luid)) { + TOKEN_PRIVILEGES token_privileges; + memset(&token_privileges, 0, sizeof(token_privileges)); + token_privileges.PrivilegeCount = 1; + token_privileges.Privileges[0].Luid = luid; + token_privileges.Privileges[0].Attributes = SE_PRIVILEGE_ENABLED; + if (AdjustTokenPrivileges(token, FALSE, &token_privileges, 0, 0, 0)) { + if (GetLastError() == ERROR_SUCCESS) + _memory_huge_pages = 1; + } + } + CloseHandle(token); + } + if (_memory_huge_pages) { + if (large_page_minimum > _memory_page_size) + _memory_page_size = large_page_minimum; + if (large_page_minimum > _memory_map_granularity) + _memory_map_granularity = large_page_minimum; + } + } +#endif + + size_t min_span_size = 256; + size_t max_page_size; +#if UINTPTR_MAX > 0xFFFFFFFF + max_page_size = 4096ULL * 1024ULL * 1024ULL; +#else + max_page_size = 4 * 1024 * 1024; +#endif + if (_memory_page_size < min_span_size) + _memory_page_size = min_span_size; + if (_memory_page_size > max_page_size) + _memory_page_size = max_page_size; + _memory_page_size_shift = 0; + size_t page_size_bit = _memory_page_size; + while (page_size_bit != 1) { + ++_memory_page_size_shift; + page_size_bit >>= 1; + } + _memory_page_size = ((size_t)1 << _memory_page_size_shift); + +#if RPMALLOC_CONFIGURABLE + if (!_memory_config.span_size) { + _memory_span_size = _memory_default_span_size; + _memory_span_size_shift = _memory_default_span_size_shift; + _memory_span_mask = _memory_default_span_mask; + } else { + size_t span_size = _memory_config.span_size; + if (span_size > (256 * 1024)) + span_size = (256 * 1024); + _memory_span_size = 4096; + _memory_span_size_shift = 12; + while (_memory_span_size < span_size) { + _memory_span_size <<= 1; + ++_memory_span_size_shift; + } + _memory_span_mask = ~(uintptr_t)(_memory_span_size - 1); + } +#endif + + _memory_span_map_count = + (_memory_config.span_map_count ? _memory_config.span_map_count + : DEFAULT_SPAN_MAP_COUNT); + if ((_memory_span_size * _memory_span_map_count) < _memory_page_size) + _memory_span_map_count = (_memory_page_size / _memory_span_size); + if ((_memory_page_size >= _memory_span_size) && + ((_memory_span_map_count * _memory_span_size) % _memory_page_size)) + _memory_span_map_count = (_memory_page_size / _memory_span_size); + _memory_heap_reserve_count = (_memory_span_map_count > DEFAULT_SPAN_MAP_COUNT) + ? 
DEFAULT_SPAN_MAP_COUNT + : _memory_span_map_count; + + _memory_config.page_size = _memory_page_size; + _memory_config.span_size = _memory_span_size; + _memory_config.span_map_count = _memory_span_map_count; + _memory_config.enable_huge_pages = _memory_huge_pages; + +#if ((defined(__APPLE__) || defined(__HAIKU__)) && ENABLE_PRELOAD) || \ + defined(__TINYC__) + if (pthread_key_create(&_memory_thread_heap, _rpmalloc_heap_release_raw_fc)) + return -1; +#endif +#if defined(_WIN32) && (!defined(BUILD_DYNAMIC_LINK) || !BUILD_DYNAMIC_LINK) + fls_key = FlsAlloc(&_rpmalloc_thread_destructor); +#endif + + // Setup all small and medium size classes + size_t iclass = 0; + _memory_size_class[iclass].block_size = SMALL_GRANULARITY; + _rpmalloc_adjust_size_class(iclass); + for (iclass = 1; iclass < SMALL_CLASS_COUNT; ++iclass) { + size_t size = iclass * SMALL_GRANULARITY; + _memory_size_class[iclass].block_size = (uint32_t)size; + _rpmalloc_adjust_size_class(iclass); + } + // At least two blocks per span, then fall back to large allocations + _memory_medium_size_limit = (_memory_span_size - SPAN_HEADER_SIZE) >> 1; + if (_memory_medium_size_limit > MEDIUM_SIZE_LIMIT) + _memory_medium_size_limit = MEDIUM_SIZE_LIMIT; + for (iclass = 0; iclass < MEDIUM_CLASS_COUNT; ++iclass) { + size_t size = SMALL_SIZE_LIMIT + ((iclass + 1) * MEDIUM_GRANULARITY); + if (size > _memory_medium_size_limit) { + _memory_medium_size_limit = + SMALL_SIZE_LIMIT + (iclass * MEDIUM_GRANULARITY); + break; + } + _memory_size_class[SMALL_CLASS_COUNT + iclass].block_size = (uint32_t)size; + _rpmalloc_adjust_size_class(SMALL_CLASS_COUNT + iclass); + } + + _memory_orphan_heaps = 0; +#if RPMALLOC_FIRST_CLASS_HEAPS + _memory_first_class_orphan_heaps = 0; +#endif +#if ENABLE_STATISTICS + atomic_store32(&_memory_active_heaps, 0); + atomic_store32(&_mapped_pages, 0); + _mapped_pages_peak = 0; + atomic_store32(&_master_spans, 0); + atomic_store32(&_mapped_total, 0); + atomic_store32(&_unmapped_total, 0); + atomic_store32(&_mapped_pages_os, 0); + atomic_store32(&_huge_pages_current, 0); + _huge_pages_peak = 0; +#endif + memset(_memory_heaps, 0, sizeof(_memory_heaps)); + atomic_store32_release(&_memory_global_lock, 0); + + rpmalloc_linker_reference(); + + // Initialize this thread + rpmalloc_thread_initialize(); + return 0; +} + +//! 
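+// Worked example of the size class merging done via
+// _rpmalloc_adjust_size_class above (illustrative, assuming 64KiB spans):
+// all medium classes that fit exactly 3 blocks per span are merged into
+// the largest such class; the block count per span is unchanged and the
+// dependent class_idx lookup in _rpmalloc_allocate_medium redirects the
+// smaller classes to it.
+//!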
Finalize the allocator +void rpmalloc_finalize(void) { + rpmalloc_thread_finalize(1); + // rpmalloc_dump_statistics(stdout); + + if (_memory_global_reserve) { + atomic_add32(&_memory_global_reserve_master->remaining_spans, + -(int32_t)_memory_global_reserve_count); + _memory_global_reserve_master = 0; + _memory_global_reserve_count = 0; + _memory_global_reserve = 0; + } + atomic_store32_release(&_memory_global_lock, 0); + + // Free all thread caches and fully free spans + for (size_t list_idx = 0; list_idx < HEAP_ARRAY_SIZE; ++list_idx) { + heap_t *heap = _memory_heaps[list_idx]; + while (heap) { + heap_t *next_heap = heap->next_heap; + heap->finalize = 1; + _rpmalloc_heap_global_finalize(heap); + heap = next_heap; + } + } + +#if ENABLE_GLOBAL_CACHE + // Free global caches + for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) + _rpmalloc_global_cache_finalize(&_memory_span_cache[iclass]); +#endif + +#if (defined(__APPLE__) || defined(__HAIKU__)) && ENABLE_PRELOAD + pthread_key_delete(_memory_thread_heap); +#endif +#if defined(_WIN32) && (!defined(BUILD_DYNAMIC_LINK) || !BUILD_DYNAMIC_LINK) + FlsFree(fls_key); + fls_key = 0; +#endif +#if ENABLE_STATISTICS + // If you hit these asserts you probably have memory leaks (perhaps global + // scope data doing dynamic allocations) or double frees in your code + rpmalloc_assert(atomic_load32(&_mapped_pages) == 0, "Memory leak detected"); + rpmalloc_assert(atomic_load32(&_mapped_pages_os) == 0, + "Memory leak detected"); +#endif + + _rpmalloc_initialized = 0; +} + +//! Initialize thread, assign heap +extern inline void rpmalloc_thread_initialize(void) { + if (!get_thread_heap_raw()) { + heap_t *heap = _rpmalloc_heap_allocate(0); + if (heap) { + _rpmalloc_stat_inc(&_memory_active_heaps); + set_thread_heap(heap); +#if defined(_WIN32) && (!defined(BUILD_DYNAMIC_LINK) || !BUILD_DYNAMIC_LINK) + FlsSetValue(fls_key, heap); +#endif + } + } +} + +//! Finalize thread, orphan heap +void rpmalloc_thread_finalize(int release_caches) { + heap_t *heap = get_thread_heap_raw(); + if (heap) + _rpmalloc_heap_release_raw(heap, release_caches); + set_thread_heap(0); +#if defined(_WIN32) && (!defined(BUILD_DYNAMIC_LINK) || !BUILD_DYNAMIC_LINK) + FlsSetValue(fls_key, 0); +#endif +} + +int rpmalloc_is_thread_initialized(void) { + return (get_thread_heap_raw() != 0) ? 
1 : 0; +} + +const rpmalloc_config_t *rpmalloc_config(void) { return &_memory_config; } + +// Extern interface + +extern inline RPMALLOC_ALLOCATOR void *rpmalloc(size_t size) { +#if ENABLE_VALIDATE_ARGS + if (size >= MAX_ALLOC_SIZE) { + errno = EINVAL; + return 0; + } +#endif + heap_t *heap = get_thread_heap(); + return _rpmalloc_allocate(heap, size); +} + +extern inline void rpfree(void *ptr) { _rpmalloc_deallocate(ptr); } + +extern inline RPMALLOC_ALLOCATOR void *rpcalloc(size_t num, size_t size) { + size_t total; +#if ENABLE_VALIDATE_ARGS +#if PLATFORM_WINDOWS + int err = SizeTMult(num, size, &total); + if ((err != S_OK) || (total >= MAX_ALLOC_SIZE)) { + errno = EINVAL; + return 0; + } +#else + int err = __builtin_umull_overflow(num, size, &total); + if (err || (total >= MAX_ALLOC_SIZE)) { + errno = EINVAL; + return 0; + } +#endif +#else + total = num * size; +#endif + heap_t *heap = get_thread_heap(); + void *block = _rpmalloc_allocate(heap, total); + if (block) + memset(block, 0, total); + return block; +} + +extern inline RPMALLOC_ALLOCATOR void *rprealloc(void *ptr, size_t size) { +#if ENABLE_VALIDATE_ARGS + if (size >= MAX_ALLOC_SIZE) { + errno = EINVAL; + return ptr; + } +#endif + heap_t *heap = get_thread_heap(); + return _rpmalloc_reallocate(heap, ptr, size, 0, 0); +} + +extern RPMALLOC_ALLOCATOR void *rpaligned_realloc(void *ptr, size_t alignment, + size_t size, size_t oldsize, + unsigned int flags) { +#if ENABLE_VALIDATE_ARGS + if ((size + alignment < size) || (alignment > _memory_page_size)) { + errno = EINVAL; + return 0; + } +#endif + heap_t *heap = get_thread_heap(); + return _rpmalloc_aligned_reallocate(heap, ptr, alignment, size, oldsize, + flags); +} + +extern RPMALLOC_ALLOCATOR void *rpaligned_alloc(size_t alignment, size_t size) { + heap_t *heap = get_thread_heap(); + return _rpmalloc_aligned_allocate(heap, alignment, size); +} + +extern inline RPMALLOC_ALLOCATOR void * +rpaligned_calloc(size_t alignment, size_t num, size_t size) { + size_t total; +#if ENABLE_VALIDATE_ARGS +#if PLATFORM_WINDOWS + int err = SizeTMult(num, size, &total); + if ((err != S_OK) || (total >= MAX_ALLOC_SIZE)) { + errno = EINVAL; + return 0; + } +#else + int err = __builtin_umull_overflow(num, size, &total); + if (err || (total >= MAX_ALLOC_SIZE)) { + errno = EINVAL; + return 0; + } +#endif +#else + total = num * size; +#endif + void *block = rpaligned_alloc(alignment, total); + if (block) + memset(block, 0, total); + return block; +} + +extern inline RPMALLOC_ALLOCATOR void *rpmemalign(size_t alignment, + size_t size) { + return rpaligned_alloc(alignment, size); +} + +extern inline int rpposix_memalign(void **memptr, size_t alignment, + size_t size) { + if (memptr) + *memptr = rpaligned_alloc(alignment, size); + else + return EINVAL; + return *memptr ? 0 : ENOMEM; +} + +extern inline size_t rpmalloc_usable_size(void *ptr) { + return (ptr ? 
_rpmalloc_usable_size(ptr) : 0); +} + +extern inline void rpmalloc_thread_collect(void) {} + +void rpmalloc_thread_statistics(rpmalloc_thread_statistics_t *stats) { + memset(stats, 0, sizeof(rpmalloc_thread_statistics_t)); + heap_t *heap = get_thread_heap_raw(); + if (!heap) + return; + + for (size_t iclass = 0; iclass < SIZE_CLASS_COUNT; ++iclass) { + size_class_t *size_class = _memory_size_class + iclass; + span_t *span = heap->size_class[iclass].partial_span; + while (span) { + size_t free_count = span->list_size; + size_t block_count = size_class->block_count; + if (span->free_list_limit < block_count) + block_count = span->free_list_limit; + free_count += (block_count - span->used_count); + stats->sizecache += free_count * size_class->block_size; + span = span->next; + } + } + +#if ENABLE_THREAD_CACHE + for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { + span_cache_t *span_cache; + if (!iclass) + span_cache = &heap->span_cache; + else + span_cache = (span_cache_t *)(heap->span_large_cache + (iclass - 1)); + stats->spancache += span_cache->count * (iclass + 1) * _memory_span_size; + } +#endif + + span_t *deferred = (span_t *)atomic_load_ptr(&heap->span_free_deferred); + while (deferred) { + if (deferred->size_class != SIZE_CLASS_HUGE) + stats->spancache += (size_t)deferred->span_count * _memory_span_size; + deferred = (span_t *)deferred->free_list; + } + +#if ENABLE_STATISTICS + stats->thread_to_global = (size_t)atomic_load64(&heap->thread_to_global); + stats->global_to_thread = (size_t)atomic_load64(&heap->global_to_thread); + + for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { + stats->span_use[iclass].current = + (size_t)atomic_load32(&heap->span_use[iclass].current); + stats->span_use[iclass].peak = + (size_t)atomic_load32(&heap->span_use[iclass].high); + stats->span_use[iclass].to_global = + (size_t)atomic_load32(&heap->span_use[iclass].spans_to_global); + stats->span_use[iclass].from_global = + (size_t)atomic_load32(&heap->span_use[iclass].spans_from_global); + stats->span_use[iclass].to_cache = + (size_t)atomic_load32(&heap->span_use[iclass].spans_to_cache); + stats->span_use[iclass].from_cache = + (size_t)atomic_load32(&heap->span_use[iclass].spans_from_cache); + stats->span_use[iclass].to_reserved = + (size_t)atomic_load32(&heap->span_use[iclass].spans_to_reserved); + stats->span_use[iclass].from_reserved = + (size_t)atomic_load32(&heap->span_use[iclass].spans_from_reserved); + stats->span_use[iclass].map_calls = + (size_t)atomic_load32(&heap->span_use[iclass].spans_map_calls); + } + for (size_t iclass = 0; iclass < SIZE_CLASS_COUNT; ++iclass) { + stats->size_use[iclass].alloc_current = + (size_t)atomic_load32(&heap->size_class_use[iclass].alloc_current); + stats->size_use[iclass].alloc_peak = + (size_t)heap->size_class_use[iclass].alloc_peak; + stats->size_use[iclass].alloc_total = + (size_t)atomic_load32(&heap->size_class_use[iclass].alloc_total); + stats->size_use[iclass].free_total = + (size_t)atomic_load32(&heap->size_class_use[iclass].free_total); + stats->size_use[iclass].spans_to_cache = + (size_t)atomic_load32(&heap->size_class_use[iclass].spans_to_cache); + stats->size_use[iclass].spans_from_cache = + (size_t)atomic_load32(&heap->size_class_use[iclass].spans_from_cache); + stats->size_use[iclass].spans_from_reserved = (size_t)atomic_load32( + &heap->size_class_use[iclass].spans_from_reserved); + stats->size_use[iclass].map_calls = + (size_t)atomic_load32(&heap->size_class_use[iclass].spans_map_calls); + } +#endif +} + +void 
rpmalloc_global_statistics(rpmalloc_global_statistics_t *stats) { + memset(stats, 0, sizeof(rpmalloc_global_statistics_t)); +#if ENABLE_STATISTICS + stats->mapped = (size_t)atomic_load32(&_mapped_pages) * _memory_page_size; + stats->mapped_peak = (size_t)_mapped_pages_peak * _memory_page_size; + stats->mapped_total = + (size_t)atomic_load32(&_mapped_total) * _memory_page_size; + stats->unmapped_total = + (size_t)atomic_load32(&_unmapped_total) * _memory_page_size; + stats->huge_alloc = + (size_t)atomic_load32(&_huge_pages_current) * _memory_page_size; + stats->huge_alloc_peak = (size_t)_huge_pages_peak * _memory_page_size; +#endif +#if ENABLE_GLOBAL_CACHE + for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { + global_cache_t *cache = &_memory_span_cache[iclass]; + while (!atomic_cas32_acquire(&cache->lock, 1, 0)) + _rpmalloc_spin(); + uint32_t count = cache->count; +#if ENABLE_UNLIMITED_CACHE + span_t *current_span = cache->overflow; + while (current_span) { + ++count; + current_span = current_span->next; + } +#endif + atomic_store32_release(&cache->lock, 0); + stats->cached += count * (iclass + 1) * _memory_span_size; + } +#endif +} + +#if ENABLE_STATISTICS + +static void _memory_heap_dump_statistics(heap_t *heap, void *file) { + fprintf(file, "Heap %d stats:\n", heap->id); + fprintf(file, "Class CurAlloc PeakAlloc TotAlloc TotFree BlkSize " + "BlkCount SpansCur SpansPeak PeakAllocMiB ToCacheMiB " + "FromCacheMiB FromReserveMiB MmapCalls\n"); + for (size_t iclass = 0; iclass < SIZE_CLASS_COUNT; ++iclass) { + if (!atomic_load32(&heap->size_class_use[iclass].alloc_total)) + continue; + fprintf( + file, + "%3u: %10u %10u %10u %10u %8u %8u %8d %9d %13zu %11zu %12zu %14zu " + "%9u\n", + (uint32_t)iclass, + atomic_load32(&heap->size_class_use[iclass].alloc_current), + heap->size_class_use[iclass].alloc_peak, + atomic_load32(&heap->size_class_use[iclass].alloc_total), + atomic_load32(&heap->size_class_use[iclass].free_total), + _memory_size_class[iclass].block_size, + _memory_size_class[iclass].block_count, + atomic_load32(&heap->size_class_use[iclass].spans_current), + heap->size_class_use[iclass].spans_peak, + ((size_t)heap->size_class_use[iclass].alloc_peak * + (size_t)_memory_size_class[iclass].block_size) / + (size_t)(1024 * 1024), + ((size_t)atomic_load32(&heap->size_class_use[iclass].spans_to_cache) * + _memory_span_size) / + (size_t)(1024 * 1024), + ((size_t)atomic_load32(&heap->size_class_use[iclass].spans_from_cache) * + _memory_span_size) / + (size_t)(1024 * 1024), + ((size_t)atomic_load32( + &heap->size_class_use[iclass].spans_from_reserved) * + _memory_span_size) / + (size_t)(1024 * 1024), + atomic_load32(&heap->size_class_use[iclass].spans_map_calls)); + } + fprintf(file, "Spans Current Peak Deferred PeakMiB Cached ToCacheMiB " + "FromCacheMiB ToReserveMiB FromReserveMiB ToGlobalMiB " + "FromGlobalMiB MmapCalls\n"); + for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { + if (!atomic_load32(&heap->span_use[iclass].high) && + !atomic_load32(&heap->span_use[iclass].spans_map_calls)) + continue; + fprintf( + file, + "%4u: %8d %8u %8u %8zu %7u %11zu %12zu %12zu %14zu %11zu %13zu %10u\n", + (uint32_t)(iclass + 1), atomic_load32(&heap->span_use[iclass].current), + atomic_load32(&heap->span_use[iclass].high), + atomic_load32(&heap->span_use[iclass].spans_deferred), + ((size_t)atomic_load32(&heap->span_use[iclass].high) * + (size_t)_memory_span_size * (iclass + 1)) / + (size_t)(1024 * 1024), +#if ENABLE_THREAD_CACHE + (unsigned int)(!iclass ? 
heap->span_cache.count + : heap->span_large_cache[iclass - 1].count), + ((size_t)atomic_load32(&heap->span_use[iclass].spans_to_cache) * + (iclass + 1) * _memory_span_size) / + (size_t)(1024 * 1024), + ((size_t)atomic_load32(&heap->span_use[iclass].spans_from_cache) * + (iclass + 1) * _memory_span_size) / + (size_t)(1024 * 1024), +#else + 0, (size_t)0, (size_t)0, +#endif + ((size_t)atomic_load32(&heap->span_use[iclass].spans_to_reserved) * + (iclass + 1) * _memory_span_size) / + (size_t)(1024 * 1024), + ((size_t)atomic_load32(&heap->span_use[iclass].spans_from_reserved) * + (iclass + 1) * _memory_span_size) / + (size_t)(1024 * 1024), + ((size_t)atomic_load32(&heap->span_use[iclass].spans_to_global) * + (size_t)_memory_span_size * (iclass + 1)) / + (size_t)(1024 * 1024), + ((size_t)atomic_load32(&heap->span_use[iclass].spans_from_global) * + (size_t)_memory_span_size * (iclass + 1)) / + (size_t)(1024 * 1024), + atomic_load32(&heap->span_use[iclass].spans_map_calls)); + } + fprintf(file, "Full spans: %zu\n", heap->full_span_count); + fprintf(file, "ThreadToGlobalMiB GlobalToThreadMiB\n"); + fprintf( + file, "%17zu %17zu\n", + (size_t)atomic_load64(&heap->thread_to_global) / (size_t)(1024 * 1024), + (size_t)atomic_load64(&heap->global_to_thread) / (size_t)(1024 * 1024)); +} + +#endif + +void rpmalloc_dump_statistics(void *file) { +#if ENABLE_STATISTICS + for (size_t list_idx = 0; list_idx < HEAP_ARRAY_SIZE; ++list_idx) { + heap_t *heap = _memory_heaps[list_idx]; + while (heap) { + int need_dump = 0; + for (size_t iclass = 0; !need_dump && (iclass < SIZE_CLASS_COUNT); + ++iclass) { + if (!atomic_load32(&heap->size_class_use[iclass].alloc_total)) { + rpmalloc_assert( + !atomic_load32(&heap->size_class_use[iclass].free_total), + "Heap statistics counter mismatch"); + rpmalloc_assert( + !atomic_load32(&heap->size_class_use[iclass].spans_map_calls), + "Heap statistics counter mismatch"); + continue; + } + need_dump = 1; + } + for (size_t iclass = 0; !need_dump && (iclass < LARGE_CLASS_COUNT); + ++iclass) { + if (!atomic_load32(&heap->span_use[iclass].high) && + !atomic_load32(&heap->span_use[iclass].spans_map_calls)) + continue; + need_dump = 1; + } + if (need_dump) + _memory_heap_dump_statistics(heap, file); + heap = heap->next_heap; + } + } + fprintf(file, "Global stats:\n"); + size_t huge_current = + (size_t)atomic_load32(&_huge_pages_current) * _memory_page_size; + size_t huge_peak = (size_t)_huge_pages_peak * _memory_page_size; + fprintf(file, "HugeCurrentMiB HugePeakMiB\n"); + fprintf(file, "%14zu %11zu\n", huge_current / (size_t)(1024 * 1024), + huge_peak / (size_t)(1024 * 1024)); + +#if ENABLE_GLOBAL_CACHE + fprintf(file, "GlobalCacheMiB\n"); + for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { + global_cache_t *cache = _memory_span_cache + iclass; + size_t global_cache = (size_t)cache->count * iclass * _memory_span_size; + + size_t global_overflow_cache = 0; + span_t *span = cache->overflow; + while (span) { + global_overflow_cache += iclass * _memory_span_size; + span = span->next; + } + if (global_cache || global_overflow_cache || cache->insert_count || + cache->extract_count) + fprintf(file, + "%4zu: %8zuMiB (%8zuMiB overflow) %14zu insert %14zu extract\n", + iclass + 1, global_cache / (size_t)(1024 * 1024), + global_overflow_cache / (size_t)(1024 * 1024), + cache->insert_count, cache->extract_count); + } +#endif + + size_t mapped = (size_t)atomic_load32(&_mapped_pages) * _memory_page_size; + size_t mapped_os = + (size_t)atomic_load32(&_mapped_pages_os) * 
_memory_page_size; + size_t mapped_peak = (size_t)_mapped_pages_peak * _memory_page_size; + size_t mapped_total = + (size_t)atomic_load32(&_mapped_total) * _memory_page_size; + size_t unmapped_total = + (size_t)atomic_load32(&_unmapped_total) * _memory_page_size; + fprintf( + file, + "MappedMiB MappedOSMiB MappedPeakMiB MappedTotalMiB UnmappedTotalMiB\n"); + fprintf(file, "%9zu %11zu %13zu %14zu %16zu\n", + mapped / (size_t)(1024 * 1024), mapped_os / (size_t)(1024 * 1024), + mapped_peak / (size_t)(1024 * 1024), + mapped_total / (size_t)(1024 * 1024), + unmapped_total / (size_t)(1024 * 1024)); + + fprintf(file, "\n"); +#if 0 + int64_t allocated = atomic_load64(&_allocation_counter); + int64_t deallocated = atomic_load64(&_deallocation_counter); + fprintf(file, "Allocation count: %lli\n", allocated); + fprintf(file, "Deallocation count: %lli\n", deallocated); + fprintf(file, "Current allocations: %lli\n", (allocated - deallocated)); + fprintf(file, "Master spans: %d\n", atomic_load32(&_master_spans)); + fprintf(file, "Dangling master spans: %d\n", atomic_load32(&_unmapped_master_spans)); +#endif +#endif + (void)sizeof(file); +} + +#if RPMALLOC_FIRST_CLASS_HEAPS + +extern inline rpmalloc_heap_t *rpmalloc_heap_acquire(void) { + // Must be a pristine heap from newly mapped memory pages, or else memory + // blocks could already be allocated from the heap which would (wrongly) be + // released when heap is cleared with rpmalloc_heap_free_all(). Also heaps + // guaranteed to be pristine from the dedicated orphan list can be used. + heap_t *heap = _rpmalloc_heap_allocate(1); + rpmalloc_assume(heap != NULL); + heap->owner_thread = 0; + _rpmalloc_stat_inc(&_memory_active_heaps); + return heap; +} + +extern inline void rpmalloc_heap_release(rpmalloc_heap_t *heap) { + if (heap) + _rpmalloc_heap_release(heap, 1, 1); +} + +extern inline RPMALLOC_ALLOCATOR void * +rpmalloc_heap_alloc(rpmalloc_heap_t *heap, size_t size) { +#if ENABLE_VALIDATE_ARGS + if (size >= MAX_ALLOC_SIZE) { + errno = EINVAL; + return 0; + } +#endif + return _rpmalloc_allocate(heap, size); +} + +extern inline RPMALLOC_ALLOCATOR void * +rpmalloc_heap_aligned_alloc(rpmalloc_heap_t *heap, size_t alignment, + size_t size) { +#if ENABLE_VALIDATE_ARGS + if (size >= MAX_ALLOC_SIZE) { + errno = EINVAL; + return 0; + } +#endif + return _rpmalloc_aligned_allocate(heap, alignment, size); +} + +extern inline RPMALLOC_ALLOCATOR void * +rpmalloc_heap_calloc(rpmalloc_heap_t *heap, size_t num, size_t size) { + return rpmalloc_heap_aligned_calloc(heap, 0, num, size); +} + +extern inline RPMALLOC_ALLOCATOR void * +rpmalloc_heap_aligned_calloc(rpmalloc_heap_t *heap, size_t alignment, + size_t num, size_t size) { + size_t total; +#if ENABLE_VALIDATE_ARGS +#if PLATFORM_WINDOWS + int err = SizeTMult(num, size, &total); + if ((err != S_OK) || (total >= MAX_ALLOC_SIZE)) { + errno = EINVAL; + return 0; + } +#else + int err = __builtin_umull_overflow(num, size, &total); + if (err || (total >= MAX_ALLOC_SIZE)) { + errno = EINVAL; + return 0; + } +#endif +#else + total = num * size; +#endif + void *block = _rpmalloc_aligned_allocate(heap, alignment, total); + if (block) + memset(block, 0, total); + return block; +} + +extern inline RPMALLOC_ALLOCATOR void * +rpmalloc_heap_realloc(rpmalloc_heap_t *heap, void *ptr, size_t size, + unsigned int flags) { +#if ENABLE_VALIDATE_ARGS + if (size >= MAX_ALLOC_SIZE) { + errno = EINVAL; + return ptr; + } +#endif + return _rpmalloc_reallocate(heap, ptr, size, 0, flags); +} + +extern inline RPMALLOC_ALLOCATOR void * 
+rpmalloc_heap_aligned_realloc(rpmalloc_heap_t *heap, void *ptr, + size_t alignment, size_t size, + unsigned int flags) { +#if ENABLE_VALIDATE_ARGS + if ((size + alignment < size) || (alignment > _memory_page_size)) { + errno = EINVAL; + return 0; + } +#endif + return _rpmalloc_aligned_reallocate(heap, ptr, alignment, size, 0, flags); +} + +extern inline void rpmalloc_heap_free(rpmalloc_heap_t *heap, void *ptr) { + (void)sizeof(heap); + _rpmalloc_deallocate(ptr); +} + +extern inline void rpmalloc_heap_free_all(rpmalloc_heap_t *heap) { + span_t *span; + span_t *next_span; + + _rpmalloc_heap_cache_adopt_deferred(heap, 0); + + for (size_t iclass = 0; iclass < SIZE_CLASS_COUNT; ++iclass) { + span = heap->size_class[iclass].partial_span; + while (span) { + next_span = span->next; + _rpmalloc_heap_cache_insert(heap, span); + span = next_span; + } + heap->size_class[iclass].partial_span = 0; + span = heap->full_span[iclass]; + while (span) { + next_span = span->next; + _rpmalloc_heap_cache_insert(heap, span); + span = next_span; + } + + span = heap->size_class[iclass].cache; + if (span) + _rpmalloc_heap_cache_insert(heap, span); + heap->size_class[iclass].cache = 0; + } + memset(heap->size_class, 0, sizeof(heap->size_class)); + memset(heap->full_span, 0, sizeof(heap->full_span)); + + span = heap->large_huge_span; + while (span) { + next_span = span->next; + if (UNEXPECTED(span->size_class == SIZE_CLASS_HUGE)) + _rpmalloc_deallocate_huge(span); + else + _rpmalloc_heap_cache_insert(heap, span); + span = next_span; + } + heap->large_huge_span = 0; + heap->full_span_count = 0; + +#if ENABLE_THREAD_CACHE + for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { + span_cache_t *span_cache; + if (!iclass) + span_cache = &heap->span_cache; + else + span_cache = (span_cache_t *)(heap->span_large_cache + (iclass - 1)); + if (!span_cache->count) + continue; +#if ENABLE_GLOBAL_CACHE + _rpmalloc_stat_add64(&heap->thread_to_global, + span_cache->count * (iclass + 1) * _memory_span_size); + _rpmalloc_stat_add(&heap->span_use[iclass].spans_to_global, + span_cache->count); + _rpmalloc_global_cache_insert_spans(span_cache->span, iclass + 1, + span_cache->count); +#else + for (size_t ispan = 0; ispan < span_cache->count; ++ispan) + _rpmalloc_span_unmap(span_cache->span[ispan]); +#endif + span_cache->count = 0; + } +#endif + +#if ENABLE_STATISTICS + for (size_t iclass = 0; iclass < SIZE_CLASS_COUNT; ++iclass) { + atomic_store32(&heap->size_class_use[iclass].alloc_current, 0); + atomic_store32(&heap->size_class_use[iclass].spans_current, 0); + } + for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { + atomic_store32(&heap->span_use[iclass].current, 0); + } +#endif +} + +extern inline void rpmalloc_heap_thread_set_current(rpmalloc_heap_t *heap) { + heap_t *prev_heap = get_thread_heap_raw(); + if (prev_heap != heap) { + set_thread_heap(heap); + if (prev_heap) + rpmalloc_heap_release(prev_heap); + } +} + +extern inline rpmalloc_heap_t *rpmalloc_get_heap_for_ptr(void *ptr) { + // Grab the span, and then the heap from the span + span_t *span = (span_t *)((uintptr_t)ptr & _memory_span_mask); + if (span) { + return span->heap; + } + return 0; +} + +#endif + +#if ENABLE_PRELOAD || ENABLE_OVERRIDE + +#include "malloc.c" + +#endif + +void rpmalloc_linker_reference(void) { (void)sizeof(_rpmalloc_initialized); } diff --git a/llvm/lib/Support/rpmalloc/rpmalloc.h b/llvm/lib/Support/rpmalloc/rpmalloc.h index 3911c53b779b36c3e52fbbc177c2306b56f46ede..5b7fe1ff4286ba724939ed9b76e27d95f124e6b9 100644 --- 
a/llvm/lib/Support/rpmalloc/rpmalloc.h +++ b/llvm/lib/Support/rpmalloc/rpmalloc.h @@ -1,428 +1,428 @@ -//===---------------------- rpmalloc.h ------------------*- C -*-=============// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This library provides a cross-platform lock free thread caching malloc -// implementation in C11. -// -//===----------------------------------------------------------------------===// - -#pragma once - -#include - -#ifdef __cplusplus -extern "C" { -#endif - -#if defined(__clang__) || defined(__GNUC__) -#define RPMALLOC_EXPORT __attribute__((visibility("default"))) -#define RPMALLOC_ALLOCATOR -#if (defined(__clang_major__) && (__clang_major__ < 4)) || \ - (defined(__GNUC__) && defined(ENABLE_PRELOAD) && ENABLE_PRELOAD) -#define RPMALLOC_ATTRIB_MALLOC -#define RPMALLOC_ATTRIB_ALLOC_SIZE(size) -#define RPMALLOC_ATTRIB_ALLOC_SIZE2(count, size) -#else -#define RPMALLOC_ATTRIB_MALLOC __attribute__((__malloc__)) -#define RPMALLOC_ATTRIB_ALLOC_SIZE(size) __attribute__((alloc_size(size))) -#define RPMALLOC_ATTRIB_ALLOC_SIZE2(count, size) \ - __attribute__((alloc_size(count, size))) -#endif -#define RPMALLOC_CDECL -#elif defined(_MSC_VER) -#define RPMALLOC_EXPORT -#define RPMALLOC_ALLOCATOR __declspec(allocator) __declspec(restrict) -#define RPMALLOC_ATTRIB_MALLOC -#define RPMALLOC_ATTRIB_ALLOC_SIZE(size) -#define RPMALLOC_ATTRIB_ALLOC_SIZE2(count, size) -#define RPMALLOC_CDECL __cdecl -#else -#define RPMALLOC_EXPORT -#define RPMALLOC_ALLOCATOR -#define RPMALLOC_ATTRIB_MALLOC -#define RPMALLOC_ATTRIB_ALLOC_SIZE(size) -#define RPMALLOC_ATTRIB_ALLOC_SIZE2(count, size) -#define RPMALLOC_CDECL -#endif - -//! Define RPMALLOC_CONFIGURABLE to enable configuring sizes. Will introduce -// a very small overhead due to some size calculations not being compile time -// constants -#ifndef RPMALLOC_CONFIGURABLE -#define RPMALLOC_CONFIGURABLE 0 -#endif - -//! Define RPMALLOC_FIRST_CLASS_HEAPS to enable heap based API (rpmalloc_heap_* -//! functions). -// Will introduce a very small overhead to track fully allocated spans in heaps -#ifndef RPMALLOC_FIRST_CLASS_HEAPS -#define RPMALLOC_FIRST_CLASS_HEAPS 0 -#endif - -//! Flag to rpaligned_realloc to not preserve content in reallocation -#define RPMALLOC_NO_PRESERVE 1 -//! Flag to rpaligned_realloc to fail and return null pointer if grow cannot be -//! done in-place, -// in which case the original pointer is still valid (just like a call to -// realloc which failes to allocate a new block). -#define RPMALLOC_GROW_OR_FAIL 2 - -typedef struct rpmalloc_global_statistics_t { - //! Current amount of virtual memory mapped, all of which might not have been - //! committed (only if ENABLE_STATISTICS=1) - size_t mapped; - //! Peak amount of virtual memory mapped, all of which might not have been - //! committed (only if ENABLE_STATISTICS=1) - size_t mapped_peak; - //! Current amount of memory in global caches for small and medium sizes - //! (<32KiB) - size_t cached; - //! Current amount of memory allocated in huge allocations, i.e larger than - //! LARGE_SIZE_LIMIT which is 2MiB by default (only if ENABLE_STATISTICS=1) - size_t huge_alloc; - //! Peak amount of memory allocated in huge allocations, i.e larger than - //! 
LARGE_SIZE_LIMIT which is 2MiB by default (only if ENABLE_STATISTICS=1) - size_t huge_alloc_peak; - //! Total amount of memory mapped since initialization (only if - //! ENABLE_STATISTICS=1) - size_t mapped_total; - //! Total amount of memory unmapped since initialization (only if - //! ENABLE_STATISTICS=1) - size_t unmapped_total; -} rpmalloc_global_statistics_t; - -typedef struct rpmalloc_thread_statistics_t { - //! Current number of bytes available in thread size class caches for small - //! and medium sizes (<32KiB) - size_t sizecache; - //! Current number of bytes available in thread span caches for small and - //! medium sizes (<32KiB) - size_t spancache; - //! Total number of bytes transitioned from thread cache to global cache (only - //! if ENABLE_STATISTICS=1) - size_t thread_to_global; - //! Total number of bytes transitioned from global cache to thread cache (only - //! if ENABLE_STATISTICS=1) - size_t global_to_thread; - //! Per span count statistics (only if ENABLE_STATISTICS=1) - struct { - //! Currently used number of spans - size_t current; - //! High water mark of spans used - size_t peak; - //! Number of spans transitioned to global cache - size_t to_global; - //! Number of spans transitioned from global cache - size_t from_global; - //! Number of spans transitioned to thread cache - size_t to_cache; - //! Number of spans transitioned from thread cache - size_t from_cache; - //! Number of spans transitioned to reserved state - size_t to_reserved; - //! Number of spans transitioned from reserved state - size_t from_reserved; - //! Number of raw memory map calls (not hitting the reserve spans but - //! resulting in actual OS mmap calls) - size_t map_calls; - } span_use[64]; - //! Per size class statistics (only if ENABLE_STATISTICS=1) - struct { - //! Current number of allocations - size_t alloc_current; - //! Peak number of allocations - size_t alloc_peak; - //! Total number of allocations - size_t alloc_total; - //! Total number of frees - size_t free_total; - //! Number of spans transitioned to cache - size_t spans_to_cache; - //! Number of spans transitioned from cache - size_t spans_from_cache; - //! Number of spans transitioned from reserved state - size_t spans_from_reserved; - //! Number of raw memory map calls (not hitting the reserve spans but - //! resulting in actual OS mmap calls) - size_t map_calls; - } size_use[128]; -} rpmalloc_thread_statistics_t; - -typedef struct rpmalloc_config_t { - //! Map memory pages for the given number of bytes. The returned address MUST - //! be - // aligned to the rpmalloc span size, which will always be a power of two. - // Optionally the function can store an alignment offset in the offset - // variable in case it performs alignment and the returned pointer is offset - // from the actual start of the memory region due to this alignment. The - // alignment offset will be passed to the memory unmap function. The - // alignment offset MUST NOT be larger than 65535 (storable in an uint16_t), - // if it is you must use natural alignment to shift it into 16 bits. If you - // set a memory_map function, you must also set a memory_unmap function or - // else the default implementation will be used for both. This function must - // be thread safe, it can be called by multiple threads simultaneously. - void *(*memory_map)(size_t size, size_t *offset); - //! Unmap the memory pages starting at address and spanning the given number - //! of bytes. 
- // If release is set to non-zero, the unmap is for an entire span range as - // returned by a previous call to memory_map and that the entire range should - // be released. The release argument holds the size of the entire span range. - // If release is set to 0, the unmap is a partial decommit of a subset of the - // mapped memory range. If you set a memory_unmap function, you must also set - // a memory_map function or else the default implementation will be used for - // both. This function must be thread safe, it can be called by multiple - // threads simultaneously. - void (*memory_unmap)(void *address, size_t size, size_t offset, - size_t release); - //! Called when an assert fails, if asserts are enabled. Will use the standard - //! assert() - // if this is not set. - void (*error_callback)(const char *message); - //! Called when a call to map memory pages fails (out of memory). If this - //! callback is - // not set or returns zero the library will return a null pointer in the - // allocation call. If this callback returns non-zero the map call will be - // retried. The argument passed is the number of bytes that was requested in - // the map call. Only used if the default system memory map function is used - // (memory_map callback is not set). - int (*map_fail_callback)(size_t size); - //! Size of memory pages. The page size MUST be a power of two. All memory - //! mapping - // requests to memory_map will be made with size set to a multiple of the - // page size. Used if RPMALLOC_CONFIGURABLE is defined to 1, otherwise system - // page size is used. - size_t page_size; - //! Size of a span of memory blocks. MUST be a power of two, and in - //! [4096,262144] - // range (unless 0 - set to 0 to use the default span size). Used if - // RPMALLOC_CONFIGURABLE is defined to 1. - size_t span_size; - //! Number of spans to map at each request to map new virtual memory blocks. - //! This can - // be used to minimize the system call overhead at the cost of virtual memory - // address space. The extra mapped pages will not be written until actually - // used, so physical committed memory should not be affected in the default - // implementation. Will be aligned to a multiple of spans that match memory - // page size in case of huge pages. - size_t span_map_count; - //! Enable use of large/huge pages. If this flag is set to non-zero and page - //! size is - // zero, the allocator will try to enable huge pages and auto detect the - // configuration. If this is set to non-zero and page_size is also non-zero, - // the allocator will assume huge pages have been configured and enabled - // prior to initializing the allocator. For Windows, see - // https://docs.microsoft.com/en-us/windows/desktop/memory/large-page-support - // For Linux, see https://www.kernel.org/doc/Documentation/vm/hugetlbpage.txt - int enable_huge_pages; - //! Respectively allocated pages and huge allocated pages names for systems - // supporting it to be able to distinguish among anonymous regions. - const char *page_name; - const char *huge_page_name; -} rpmalloc_config_t; - -//! Initialize allocator with default configuration -RPMALLOC_EXPORT int rpmalloc_initialize(void); - -//! Initialize allocator with given configuration -RPMALLOC_EXPORT int rpmalloc_initialize_config(const rpmalloc_config_t *config); - -//! Get allocator configuration -RPMALLOC_EXPORT const rpmalloc_config_t *rpmalloc_config(void); - -//! Finalize allocator -RPMALLOC_EXPORT void rpmalloc_finalize(void); - -//! 
Initialize allocator for calling thread -RPMALLOC_EXPORT void rpmalloc_thread_initialize(void); - -//! Finalize allocator for calling thread -RPMALLOC_EXPORT void rpmalloc_thread_finalize(int release_caches); - -//! Perform deferred deallocations pending for the calling thread heap -RPMALLOC_EXPORT void rpmalloc_thread_collect(void); - -//! Query if allocator is initialized for calling thread -RPMALLOC_EXPORT int rpmalloc_is_thread_initialized(void); - -//! Get per-thread statistics -RPMALLOC_EXPORT void -rpmalloc_thread_statistics(rpmalloc_thread_statistics_t *stats); - -//! Get global statistics -RPMALLOC_EXPORT void -rpmalloc_global_statistics(rpmalloc_global_statistics_t *stats); - -//! Dump all statistics in human readable format to file (should be a FILE*) -RPMALLOC_EXPORT void rpmalloc_dump_statistics(void *file); - -//! Allocate a memory block of at least the given size -RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void * -rpmalloc(size_t size) RPMALLOC_ATTRIB_MALLOC RPMALLOC_ATTRIB_ALLOC_SIZE(1); - -//! Free the given memory block -RPMALLOC_EXPORT void rpfree(void *ptr); - -//! Allocate a memory block of at least the given size and zero initialize it -RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void * -rpcalloc(size_t num, size_t size) RPMALLOC_ATTRIB_MALLOC - RPMALLOC_ATTRIB_ALLOC_SIZE2(1, 2); - -//! Reallocate the given block to at least the given size -RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void * -rprealloc(void *ptr, size_t size) RPMALLOC_ATTRIB_MALLOC - RPMALLOC_ATTRIB_ALLOC_SIZE(2); - -//! Reallocate the given block to at least the given size and alignment, -// with optional control flags (see RPMALLOC_NO_PRESERVE). -// Alignment must be a power of two and a multiple of sizeof(void*), -// and should ideally be less than memory page size. A caveat of rpmalloc -// internals is that this must also be strictly less than the span size -// (default 64KiB) -RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void * -rpaligned_realloc(void *ptr, size_t alignment, size_t size, size_t oldsize, - unsigned int flags) RPMALLOC_ATTRIB_MALLOC - RPMALLOC_ATTRIB_ALLOC_SIZE(3); - -//! Allocate a memory block of at least the given size and alignment. -// Alignment must be a power of two and a multiple of sizeof(void*), -// and should ideally be less than memory page size. A caveat of rpmalloc -// internals is that this must also be strictly less than the span size -// (default 64KiB) -RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void * -rpaligned_alloc(size_t alignment, size_t size) RPMALLOC_ATTRIB_MALLOC - RPMALLOC_ATTRIB_ALLOC_SIZE(2); - -//! Allocate a memory block of at least the given size and alignment, and zero -//! initialize it. -// Alignment must be a power of two and a multiple of sizeof(void*), -// and should ideally be less than memory page size. A caveat of rpmalloc -// internals is that this must also be strictly less than the span size -// (default 64KiB) -RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void * -rpaligned_calloc(size_t alignment, size_t num, - size_t size) RPMALLOC_ATTRIB_MALLOC - RPMALLOC_ATTRIB_ALLOC_SIZE2(2, 3); - -//! Allocate a memory block of at least the given size and alignment. -// Alignment must be a power of two and a multiple of sizeof(void*), -// and should ideally be less than memory page size. A caveat of rpmalloc -// internals is that this must also be strictly less than the span size -// (default 64KiB) -RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void * -rpmemalign(size_t alignment, size_t size) RPMALLOC_ATTRIB_MALLOC - RPMALLOC_ATTRIB_ALLOC_SIZE(2); - -//! 
Allocate a memory block of at least the given size and alignment. -// Alignment must be a power of two and a multiple of sizeof(void*), -// and should ideally be less than memory page size. A caveat of rpmalloc -// internals is that this must also be strictly less than the span size -// (default 64KiB) -RPMALLOC_EXPORT int rpposix_memalign(void **memptr, size_t alignment, - size_t size); - -//! Query the usable size of the given memory block (from given pointer to the -//! end of block) -RPMALLOC_EXPORT size_t rpmalloc_usable_size(void *ptr); - -//! Dummy empty function for forcing linker symbol inclusion -RPMALLOC_EXPORT void rpmalloc_linker_reference(void); - -#if RPMALLOC_FIRST_CLASS_HEAPS - -//! Heap type -typedef struct heap_t rpmalloc_heap_t; - -//! Acquire a new heap. Will reuse existing released heaps or allocate memory -//! for a new heap -// if none available. Heap API is implemented with the strict assumption that -// only one single thread will call heap functions for a given heap at any -// given time, no functions are thread safe. -RPMALLOC_EXPORT rpmalloc_heap_t *rpmalloc_heap_acquire(void); - -//! Release a heap (does NOT free the memory allocated by the heap, use -//! rpmalloc_heap_free_all before destroying the heap). -// Releasing a heap will enable it to be reused by other threads. Safe to pass -// a null pointer. -RPMALLOC_EXPORT void rpmalloc_heap_release(rpmalloc_heap_t *heap); - -//! Allocate a memory block of at least the given size using the given heap. -RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void * -rpmalloc_heap_alloc(rpmalloc_heap_t *heap, size_t size) RPMALLOC_ATTRIB_MALLOC - RPMALLOC_ATTRIB_ALLOC_SIZE(2); - -//! Allocate a memory block of at least the given size using the given heap. The -//! returned -// block will have the requested alignment. Alignment must be a power of two -// and a multiple of sizeof(void*), and should ideally be less than memory page -// size. A caveat of rpmalloc internals is that this must also be strictly less -// than the span size (default 64KiB). -RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void * -rpmalloc_heap_aligned_alloc(rpmalloc_heap_t *heap, size_t alignment, - size_t size) RPMALLOC_ATTRIB_MALLOC - RPMALLOC_ATTRIB_ALLOC_SIZE(3); - -//! Allocate a memory block of at least the given size using the given heap and -//! zero initialize it. -RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void * -rpmalloc_heap_calloc(rpmalloc_heap_t *heap, size_t num, - size_t size) RPMALLOC_ATTRIB_MALLOC - RPMALLOC_ATTRIB_ALLOC_SIZE2(2, 3); - -//! Allocate a memory block of at least the given size using the given heap and -//! zero initialize it. The returned -// block will have the requested alignment. Alignment must either be zero, or a -// power of two and a multiple of sizeof(void*), and should ideally be less -// than memory page size. A caveat of rpmalloc internals is that this must also -// be strictly less than the span size (default 64KiB). -RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void * -rpmalloc_heap_aligned_calloc(rpmalloc_heap_t *heap, size_t alignment, - size_t num, size_t size) RPMALLOC_ATTRIB_MALLOC - RPMALLOC_ATTRIB_ALLOC_SIZE2(2, 3); - -//! Reallocate the given block to at least the given size. The memory block MUST -//! be allocated -// by the same heap given to this function. -RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void * -rpmalloc_heap_realloc(rpmalloc_heap_t *heap, void *ptr, size_t size, - unsigned int flags) RPMALLOC_ATTRIB_MALLOC - RPMALLOC_ATTRIB_ALLOC_SIZE(3); - -//! Reallocate the given block to at least the given size. The memory block MUST -//! 
be allocated -// by the same heap given to this function. The returned block will have the -// requested alignment. Alignment must be either zero, or a power of two and a -// multiple of sizeof(void*), and should ideally be less than memory page size. -// A caveat of rpmalloc internals is that this must also be strictly less than -// the span size (default 64KiB). -RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void *rpmalloc_heap_aligned_realloc( - rpmalloc_heap_t *heap, void *ptr, size_t alignment, size_t size, - unsigned int flags) RPMALLOC_ATTRIB_MALLOC RPMALLOC_ATTRIB_ALLOC_SIZE(4); - -//! Free the given memory block from the given heap. The memory block MUST be -//! allocated -// by the same heap given to this function. -RPMALLOC_EXPORT void rpmalloc_heap_free(rpmalloc_heap_t *heap, void *ptr); - -//! Free all memory allocated by the heap -RPMALLOC_EXPORT void rpmalloc_heap_free_all(rpmalloc_heap_t *heap); - -//! Set the given heap as the current heap for the calling thread. A heap MUST -//! only be current heap -// for a single thread, a heap can never be shared between multiple threads. -// The previous current heap for the calling thread is released to be reused by -// other threads. -RPMALLOC_EXPORT void rpmalloc_heap_thread_set_current(rpmalloc_heap_t *heap); - -//! Returns which heap the given pointer is allocated on -RPMALLOC_EXPORT rpmalloc_heap_t *rpmalloc_get_heap_for_ptr(void *ptr); - -#endif - -#ifdef __cplusplus -} -#endif +//===---------------------- rpmalloc.h ------------------*- C -*-=============// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This library provides a cross-platform lock free thread caching malloc +// implementation in C11. +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#if defined(__clang__) || defined(__GNUC__) +#define RPMALLOC_EXPORT __attribute__((visibility("default"))) +#define RPMALLOC_ALLOCATOR +#if (defined(__clang_major__) && (__clang_major__ < 4)) || \ + (defined(__GNUC__) && defined(ENABLE_PRELOAD) && ENABLE_PRELOAD) +#define RPMALLOC_ATTRIB_MALLOC +#define RPMALLOC_ATTRIB_ALLOC_SIZE(size) +#define RPMALLOC_ATTRIB_ALLOC_SIZE2(count, size) +#else +#define RPMALLOC_ATTRIB_MALLOC __attribute__((__malloc__)) +#define RPMALLOC_ATTRIB_ALLOC_SIZE(size) __attribute__((alloc_size(size))) +#define RPMALLOC_ATTRIB_ALLOC_SIZE2(count, size) \ + __attribute__((alloc_size(count, size))) +#endif +#define RPMALLOC_CDECL +#elif defined(_MSC_VER) +#define RPMALLOC_EXPORT +#define RPMALLOC_ALLOCATOR __declspec(allocator) __declspec(restrict) +#define RPMALLOC_ATTRIB_MALLOC +#define RPMALLOC_ATTRIB_ALLOC_SIZE(size) +#define RPMALLOC_ATTRIB_ALLOC_SIZE2(count, size) +#define RPMALLOC_CDECL __cdecl +#else +#define RPMALLOC_EXPORT +#define RPMALLOC_ALLOCATOR +#define RPMALLOC_ATTRIB_MALLOC +#define RPMALLOC_ATTRIB_ALLOC_SIZE(size) +#define RPMALLOC_ATTRIB_ALLOC_SIZE2(count, size) +#define RPMALLOC_CDECL +#endif + +//! Define RPMALLOC_CONFIGURABLE to enable configuring sizes. Will introduce +// a very small overhead due to some size calculations not being compile time +// constants +#ifndef RPMALLOC_CONFIGURABLE +#define RPMALLOC_CONFIGURABLE 0 +#endif + +//! 
Define RPMALLOC_FIRST_CLASS_HEAPS to enable heap based API (rpmalloc_heap_* +//! functions). +// Will introduce a very small overhead to track fully allocated spans in heaps +#ifndef RPMALLOC_FIRST_CLASS_HEAPS +#define RPMALLOC_FIRST_CLASS_HEAPS 0 +#endif + +//! Flag to rpaligned_realloc to not preserve content in reallocation +#define RPMALLOC_NO_PRESERVE 1 +//! Flag to rpaligned_realloc to fail and return null pointer if grow cannot be +//! done in-place, +// in which case the original pointer is still valid (just like a call to +// realloc which failes to allocate a new block). +#define RPMALLOC_GROW_OR_FAIL 2 + +typedef struct rpmalloc_global_statistics_t { + //! Current amount of virtual memory mapped, all of which might not have been + //! committed (only if ENABLE_STATISTICS=1) + size_t mapped; + //! Peak amount of virtual memory mapped, all of which might not have been + //! committed (only if ENABLE_STATISTICS=1) + size_t mapped_peak; + //! Current amount of memory in global caches for small and medium sizes + //! (<32KiB) + size_t cached; + //! Current amount of memory allocated in huge allocations, i.e larger than + //! LARGE_SIZE_LIMIT which is 2MiB by default (only if ENABLE_STATISTICS=1) + size_t huge_alloc; + //! Peak amount of memory allocated in huge allocations, i.e larger than + //! LARGE_SIZE_LIMIT which is 2MiB by default (only if ENABLE_STATISTICS=1) + size_t huge_alloc_peak; + //! Total amount of memory mapped since initialization (only if + //! ENABLE_STATISTICS=1) + size_t mapped_total; + //! Total amount of memory unmapped since initialization (only if + //! ENABLE_STATISTICS=1) + size_t unmapped_total; +} rpmalloc_global_statistics_t; + +typedef struct rpmalloc_thread_statistics_t { + //! Current number of bytes available in thread size class caches for small + //! and medium sizes (<32KiB) + size_t sizecache; + //! Current number of bytes available in thread span caches for small and + //! medium sizes (<32KiB) + size_t spancache; + //! Total number of bytes transitioned from thread cache to global cache (only + //! if ENABLE_STATISTICS=1) + size_t thread_to_global; + //! Total number of bytes transitioned from global cache to thread cache (only + //! if ENABLE_STATISTICS=1) + size_t global_to_thread; + //! Per span count statistics (only if ENABLE_STATISTICS=1) + struct { + //! Currently used number of spans + size_t current; + //! High water mark of spans used + size_t peak; + //! Number of spans transitioned to global cache + size_t to_global; + //! Number of spans transitioned from global cache + size_t from_global; + //! Number of spans transitioned to thread cache + size_t to_cache; + //! Number of spans transitioned from thread cache + size_t from_cache; + //! Number of spans transitioned to reserved state + size_t to_reserved; + //! Number of spans transitioned from reserved state + size_t from_reserved; + //! Number of raw memory map calls (not hitting the reserve spans but + //! resulting in actual OS mmap calls) + size_t map_calls; + } span_use[64]; + //! Per size class statistics (only if ENABLE_STATISTICS=1) + struct { + //! Current number of allocations + size_t alloc_current; + //! Peak number of allocations + size_t alloc_peak; + //! Total number of allocations + size_t alloc_total; + //! Total number of frees + size_t free_total; + //! Number of spans transitioned to cache + size_t spans_to_cache; + //! Number of spans transitioned from cache + size_t spans_from_cache; + //! 
Number of spans transitioned from reserved state + size_t spans_from_reserved; + //! Number of raw memory map calls (not hitting the reserve spans but + //! resulting in actual OS mmap calls) + size_t map_calls; + } size_use[128]; +} rpmalloc_thread_statistics_t; + +typedef struct rpmalloc_config_t { + //! Map memory pages for the given number of bytes. The returned address MUST + //! be + // aligned to the rpmalloc span size, which will always be a power of two. + // Optionally the function can store an alignment offset in the offset + // variable in case it performs alignment and the returned pointer is offset + // from the actual start of the memory region due to this alignment. The + // alignment offset will be passed to the memory unmap function. The + // alignment offset MUST NOT be larger than 65535 (storable in an uint16_t), + // if it is you must use natural alignment to shift it into 16 bits. If you + // set a memory_map function, you must also set a memory_unmap function or + // else the default implementation will be used for both. This function must + // be thread safe, it can be called by multiple threads simultaneously. + void *(*memory_map)(size_t size, size_t *offset); + //! Unmap the memory pages starting at address and spanning the given number + //! of bytes. + // If release is set to non-zero, the unmap is for an entire span range as + // returned by a previous call to memory_map and that the entire range should + // be released. The release argument holds the size of the entire span range. + // If release is set to 0, the unmap is a partial decommit of a subset of the + // mapped memory range. If you set a memory_unmap function, you must also set + // a memory_map function or else the default implementation will be used for + // both. This function must be thread safe, it can be called by multiple + // threads simultaneously. + void (*memory_unmap)(void *address, size_t size, size_t offset, + size_t release); + //! Called when an assert fails, if asserts are enabled. Will use the standard + //! assert() + // if this is not set. + void (*error_callback)(const char *message); + //! Called when a call to map memory pages fails (out of memory). If this + //! callback is + // not set or returns zero the library will return a null pointer in the + // allocation call. If this callback returns non-zero the map call will be + // retried. The argument passed is the number of bytes that was requested in + // the map call. Only used if the default system memory map function is used + // (memory_map callback is not set). + int (*map_fail_callback)(size_t size); + //! Size of memory pages. The page size MUST be a power of two. All memory + //! mapping + // requests to memory_map will be made with size set to a multiple of the + // page size. Used if RPMALLOC_CONFIGURABLE is defined to 1, otherwise system + // page size is used. + size_t page_size; + //! Size of a span of memory blocks. MUST be a power of two, and in + //! [4096,262144] + // range (unless 0 - set to 0 to use the default span size). Used if + // RPMALLOC_CONFIGURABLE is defined to 1. + size_t span_size; + //! Number of spans to map at each request to map new virtual memory blocks. + //! This can + // be used to minimize the system call overhead at the cost of virtual memory + // address space. The extra mapped pages will not be written until actually + // used, so physical committed memory should not be affected in the default + // implementation. 
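As an illustration of the memory_map / memory_unmap contract documented above, here is a hedged sketch of a custom mapping pair handed to rpmalloc_initialize_config. The POSIX calls, the include path, and the span_map_count value are assumptions for the example; a production implementation would also have to return span-aligned addresses and record the alignment offset, which this sketch omits.

#include <stddef.h>
#include <string.h>
#include <sys/mman.h>
#include "rpmalloc.h" // assumed include path

// Sketch only: forwards requests to mmap/munmap. The contract above
// additionally requires span-aligned return addresses (reporting any
// adjustment via *offset); that handling is omitted here.
static void *example_map(size_t size, size_t *offset) {
  void *ptr = mmap(0, size, PROT_READ | PROT_WRITE,
                   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
  *offset = 0; // no alignment adjustment performed in this sketch
  return (ptr == MAP_FAILED) ? 0 : ptr;
}

static void example_unmap(void *address, size_t size, size_t offset,
                          size_t release) {
  (void)offset;
  if (release) // non-zero: release the entire span range
    munmap(address, release);
  else         // zero: partial decommit of the mapped range
    madvise(address, size, MADV_DONTNEED);
}

static int example_init(void) {
  rpmalloc_config_t config;
  memset(&config, 0, sizeof(config));
  config.memory_map = example_map;     // set both map and unmap,
  config.memory_unmap = example_unmap; // or neither (see above)
  config.span_map_count = 64;          // illustrative value
  return rpmalloc_initialize_config(&config);
}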
Will be aligned to a multiple of spans that match memory + // page size in case of huge pages. + size_t span_map_count; + //! Enable use of large/huge pages. If this flag is set to non-zero and page + //! size is + // zero, the allocator will try to enable huge pages and auto detect the + // configuration. If this is set to non-zero and page_size is also non-zero, + // the allocator will assume huge pages have been configured and enabled + // prior to initializing the allocator. For Windows, see + // https://docs.microsoft.com/en-us/windows/desktop/memory/large-page-support + // For Linux, see https://www.kernel.org/doc/Documentation/vm/hugetlbpage.txt + int enable_huge_pages; + //! Respectively allocated pages and huge allocated pages names for systems + // supporting it to be able to distinguish among anonymous regions. + const char *page_name; + const char *huge_page_name; +} rpmalloc_config_t; + +//! Initialize allocator with default configuration +RPMALLOC_EXPORT int rpmalloc_initialize(void); + +//! Initialize allocator with given configuration +RPMALLOC_EXPORT int rpmalloc_initialize_config(const rpmalloc_config_t *config); + +//! Get allocator configuration +RPMALLOC_EXPORT const rpmalloc_config_t *rpmalloc_config(void); + +//! Finalize allocator +RPMALLOC_EXPORT void rpmalloc_finalize(void); + +//! Initialize allocator for calling thread +RPMALLOC_EXPORT void rpmalloc_thread_initialize(void); + +//! Finalize allocator for calling thread +RPMALLOC_EXPORT void rpmalloc_thread_finalize(int release_caches); + +//! Perform deferred deallocations pending for the calling thread heap +RPMALLOC_EXPORT void rpmalloc_thread_collect(void); + +//! Query if allocator is initialized for calling thread +RPMALLOC_EXPORT int rpmalloc_is_thread_initialized(void); + +//! Get per-thread statistics +RPMALLOC_EXPORT void +rpmalloc_thread_statistics(rpmalloc_thread_statistics_t *stats); + +//! Get global statistics +RPMALLOC_EXPORT void +rpmalloc_global_statistics(rpmalloc_global_statistics_t *stats); + +//! Dump all statistics in human readable format to file (should be a FILE*) +RPMALLOC_EXPORT void rpmalloc_dump_statistics(void *file); + +//! Allocate a memory block of at least the given size +RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void * +rpmalloc(size_t size) RPMALLOC_ATTRIB_MALLOC RPMALLOC_ATTRIB_ALLOC_SIZE(1); + +//! Free the given memory block +RPMALLOC_EXPORT void rpfree(void *ptr); + +//! Allocate a memory block of at least the given size and zero initialize it +RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void * +rpcalloc(size_t num, size_t size) RPMALLOC_ATTRIB_MALLOC + RPMALLOC_ATTRIB_ALLOC_SIZE2(1, 2); + +//! Reallocate the given block to at least the given size +RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void * +rprealloc(void *ptr, size_t size) RPMALLOC_ATTRIB_MALLOC + RPMALLOC_ATTRIB_ALLOC_SIZE(2); + +//! Reallocate the given block to at least the given size and alignment, +// with optional control flags (see RPMALLOC_NO_PRESERVE). +// Alignment must be a power of two and a multiple of sizeof(void*), +// and should ideally be less than memory page size. A caveat of rpmalloc +// internals is that this must also be strictly less than the span size +// (default 64KiB) +RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void * +rpaligned_realloc(void *ptr, size_t alignment, size_t size, size_t oldsize, + unsigned int flags) RPMALLOC_ATTRIB_MALLOC + RPMALLOC_ATTRIB_ALLOC_SIZE(3); + +//! Allocate a memory block of at least the given size and alignment. 
+// Alignment must be a power of two and a multiple of sizeof(void*), +// and should ideally be less than memory page size. A caveat of rpmalloc +// internals is that this must also be strictly less than the span size +// (default 64KiB) +RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void * +rpaligned_alloc(size_t alignment, size_t size) RPMALLOC_ATTRIB_MALLOC + RPMALLOC_ATTRIB_ALLOC_SIZE(2); + +//! Allocate a memory block of at least the given size and alignment, and zero +//! initialize it. +// Alignment must be a power of two and a multiple of sizeof(void*), +// and should ideally be less than memory page size. A caveat of rpmalloc +// internals is that this must also be strictly less than the span size +// (default 64KiB) +RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void * +rpaligned_calloc(size_t alignment, size_t num, + size_t size) RPMALLOC_ATTRIB_MALLOC + RPMALLOC_ATTRIB_ALLOC_SIZE2(2, 3); + +//! Allocate a memory block of at least the given size and alignment. +// Alignment must be a power of two and a multiple of sizeof(void*), +// and should ideally be less than memory page size. A caveat of rpmalloc +// internals is that this must also be strictly less than the span size +// (default 64KiB) +RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void * +rpmemalign(size_t alignment, size_t size) RPMALLOC_ATTRIB_MALLOC + RPMALLOC_ATTRIB_ALLOC_SIZE(2); + +//! Allocate a memory block of at least the given size and alignment. +// Alignment must be a power of two and a multiple of sizeof(void*), +// and should ideally be less than memory page size. A caveat of rpmalloc +// internals is that this must also be strictly less than the span size +// (default 64KiB) +RPMALLOC_EXPORT int rpposix_memalign(void **memptr, size_t alignment, + size_t size); + +//! Query the usable size of the given memory block (from given pointer to the +//! end of block) +RPMALLOC_EXPORT size_t rpmalloc_usable_size(void *ptr); + +//! Dummy empty function for forcing linker symbol inclusion +RPMALLOC_EXPORT void rpmalloc_linker_reference(void); + +#if RPMALLOC_FIRST_CLASS_HEAPS + +//! Heap type +typedef struct heap_t rpmalloc_heap_t; + +//! Acquire a new heap. Will reuse existing released heaps or allocate memory +//! for a new heap +// if none available. Heap API is implemented with the strict assumption that +// only one single thread will call heap functions for a given heap at any +// given time, no functions are thread safe. +RPMALLOC_EXPORT rpmalloc_heap_t *rpmalloc_heap_acquire(void); + +//! Release a heap (does NOT free the memory allocated by the heap, use +//! rpmalloc_heap_free_all before destroying the heap). +// Releasing a heap will enable it to be reused by other threads. Safe to pass +// a null pointer. +RPMALLOC_EXPORT void rpmalloc_heap_release(rpmalloc_heap_t *heap); + +//! Allocate a memory block of at least the given size using the given heap. +RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void * +rpmalloc_heap_alloc(rpmalloc_heap_t *heap, size_t size) RPMALLOC_ATTRIB_MALLOC + RPMALLOC_ATTRIB_ALLOC_SIZE(2); + +//! Allocate a memory block of at least the given size using the given heap. The +//! returned +// block will have the requested alignment. Alignment must be a power of two +// and a multiple of sizeof(void*), and should ideally be less than memory page +// size. A caveat of rpmalloc internals is that this must also be strictly less +// than the span size (default 64KiB). 
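Putting the heap API and its alignment caveats together, a hedged usage sketch (assuming the library was built with RPMALLOC_FIRST_CLASS_HEAPS=1 and that a single thread drives the heap, as the comments above require; the sizes and the 64-byte alignment are arbitrary examples that satisfy the constraints just described):

#include "rpmalloc.h" // assumed include path

static void example_heap_usage(void) {
  // Acquire a pristine heap; only this thread may use it from here on.
  rpmalloc_heap_t *heap = rpmalloc_heap_acquire();

  // 64 is a power of two, a multiple of sizeof(void*), and far below
  // the default 64KiB span size, so it meets the caveats above.
  void *aligned = rpmalloc_heap_aligned_alloc(heap, 64, 1024);
  void *plain = rpmalloc_heap_alloc(heap, 256);

  // Blocks MUST be freed against the heap that allocated them.
  rpmalloc_heap_free(heap, plain);
  (void)aligned; // reclaimed in bulk below

  // Release every remaining block, then let other threads reuse the heap.
  rpmalloc_heap_free_all(heap);
  rpmalloc_heap_release(heap);
}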
+RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void * +rpmalloc_heap_aligned_alloc(rpmalloc_heap_t *heap, size_t alignment, + size_t size) RPMALLOC_ATTRIB_MALLOC + RPMALLOC_ATTRIB_ALLOC_SIZE(3); + +//! Allocate a memory block of at least the given size using the given heap and +//! zero initialize it. +RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void * +rpmalloc_heap_calloc(rpmalloc_heap_t *heap, size_t num, + size_t size) RPMALLOC_ATTRIB_MALLOC + RPMALLOC_ATTRIB_ALLOC_SIZE2(2, 3); + +//! Allocate a memory block of at least the given size using the given heap and +//! zero initialize it. The returned +// block will have the requested alignment. Alignment must either be zero, or a +// power of two and a multiple of sizeof(void*), and should ideally be less +// than memory page size. A caveat of rpmalloc internals is that this must also +// be strictly less than the span size (default 64KiB). +RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void * +rpmalloc_heap_aligned_calloc(rpmalloc_heap_t *heap, size_t alignment, + size_t num, size_t size) RPMALLOC_ATTRIB_MALLOC + RPMALLOC_ATTRIB_ALLOC_SIZE2(2, 3); + +//! Reallocate the given block to at least the given size. The memory block MUST +//! be allocated +// by the same heap given to this function. +RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void * +rpmalloc_heap_realloc(rpmalloc_heap_t *heap, void *ptr, size_t size, + unsigned int flags) RPMALLOC_ATTRIB_MALLOC + RPMALLOC_ATTRIB_ALLOC_SIZE(3); + +//! Reallocate the given block to at least the given size. The memory block MUST +//! be allocated +// by the same heap given to this function. The returned block will have the +// requested alignment. Alignment must be either zero, or a power of two and a +// multiple of sizeof(void*), and should ideally be less than memory page size. +// A caveat of rpmalloc internals is that this must also be strictly less than +// the span size (default 64KiB). +RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void *rpmalloc_heap_aligned_realloc( + rpmalloc_heap_t *heap, void *ptr, size_t alignment, size_t size, + unsigned int flags) RPMALLOC_ATTRIB_MALLOC RPMALLOC_ATTRIB_ALLOC_SIZE(4); + +//! Free the given memory block from the given heap. The memory block MUST be +//! allocated +// by the same heap given to this function. +RPMALLOC_EXPORT void rpmalloc_heap_free(rpmalloc_heap_t *heap, void *ptr); + +//! Free all memory allocated by the heap +RPMALLOC_EXPORT void rpmalloc_heap_free_all(rpmalloc_heap_t *heap); + +//! Set the given heap as the current heap for the calling thread. A heap MUST +//! only be current heap +// for a single thread, a heap can never be shared between multiple threads. +// The previous current heap for the calling thread is released to be reused by +// other threads. +RPMALLOC_EXPORT void rpmalloc_heap_thread_set_current(rpmalloc_heap_t *heap); + +//! Returns which heap the given pointer is allocated on +RPMALLOC_EXPORT rpmalloc_heap_t *rpmalloc_get_heap_for_ptr(void *ptr); + +#endif + +#ifdef __cplusplus +} +#endif diff --git a/llvm/lib/Support/rpmalloc/rpnew.h b/llvm/lib/Support/rpmalloc/rpnew.h index d8303c6f95652fecaa187a1d89996008a93bcbbf..a18f0799d56d1f2a633897199d53b80f80c54090 100644 --- a/llvm/lib/Support/rpmalloc/rpnew.h +++ b/llvm/lib/Support/rpmalloc/rpnew.h @@ -1,113 +1,113 @@ -//===-------------------------- rpnew.h -----------------*- C -*-=============// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This library provides a cross-platform lock free thread caching malloc -// implementation in C11. -// -//===----------------------------------------------------------------------===// - -#ifdef __cplusplus - -#include -#include - -#ifndef __CRTDECL -#define __CRTDECL -#endif - -extern void __CRTDECL operator delete(void *p) noexcept { rpfree(p); } - -extern void __CRTDECL operator delete[](void *p) noexcept { rpfree(p); } - -extern void *__CRTDECL operator new(std::size_t size) noexcept(false) { - return rpmalloc(size); -} - -extern void *__CRTDECL operator new[](std::size_t size) noexcept(false) { - return rpmalloc(size); -} - -extern void *__CRTDECL operator new(std::size_t size, - const std::nothrow_t &tag) noexcept { - (void)sizeof(tag); - return rpmalloc(size); -} - -extern void *__CRTDECL operator new[](std::size_t size, - const std::nothrow_t &tag) noexcept { - (void)sizeof(tag); - return rpmalloc(size); -} - -#if (__cplusplus >= 201402L || _MSC_VER >= 1916) - -extern void __CRTDECL operator delete(void *p, std::size_t size) noexcept { - (void)sizeof(size); - rpfree(p); -} - -extern void __CRTDECL operator delete[](void *p, std::size_t size) noexcept { - (void)sizeof(size); - rpfree(p); -} - -#endif - -#if (__cplusplus > 201402L || defined(__cpp_aligned_new)) - -extern void __CRTDECL operator delete(void *p, - std::align_val_t align) noexcept { - (void)sizeof(align); - rpfree(p); -} - -extern void __CRTDECL operator delete[](void *p, - std::align_val_t align) noexcept { - (void)sizeof(align); - rpfree(p); -} - -extern void __CRTDECL operator delete(void *p, std::size_t size, - std::align_val_t align) noexcept { - (void)sizeof(size); - (void)sizeof(align); - rpfree(p); -} - -extern void __CRTDECL operator delete[](void *p, std::size_t size, - std::align_val_t align) noexcept { - (void)sizeof(size); - (void)sizeof(align); - rpfree(p); -} - -extern void *__CRTDECL operator new(std::size_t size, - std::align_val_t align) noexcept(false) { - return rpaligned_alloc(static_cast(align), size); -} - -extern void *__CRTDECL operator new[](std::size_t size, - std::align_val_t align) noexcept(false) { - return rpaligned_alloc(static_cast(align), size); -} - -extern void *__CRTDECL operator new(std::size_t size, std::align_val_t align, - const std::nothrow_t &tag) noexcept { - (void)sizeof(tag); - return rpaligned_alloc(static_cast(align), size); -} - -extern void *__CRTDECL operator new[](std::size_t size, std::align_val_t align, - const std::nothrow_t &tag) noexcept { - (void)sizeof(tag); - return rpaligned_alloc(static_cast(align), size); -} - -#endif - -#endif +//===-------------------------- rpnew.h -----------------*- C -*-=============// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This library provides a cross-platform lock free thread caching malloc +// implementation in C11. 
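The operator overrides that follow take effect simply by being compiled into the program; a hedged sketch of the intended use, where the file name and the explicit init/finalize calls are assumptions for the example:

// main.cpp (hypothetical): include rpnew.h from exactly one translation
// unit so the replacement global operators are defined once.
#include "rpmalloc.h"
#include "rpnew.h"

int main() {
  rpmalloc_initialize();
  int *values = new int[32]; // routed through rpmalloc via operator new[]
  delete[] values;           // returned through rpfree
  rpmalloc_finalize();
  return 0;
}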
+// +//===----------------------------------------------------------------------===// + +#ifdef __cplusplus + +#include +#include + +#ifndef __CRTDECL +#define __CRTDECL +#endif + +extern void __CRTDECL operator delete(void *p) noexcept { rpfree(p); } + +extern void __CRTDECL operator delete[](void *p) noexcept { rpfree(p); } + +extern void *__CRTDECL operator new(std::size_t size) noexcept(false) { + return rpmalloc(size); +} + +extern void *__CRTDECL operator new[](std::size_t size) noexcept(false) { + return rpmalloc(size); +} + +extern void *__CRTDECL operator new(std::size_t size, + const std::nothrow_t &tag) noexcept { + (void)sizeof(tag); + return rpmalloc(size); +} + +extern void *__CRTDECL operator new[](std::size_t size, + const std::nothrow_t &tag) noexcept { + (void)sizeof(tag); + return rpmalloc(size); +} + +#if (__cplusplus >= 201402L || _MSC_VER >= 1916) + +extern void __CRTDECL operator delete(void *p, std::size_t size) noexcept { + (void)sizeof(size); + rpfree(p); +} + +extern void __CRTDECL operator delete[](void *p, std::size_t size) noexcept { + (void)sizeof(size); + rpfree(p); +} + +#endif + +#if (__cplusplus > 201402L || defined(__cpp_aligned_new)) + +extern void __CRTDECL operator delete(void *p, + std::align_val_t align) noexcept { + (void)sizeof(align); + rpfree(p); +} + +extern void __CRTDECL operator delete[](void *p, + std::align_val_t align) noexcept { + (void)sizeof(align); + rpfree(p); +} + +extern void __CRTDECL operator delete(void *p, std::size_t size, + std::align_val_t align) noexcept { + (void)sizeof(size); + (void)sizeof(align); + rpfree(p); +} + +extern void __CRTDECL operator delete[](void *p, std::size_t size, + std::align_val_t align) noexcept { + (void)sizeof(size); + (void)sizeof(align); + rpfree(p); +} + +extern void *__CRTDECL operator new(std::size_t size, + std::align_val_t align) noexcept(false) { + return rpaligned_alloc(static_cast(align), size); +} + +extern void *__CRTDECL operator new[](std::size_t size, + std::align_val_t align) noexcept(false) { + return rpaligned_alloc(static_cast(align), size); +} + +extern void *__CRTDECL operator new(std::size_t size, std::align_val_t align, + const std::nothrow_t &tag) noexcept { + (void)sizeof(tag); + return rpaligned_alloc(static_cast(align), size); +} + +extern void *__CRTDECL operator new[](std::size_t size, std::align_val_t align, + const std::nothrow_t &tag) noexcept { + (void)sizeof(tag); + return rpaligned_alloc(static_cast(align), size); +} + +#endif + +#endif diff --git a/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp b/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp index 9669a393bc2b94d02e006cb02d2de73b8e21ead1..0301032e8497725e2072e2a4505e9f38f81115d4 100644 --- a/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp +++ b/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp @@ -795,7 +795,7 @@ INITIALIZE_PASS_BEGIN(AArch64ConditionalCompares, "aarch64-ccmp", "AArch64 CCMP Pass", false, false) INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass) -INITIALIZE_PASS_DEPENDENCY(MachineTraceMetrics) +INITIALIZE_PASS_DEPENDENCY(MachineTraceMetricsWrapperPass) INITIALIZE_PASS_END(AArch64ConditionalCompares, "aarch64-ccmp", "AArch64 CCMP Pass", false, false) @@ -809,8 +809,8 @@ void AArch64ConditionalCompares::getAnalysisUsage(AnalysisUsage &AU) const { AU.addPreserved(); AU.addRequired(); AU.addPreserved(); - AU.addRequired(); - AU.addPreserved(); + AU.addRequired(); + AU.addPreserved(); 
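  // The trace metrics analysis is now reached through its wrapper pass:
  // this pass depends on MachineTraceMetricsWrapperPass (registered via
  // INITIALIZE_PASS_DEPENDENCY above) and, in runOnMachineFunction below,
  // unwraps the underlying MachineTraceMetrics with getMTM().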
MachineFunctionPass::getAnalysisUsage(AU); } @@ -937,7 +937,7 @@ bool AArch64ConditionalCompares::runOnMachineFunction(MachineFunction &MF) { DomTree = &getAnalysis().getDomTree(); Loops = &getAnalysis().getLI(); MBPI = &getAnalysis().getMBPI(); - Traces = &getAnalysis(); + Traces = &getAnalysis().getMTM(); MinInstr = nullptr; MinSize = MF.getFunction().hasMinSize(); diff --git a/llvm/lib/Target/AArch64/AArch64Features.td b/llvm/lib/Target/AArch64/AArch64Features.td index 97671bc59f6b9e1467647a2aead9e2c9637b9550..831f311b2364415e4c7e3b253f4823b5ab63ef31 100644 --- a/llvm/lib/Target/AArch64/AArch64Features.td +++ b/llvm/lib/Target/AArch64/AArch64Features.td @@ -522,6 +522,39 @@ def FeatureTLBIW : ExtensionWithMArch<"tlbiw", "TLBIW", "FEAT_TLBIW", // Armv9.6 Architecture Extensions //===----------------------------------------------------------------------===// +def FeatureCMPBR : ExtensionWithMArch<"cmpbr", "CMPBR", "FEAT_CMPBR", + "Enable Armv9.6-A base compare and branch instructions">; + +def FeatureF8F32MM: ExtensionWithMArch<"f8f32mm", "F8F32MM", "FEAT_F8F32MM", + "Enable Armv9.6-A FP8 to Single-Precision Matrix Multiplication">; + +def FeatureF8F16MM: ExtensionWithMArch<"f8f16mm", "F8F16MM", "FEAT_F8F16MM", + "Enable Armv9.6-A FP8 to Half-Precision Matrix Multiplication">; + +def FeatureFPRCVT: ExtensionWithMArch<"fprcvt", "FPRCVT", "FEAT_FPRCVT", + "Enable Armv9.6-A base convert instructions for SIMD&FP scalar register operands of" + " different input and output sizes">; + +def FeatureLSFE : ExtensionWithMArch<"lsfe", "LSFE", "FEAT_LSFE", + "Enable Armv9.6-A base Atomic floating-point in-memory instructions">; + +def FeatureSME2p2: ExtensionWithMArch<"sme2p2", "SME2p2", "FEAT_SME2p2", + "Enable Armv9.6-A Scalable Matrix Extension 2.2 instructions", [FeatureSME2p1]>; + +def FeatureSSVE_AES : ExtensionWithMArch<"ssve-aes", "SSVE_AES", "FEAT_SSVE_AES", + "Enable Armv9.6-A SVE2 AES support in streaming SVE mode">; + +def FeatureSVE2p2 : ExtensionWithMArch<"sve2p2", "SVE2p2", "FEAT_SVE2p2", + "Enable Armv9.6-A Scalable Vector Extension 2.2 instructions", [FeatureSVE2p1]>; + +def FeatureSVEAES2: ExtensionWithMArch<"sve-aes2", "SVE_AES2", "FEAT_SVE_AES2", + "Enable Armv9.6-A SVE multi-vector AES and 128-bit PMULL instructions">; + +def FeatureSVEBFSCALE: ExtensionWithMArch<"sve-bfscale", "SVE_BFSCALE", "FEAT_SVE_BFSCALE", + "Enable Armv9.6-A SVE BFloat16 scaling instructions">; + +def FeatureSVE_F16F32MM: ExtensionWithMArch<"sve-f16f32mm", "SVE_F16F32MM", "FEAT_SVE_F16F32MM", + "Enable Armv9.6-A FP16 to FP32 Matrix Multiply instructions">; //===----------------------------------------------------------------------===// // Other Features @@ -833,8 +866,8 @@ def HasV9_5aOps : Architecture64<9, 5, "a", "v9.5a", [HasV9_4aOps, FeatureCPA], !listconcat(HasV9_4aOps.DefaultExts, [FeatureCPA, FeatureLUT, FeatureFAMINMAX])>; def HasV9_6aOps : Architecture64<9, 6, "a", "v9.6a", - [HasV9_5aOps], - !listconcat(HasV9_5aOps.DefaultExts, [])>; + [HasV9_5aOps, FeatureCMPBR, FeatureFPRCVT, FeatureSVE2p2], + !listconcat(HasV9_5aOps.DefaultExts, [FeatureCMPBR, FeatureFPRCVT, FeatureSVE2p2])>; def HasV8_0rOps : Architecture64<8, 0, "r", "v8r", [ //v8.1 FeatureCRC, FeaturePAN, FeatureLSE, FeatureCONTEXTIDREL2, diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp index 1b8eac7fac21f761ca55f1021919e4d5643e5e20..bbf2f2677954577335932d45e7c8e205fa9712c3 100644 --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp +++ 
b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -480,9 +480,9 @@ bool AArch64FrameLowering::canUseRedZone(const MachineFunction &MF) const { getSVEStackSize(MF) || LowerQRegCopyThroughMem); } -/// hasFP - Return true if the specified function should have a dedicated frame -/// pointer register. -bool AArch64FrameLowering::hasFP(const MachineFunction &MF) const { +/// hasFPImpl - Return true if the specified function should have a dedicated +/// frame pointer register. +bool AArch64FrameLowering::hasFPImpl(const MachineFunction &MF) const { const MachineFrameInfo &MFI = MF.getFrameInfo(); const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo(); diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.h b/llvm/lib/Target/AArch64/AArch64FrameLowering.h index c1973124962085024654104b8ff2db140f8a726f..20445e63bcb13ee18e886fe952bca6878012cab1 100644 --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.h +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.h @@ -65,7 +65,6 @@ public: /// Can this function use the red zone for local allocations. bool canUseRedZone(const MachineFunction &MF) const; - bool hasFP(const MachineFunction &MF) const override; bool hasReservedCallFrame(const MachineFunction &MF) const override; bool assignCalleeSavedSpillSlots(MachineFunction &MF, @@ -125,6 +124,9 @@ public: orderFrameObjects(const MachineFunction &MF, SmallVectorImpl &ObjectsToAllocate) const override; +protected: + bool hasFPImpl(const MachineFunction &MF) const override; + private: /// Returns true if a homogeneous prolog or epilog code can be emitted /// for the size optimization. If so, HOM_Prolog/HOM_Epilog pseudo diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index ed06d8a5d630136bb73402fb1bb99e2e68357813..a44a73eb2c0fda811b1046d92c9a637d7a82a181 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -2400,10 +2400,11 @@ void AArch64TargetLowering::computeKnownBitsForTargetNode( } case AArch64ISD::BICi: { // Compute the bit cleared value. - uint64_t Mask = - ~(Op->getConstantOperandVal(1) << Op->getConstantOperandVal(2)); + APInt Mask = + ~(Op->getConstantOperandAPInt(1) << Op->getConstantOperandAPInt(2)) + .trunc(Known.getBitWidth()); Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1); - Known &= KnownBits::makeConstant(APInt(Known.getBitWidth(), Mask)); + Known &= KnownBits::makeConstant(Mask); break; } case AArch64ISD::VLSHR: { @@ -12839,7 +12840,8 @@ static bool isEXTMask(ArrayRef M, EVT VT, bool &ReverseEXT, // Benefit form APInt to handle overflow when calculating expected element. unsigned NumElts = VT.getVectorNumElements(); unsigned MaskBits = APInt(32, NumElts * 2).logBase2(); - APInt ExpectedElt = APInt(MaskBits, *FirstRealElt + 1); + APInt ExpectedElt = APInt(MaskBits, *FirstRealElt + 1, /*isSigned=*/false, + /*implicitTrunc=*/true); // The following shuffle indices must be the successive elements after the // first real element. bool FoundWrongElt = std::any_of(FirstRealElt + 1, M.end(), [&](int Elt) { @@ -14306,9 +14308,9 @@ static SDValue NormalizeBuildVector(SDValue Op, // (with operands cast to integers), then the only possibilities // are constants and UNDEFs. 
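      // Lane constants can be wider than the element type, so they are
      // narrowed explicitly via getAPIntValue().trunc(EltTy.getSizeInBits())
      // below, rather than implicitly through getZExtValue(); isEXTMask
      // above opts in to the same behavior with /*implicitTrunc=*/true.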
if (auto *CstLane = dyn_cast(Lane)) { - APInt LowBits(EltTy.getSizeInBits(), - CstLane->getZExtValue()); - Lane = DAG.getConstant(LowBits.getZExtValue(), dl, MVT::i32); + Lane = DAG.getConstant( + CstLane->getAPIntValue().trunc(EltTy.getSizeInBits()).getZExtValue(), + dl, MVT::i32); } else if (Lane.getNode()->isUndef()) { Lane = DAG.getUNDEF(MVT::i32); } else { @@ -16455,10 +16457,9 @@ static void createTblForTrunc(TruncInst *TI, bool IsLittleEndian) { Builder.CreateShuffleVector(TI->getOperand(0), ShuffleLanes), VecTy)); if (Parts.size() == 4) { - auto *F = Intrinsic::getOrInsertDeclaration( - TI->getModule(), Intrinsic::aarch64_neon_tbl4, VecTy); Parts.push_back(ConstantVector::get(MaskConst)); - Results.push_back(Builder.CreateCall(F, Parts)); + Results.push_back( + Builder.CreateIntrinsic(Intrinsic::aarch64_neon_tbl4, VecTy, Parts)); Parts.clear(); } @@ -16485,9 +16486,8 @@ static void createTblForTrunc(TruncInst *TI, bool IsLittleEndian) { break; } - auto *F = Intrinsic::getOrInsertDeclaration(TI->getModule(), TblID, VecTy); Parts.push_back(ConstantVector::get(MaskConst)); - Results.push_back(Builder.CreateCall(F, Parts)); + Results.push_back(Builder.CreateIntrinsic(TblID, VecTy, Parts)); } // Extract the destination vector from TBL result(s) after combining them @@ -23713,7 +23713,7 @@ static bool findMoreOptimalIndexType(const MaskedGatherScatterSDNode *N, EVT NewIndexVT = IndexVT.changeVectorElementType(MVT::i32); // Stride does not scale explicitly by 'Scale', because it happens in // the gather/scatter addressing mode. - Index = DAG.getStepVector(SDLoc(N), NewIndexVT, APInt(32, Stride)); + Index = DAG.getStepVector(SDLoc(N), NewIndexVT, APInt(32, Stride, true)); return true; } @@ -26982,9 +26982,9 @@ void AArch64TargetLowering::ReplaceNodeResults( } } -bool AArch64TargetLowering::useLoadStackGuardNode() const { +bool AArch64TargetLowering::useLoadStackGuardNode(const Module &M) const { if (Subtarget->isTargetAndroid() || Subtarget->isTargetFuchsia()) - return TargetLowering::useLoadStackGuardNode(); + return TargetLowering::useLoadStackGuardNode(M); return true; } @@ -27250,9 +27250,9 @@ Value *AArch64TargetLowering::emitLoadLinked(IRBuilderBase &Builder, if (ValueTy->getPrimitiveSizeInBits() == 128) { Intrinsic::ID Int = IsAcquire ? Intrinsic::aarch64_ldaxp : Intrinsic::aarch64_ldxp; - Function *Ldxr = Intrinsic::getOrInsertDeclaration(M, Int); - Value *LoHi = Builder.CreateCall(Ldxr, Addr, "lohi"); + Value *LoHi = + Builder.CreateIntrinsic(Int, {}, Addr, /*FMFSource=*/nullptr, "lohi"); Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo"); Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi"); @@ -27269,11 +27269,10 @@ Value *AArch64TargetLowering::emitLoadLinked(IRBuilderBase &Builder, Type *Tys[] = { Addr->getType() }; Intrinsic::ID Int = IsAcquire ? 
Intrinsic::aarch64_ldaxr : Intrinsic::aarch64_ldxr; - Function *Ldxr = Intrinsic::getOrInsertDeclaration(M, Int, Tys); const DataLayout &DL = M->getDataLayout(); IntegerType *IntEltTy = Builder.getIntNTy(DL.getTypeSizeInBits(ValueTy)); - CallInst *CI = Builder.CreateCall(Ldxr, Addr); + CallInst *CI = Builder.CreateIntrinsic(Int, Tys, Addr); CI->addParamAttr(0, Attribute::get(Builder.getContext(), Attribute::ElementType, IntEltTy)); Value *Trunc = Builder.CreateTrunc(CI, IntEltTy); @@ -27283,9 +27282,7 @@ Value *AArch64TargetLowering::emitLoadLinked(IRBuilderBase &Builder, void AArch64TargetLowering::emitAtomicCmpXchgNoStoreLLBalance( IRBuilderBase &Builder) const { - Module *M = Builder.GetInsertBlock()->getParent()->getParent(); - Builder.CreateCall( - Intrinsic::getOrInsertDeclaration(M, Intrinsic::aarch64_clrex)); + Builder.CreateIntrinsic(Intrinsic::aarch64_clrex, {}, {}); } Value *AArch64TargetLowering::emitStoreConditional(IRBuilderBase &Builder, @@ -28729,7 +28726,7 @@ static SDValue GenerateFixedLengthSVETBL(SDValue Op, SDValue Op1, SDValue Op2, unsigned BitsPerElt = VTOp1.getVectorElementType().getSizeInBits(); unsigned IndexLen = MinSVESize / BitsPerElt; unsigned ElementsPerVectorReg = VTOp1.getVectorNumElements(); - uint64_t MaxOffset = APInt(BitsPerElt, -1, false).getZExtValue(); + uint64_t MaxOffset = maxUIntN(BitsPerElt); EVT MaskEltType = VTOp1.getVectorElementType().changeTypeToInteger(); EVT MaskType = EVT::getVectorVT(*DAG.getContext(), MaskEltType, IndexLen); bool MinMaxEqual = (MinSVESize == MaxSVESize); @@ -29087,16 +29084,14 @@ bool AArch64TargetLowering::SimplifyDemandedBitsForTargetNode( KnownBits KnownOp0 = TLO.DAG.computeKnownBits(Op0, OriginalDemandedElts, Depth + 1); // Op0 &= ~(ConstantOperandVal(1) << ConstantOperandVal(2)) - uint64_t BitsToClear = Op->getConstantOperandVal(1) - << Op->getConstantOperandVal(2); + APInt BitsToClear = + (Op->getConstantOperandAPInt(1) << Op->getConstantOperandAPInt(2)) + .trunc(KnownOp0.getBitWidth()); APInt AlreadyZeroedBitsToClear = BitsToClear & KnownOp0.Zero; - if (APInt(Known.getBitWidth(), BitsToClear) - .isSubsetOf(AlreadyZeroedBitsToClear)) + if (BitsToClear.isSubsetOf(AlreadyZeroedBitsToClear)) return TLO.CombineTo(Op, Op0); - Known = KnownOp0 & - KnownBits::makeConstant(APInt(Known.getBitWidth(), ~BitsToClear)); - + Known = KnownOp0 & KnownBits::makeConstant(~BitsToClear); return false; } case ISD::INTRINSIC_WO_CHAIN: { diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h index cf2ae5fd027c7a3cc49358fd500cc5cf6ea666f9..217e971568a999bfb24bc1f1fa105f2a337fc83a 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -811,7 +811,7 @@ public: TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override; - bool useLoadStackGuardNode() const override; + bool useLoadStackGuardNode(const Module &M) const override; TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const override; diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index 325508b62a9f14eb2dc0e316fec072c271d46607..6c9f0986b9e349e775f83bfdc5898cabf9a944ce 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -213,12 +213,35 @@ def HasSMEF8F16 : Predicate<"Subtarget->isStreaming() && Subtarget->hasSMEF8 def HasSMEF8F32 : Predicate<"Subtarget->isStreaming() && 
Subtarget->hasSMEF8F32()">, AssemblerPredicateWithAll<(all_of FeatureSMEF8F32), "sme-f8f32">; +def HasCMPBR : Predicate<"Subtarget->hasCMPBR()">, + AssemblerPredicateWithAll<(all_of FeatureCMPBR), "cmpbr">; +def HasF8F32MM : Predicate<"Subtarget->hasF8F32MM()">, + AssemblerPredicateWithAll<(all_of FeatureF8F32MM), "f8f32mm">; +def HasF8F16MM : Predicate<"Subtarget->hasF8F16MM()">, + AssemblerPredicateWithAll<(all_of FeatureF8F16MM), "f8f16mm">; +def HasFPRCVT : Predicate<"Subtarget->hasFPRCVT()">, + AssemblerPredicateWithAll<(all_of FeatureFPRCVT), "fprcvt">; +def HasLSFE : Predicate<"Subtarget->hasLSFE()">, + AssemblerPredicateWithAll<(all_of FeatureLSFE), "lsfe">; +def HasSME2p2 : Predicate<"Subtarget->isStreaming() && Subtarget->hasSME2p2()">, + AssemblerPredicateWithAll<(all_of FeatureSME2p2), "sme2p2">; +def HasSVEAES2 : Predicate<"Subtarget->hasSVEAES2()">, + AssemblerPredicateWithAll<(all_of FeatureSVEAES2), "sve-aes2">; +def HasSVEBFSCALE : Predicate<"Subtarget->isSVEorStreamingSVEAvailable() && Subtarget->hasSVEBFSCALE()">, + AssemblerPredicateWithAll<(all_of FeatureSVEBFSCALE), "sve-bfscale">; +def HasSVE_F16F32MM : Predicate<"Subtarget->isSVEAvailable() && Subtarget->hasSVE_F16F32MM()">, + AssemblerPredicateWithAll<(all_of FeatureSVE_F16F32MM), "sve-f16f32mm">; // A subset of SVE(2) instructions are legal in Streaming SVE execution mode, // they should be enabled if either has been specified. def HasSVEorSME : Predicate<"Subtarget->hasSVE() || (Subtarget->isStreaming() && Subtarget->hasSME())">, AssemblerPredicateWithAll<(any_of FeatureSVE, FeatureSME), "sve or sme">; +def HasSVEorSME2p2 + : Predicate<"(Subtarget->isSVEAvailable() && Subtarget->hasSVE()) ||" + "(Subtarget->isSVEorStreamingSVEAvailable() && Subtarget->hasSME2p2())">, + AssemblerPredicateWithAll<(any_of FeatureSVE, FeatureSME2p2), + "sve or sme2p2">; def HasSVE2orSME : Predicate<"Subtarget->hasSVE2() || (Subtarget->isStreaming() && Subtarget->hasSME())">, AssemblerPredicateWithAll<(any_of FeatureSVE2, FeatureSME), @@ -227,6 +250,10 @@ def HasSVE2orSME2 : Predicate<"Subtarget->hasSVE2() || (Subtarget->isStreaming() && Subtarget->hasSME2())">, AssemblerPredicateWithAll<(any_of FeatureSVE2, FeatureSME2), "sve2 or sme2">; +def HasSVE2orSSVE_AES + : Predicate<"(Subtarget->isSVEAvailable() && Subtarget->hasSVE2()) ||" + "(Subtarget->isSVEorStreamingSVEAvailable() && Subtarget->hasSSVE_AES())">, + AssemblerPredicateWithAll<(any_of FeatureSVE2, FeatureSSVE_AES), "sve2 or ssve-aes">; def HasSVE2p1_or_HasSME : Predicate<"Subtarget->hasSVE2p1() || (Subtarget->isStreaming() && Subtarget->hasSME())">, AssemblerPredicateWithAll<(any_of FeatureSME, FeatureSVE2p1), "sme or sve2p1">; @@ -236,7 +263,13 @@ def HasSVE2p1_or_HasSME2 def HasSVE2p1_or_HasSME2p1 : Predicate<"Subtarget->hasSVE2p1() || (Subtarget->isStreaming() && Subtarget->hasSME2p1())">, AssemblerPredicateWithAll<(any_of FeatureSME2p1, FeatureSVE2p1), "sme2p1 or sve2p1">; - +def HasSVE2p2orSME2p2 + : Predicate<"Subtarget->isSVEorStreamingSVEAvailable() && (Subtarget->hasSVE2p2() || Subtarget->hasSME2p2())">, + AssemblerPredicateWithAll<(any_of FeatureSME2p2, FeatureSVE2p2), "sme2p2 or sve2p2">; +def HasSVE2p1orSSVE_AES + : Predicate<"(Subtarget->isSVEAvailable() && Subtarget->hasSVE2p1()) ||" + "(Subtarget->isSVEorStreamingSVEAvailable() && Subtarget->hasSSVE_AES())">, + AssemblerPredicateWithAll<(any_of FeatureSVE2p1, FeatureSSVE_AES), "sve2p1 or ssve-aes">; def HasSMEF16F16orSMEF8F16 : Predicate<"Subtarget->isStreaming() && (Subtarget->hasSMEF16F16() || 
Subtarget->hasSMEF8F16())">, AssemblerPredicateWithAll<(any_of FeatureSMEF16F16, FeatureSMEF8F16), @@ -6237,7 +6270,8 @@ def : Pat<(v2f64 (AArch64frsqrts (v2f64 FPR128:$Rn), (v2f64 FPR128:$Rm))), // Some float -> int -> float conversion patterns for which we want to keep the // int values in FP registers using the corresponding NEON instructions to // avoid more costly int <-> fp register transfers. -let Predicates = [HasNEONandIsStreamingSafe] in { +// TODO: Allow these in streaming[-compatible] functions with +sme2p2. +let Predicates = [HasNEON] in { def : Pat<(f64 (any_sint_to_fp (i64 (any_fp_to_sint f64:$Rn)))), (SCVTFv1i64 (i64 (FCVTZSv1i64 f64:$Rn)))>; def : Pat<(f32 (any_sint_to_fp (i32 (any_fp_to_sint f32:$Rn)))), @@ -6247,7 +6281,8 @@ def : Pat<(f64 (any_uint_to_fp (i64 (any_fp_to_uint f64:$Rn)))), def : Pat<(f32 (any_uint_to_fp (i32 (any_fp_to_uint f32:$Rn)))), (UCVTFv1i32 (i32 (FCVTZUv1i32 f32:$Rn)))>; -let Predicates = [HasNEONandIsStreamingSafe, HasFullFP16] in { +// TODO: Allow these in streaming[-compatible] functions with +sme2p2. +let Predicates = [HasNEON, HasFullFP16] in { def : Pat<(f16 (any_sint_to_fp (i32 (any_fp_to_sint f16:$Rn)))), (SCVTFv1i16 (f16 (FCVTZSv1f16 f16:$Rn)))>; def : Pat<(f16 (any_uint_to_fp (i32 (any_fp_to_uint f16:$Rn)))), @@ -6270,9 +6305,10 @@ def : Pat<(f64 (uint_to_fp (i64 (vector_extract (v2i64 FPR128:$Rn), (i64 0))))), // fp16: integer extraction from vector must be at least 32-bits to be legal. // Actual extraction result is then an in-reg sign-extension of lower 16-bits. -let Predicates = [HasNEONandIsStreamingSafe, HasFullFP16] in { -def : Pat<(f16 (sint_to_fp (i32 (sext_inreg (i32 (vector_extract - (v8i16 FPR128:$Rn), (i64 0))), i16)))), +// TODO: Allow these in streaming[-compatible] functions with +sme2p2. +let Predicates = [HasNEON, HasFullFP16] in { +def : Pat<(f16 (sint_to_fp (i32 (sext_inreg (i32 (vector_extract + (v8i16 FPR128:$Rn), (i64 0))), i16)))), (SCVTFv1i16 (f16 (EXTRACT_SUBREG (v8i16 FPR128:$Rn), hsub)))>; // unsigned 32-bit extracted element is truncated to 16-bits using AND @@ -6367,7 +6403,7 @@ def : Pat <(f64 (uint_to_fp (i32 (LDURSi GPR64sp:$Rn, simm9:$offset), ssub))>; // 64-bits -> double are handled in target specific dag combine: // performIntToFpCombine. -} // let Predicates = [HasNEONandIsStreamingSafe] +} // let Predicates = [HasNEON] //===----------------------------------------------------------------------===// // Advanced SIMD three different-sized vector instructions. 
diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h index 72f110cebbdc8f1cc242f7b4416e2b5b483d8782..85b9733e95c5293925d0c03c2cc58a2a8eebdce2 100644 --- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h +++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h @@ -303,7 +303,7 @@ public: void setLocalStackSize(uint64_t Size) { LocalStackSize = Size; } uint64_t getLocalStackSize() const { return LocalStackSize; } - void setOutliningStyle(std::string Style) { OutliningStyle = Style; } + void setOutliningStyle(const std::string &Style) { OutliningStyle = Style; } std::optional<std::string> getOutliningStyle() const { return OutliningStyle; } diff --git a/llvm/lib/Target/AArch64/AArch64StackTagging.cpp b/llvm/lib/Target/AArch64/AArch64StackTagging.cpp index fe96fedcfb82dcda86d2c6e3a8310bb1f4b005f2..a6535a532fff3f49c92f555f921a73dc9f5f0e99 100644 --- a/llvm/lib/Target/AArch64/AArch64StackTagging.cpp +++ b/llvm/lib/Target/AArch64/AArch64StackTagging.cpp @@ -481,10 +481,9 @@ Instruction *AArch64StackTagging::insertBaseTaggedPointer( assert(PrologueBB); IRBuilder<> IRB(&PrologueBB->front()); - Function *IRG_SP = Intrinsic::getOrInsertDeclaration( - F->getParent(), Intrinsic::aarch64_irg_sp); Instruction *Base = - IRB.CreateCall(IRG_SP, {Constant::getNullValue(IRB.getInt64Ty())}); + IRB.CreateIntrinsic(Intrinsic::aarch64_irg_sp, {}, + {Constant::getNullValue(IRB.getInt64Ty())}); Base->setName("basetag"); auto TargetTriple = Triple(M.getTargetTriple()); // This ABI will make it into Android API level 35. @@ -580,11 +579,10 @@ bool AArch64StackTagging::runOnFunction(Function &Fn) { NextTag = (NextTag + 1) % 16; // Replace alloca with tagp(alloca). IRBuilder<> IRB(Info.AI->getNextNode()); - Function *TagP = Intrinsic::getOrInsertDeclaration( - F->getParent(), Intrinsic::aarch64_tagp, {Info.AI->getType()}); Instruction *TagPCall = - IRB.CreateCall(TagP, {Constant::getNullValue(Info.AI->getType()), Base, - ConstantInt::get(IRB.getInt64Ty(), Tag)}); + IRB.CreateIntrinsic(Intrinsic::aarch64_tagp, {Info.AI->getType()}, + {Constant::getNullValue(Info.AI->getType()), Base, + ConstantInt::get(IRB.getInt64Ty(), Tag)}); if (Info.AI->hasName()) TagPCall->setName(Info.AI->getName() + ".tag"); // Does not replace metadata, so we don't have to handle DbgVariableRecords.
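The two StackTagging hunks above follow the mechanical rewrite applied throughout this series: Intrinsic::getOrInsertDeclaration plus CreateCall collapses into a single IRBuilder::CreateIntrinsic call. A minimal sketch of the idiom, using an unrelated overloaded intrinsic (ctpop) purely for illustration:

    #include "llvm/IR/IRBuilder.h"
    #include "llvm/IR/Intrinsics.h"
    using namespace llvm;

    Value *emitPopCount(IRBuilder<> &IRB, Value *V) {
      // Before: fetch the declaration, then emit a plain call.
      //   Function *F = Intrinsic::getOrInsertDeclaration(
      //       IRB.GetInsertBlock()->getModule(), Intrinsic::ctpop,
      //       {V->getType()});
      //   return IRB.CreateCall(F, {V});
      // After: one builder call; the declaration is inserted on demand and
      // the overload type list ({V->getType()}) is passed directly.
      return IRB.CreateIntrinsic(Intrinsic::ctpop, {V->getType()}, {V});
    }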
diff --git a/llvm/lib/Target/AArch64/AArch64StorePairSuppress.cpp b/llvm/lib/Target/AArch64/AArch64StorePairSuppress.cpp index 047e38261a6fcd4c6128cd4666d2a1c209aa2836..d8c8b17565abb92b17d7c6d82bd5cece90b5b185 100644 --- a/llvm/lib/Target/AArch64/AArch64StorePairSuppress.cpp +++ b/llvm/lib/Target/AArch64/AArch64StorePairSuppress.cpp @@ -53,8 +53,8 @@ private: void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); - AU.addRequired<MachineTraceMetrics>(); - AU.addPreserved<MachineTraceMetrics>(); + AU.addRequired<MachineTraceMetricsWrapperPass>(); + AU.addPreserved<MachineTraceMetricsWrapperPass>(); MachineFunctionPass::getAnalysisUsage(AU); } }; @@ -139,7 +139,7 @@ bool AArch64StorePairSuppress::runOnMachineFunction(MachineFunction &MF) { TRI = ST.getRegisterInfo(); MRI = &MF.getRegInfo(); SchedModel.init(&ST); - Traces = &getAnalysis<MachineTraceMetricsWrapperPass>(); + Traces = &getAnalysis<MachineTraceMetricsWrapperPass>().getMTM(); MinInstr = nullptr; LLVM_DEBUG(dbgs() << "*** " << getPassName() << ": " << MF.getName() << '\n'); diff --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp index 7b0ae23358673e8f0eaeca35b9a8264eb17debb6..c7bd0390b65620b31bd588ac1a6be5d4cb0620ac 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp @@ -290,15 +290,19 @@ static std::string computeDataLayout(const Triple &TT, bool LittleEndian) { if (TT.isOSBinFormatMachO()) { if (TT.getArch() == Triple::aarch64_32) - return "e-m:o-p:32:32-i64:64-i128:128-n32:64-S128-Fn32"; - return "e-m:o-i64:64-i128:128-n32:64-S128-Fn32"; + return "e-m:o-p:32:32-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-" "n32:64-S128-Fn32"; + return "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-n32:64-S128-" "Fn32"; } if (TT.isOSBinFormatCOFF()) - return "e-m:w-p:64:64-i32:32-i64:64-i128:128-n32:64-S128-Fn32"; + return "e-m:w-p270:32:32-p271:32:32-p272:64:64-p:64:64-i32:32-i64:64-i128:" "128-n32:64-S128-Fn32"; std::string Endian = LittleEndian ? "e" : "E"; std::string Ptr32 = TT.getEnvironment() == Triple::GNUILP32 ?
"-p:32:32" : ""; return Endian + "-m:e" + Ptr32 + - "-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32"; + "-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-" + "n32:64-S128-Fn32"; } static StringRef computeDefaultCPU(const Triple &TT, StringRef CPU) { @@ -780,7 +784,7 @@ bool AArch64PassConfig::addILPOpts() { if (EnableCondBrTuning) addPass(createAArch64CondBrTuning()); if (EnableEarlyIfConversion) - addPass(&EarlyIfConverterID); + addPass(&EarlyIfConverterLegacyID); if (EnableStPairSuppress) addPass(createAArch64StorePairSuppressPass()); addPass(createAArch64SIMDInstrOptPass()); diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 0227532e94c71150a3ca94ca8a6603ac576d7384..7c6b789b9c1b725d1cb924cdb935a41a5dca31fe 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -66,6 +66,10 @@ static cl::opt<unsigned> BaseHistCntCost("aarch64-base-histcnt-cost", cl::init(8), cl::Hidden, cl::desc("The cost of a histcnt instruction")); +static cl::opt<unsigned> DMBLookaheadThreshold( + "dmb-lookahead-threshold", cl::init(10), cl::Hidden, + cl::desc("The number of instructions to search for a redundant dmb")); + namespace { class TailFoldingOption { // These bitfields will only ever be set to something non-zero in operator=, @@ -333,8 +337,10 @@ AArch64TTIImpl::getInlineCallPenalty(const Function *F, const CallBase &Call, bool AArch64TTIImpl::shouldMaximizeVectorBandwidth( TargetTransformInfo::RegisterKind K) const { assert(K != TargetTransformInfo::RGK_Scalar); - return (K == TargetTransformInfo::RGK_FixedWidthVector && - ST->isNeonAvailable()); + return ((K == TargetTransformInfo::RGK_FixedWidthVector && + ST->isNeonAvailable()) || + (K == TargetTransformInfo::RGK_ScalableVector && + ST->isSVEorStreamingSVEAvailable())); } /// Calculate the cost of materializing a 64-bit value.
This helper @@ -2150,6 +2156,31 @@ static std::optional<Instruction *> instCombineSVEInsr(InstCombiner &IC, return std::nullopt; } +static std::optional<Instruction *> instCombineDMB(InstCombiner &IC, + IntrinsicInst &II) { + // If this barrier is post-dominated by an identical one, we can remove it + auto *NI = II.getNextNonDebugInstruction(); + unsigned LookaheadThreshold = DMBLookaheadThreshold; + auto CanSkipOver = [](Instruction *I) { + return !I->mayReadOrWriteMemory() && !I->mayHaveSideEffects(); + }; + while (LookaheadThreshold-- && CanSkipOver(NI)) { + auto *NIBB = NI->getParent(); + NI = NI->getNextNonDebugInstruction(); + if (!NI) { + if (auto *SuccBB = NIBB->getUniqueSuccessor()) + NI = SuccBB->getFirstNonPHIOrDbgOrLifetime(); + else + break; + } + } + auto *NextII = dyn_cast_or_null<IntrinsicInst>(NI); + if (NextII && II.isIdenticalTo(NextII)) + return IC.eraseInstFromFunction(II); + + return std::nullopt; +} + std::optional<Instruction *> AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { @@ -2157,6 +2188,8 @@ switch (IID) { default: break; + case Intrinsic::aarch64_dmb: + return instCombineDMB(IC, II); case Intrinsic::aarch64_sve_fcvt_bf16f32_v2: case Intrinsic::aarch64_sve_fcvt_f16f32: case Intrinsic::aarch64_sve_fcvt_f16f64: diff --git a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp index df69c20b1359fc2044fda5af5642b8b809e57cbb..a5165d45893f3e8ac8727b0ab69cbf3dfa2d9f78 100644 --- a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp +++ b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp @@ -3736,6 +3736,17 @@ static const struct Extension { {"sme-fa64", {AArch64::FeatureSMEFA64}}, {"cpa", {AArch64::FeatureCPA}}, {"tlbiw", {AArch64::FeatureTLBIW}}, + {"cmpbr", {AArch64::FeatureCMPBR}}, + {"f8f32mm", {AArch64::FeatureF8F32MM}}, + {"f8f16mm", {AArch64::FeatureF8F16MM}}, + {"fprcvt", {AArch64::FeatureFPRCVT}}, + {"lsfe", {AArch64::FeatureLSFE}}, + {"sme2p2", {AArch64::FeatureSME2p2}}, + {"ssve-aes", {AArch64::FeatureSSVE_AES}}, + {"sve2p2", {AArch64::FeatureSVE2p2}}, + {"sve-aes2", {AArch64::FeatureSVEAES2}}, + {"sve-bfscale", {AArch64::FeatureSVEBFSCALE}}, + {"sve-f16f32mm", {AArch64::FeatureSVE_F16F32MM}}, }; static void setRequiredFeatureString(FeatureBitset FBS, std::string &Str) { diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp index a69894839361bc19ca4e9ed99e1841022767fe40..e9d01602c298afbc9bc1904ff38e7e98a2724cc5 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp @@ -215,19 +215,10 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) .legalFor({s64, v8s16, v16s8, v4s32}) .lower(); - auto &MinMaxActions = getActionDefinitionsBuilder( - {G_SMIN, G_SMAX, G_UMIN, G_UMAX}); - if (HasCSSC) - MinMaxActions - .legalFor({s32, s64, v8s8, v16s8, v4s16, v8s16, v2s32, v4s32}) - // Making clamping conditional on CSSC extension as without legal types we - // lower to CMP which can fold one of the two sxtb's we'd otherwise need - // if we detect a type smaller than 32-bit.
- .minScalar(0, s32); - else - MinMaxActions - .legalFor({v8s8, v16s8, v4s16, v8s16, v2s32, v4s32}); - MinMaxActions + getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) + .legalFor({v8s8, v16s8, v4s16, v8s16, v2s32, v4s32}) + .legalFor(HasCSSC, {s32, s64}) + .minScalar(HasCSSC, 0, s32) .clampNumElements(0, v8s8, v16s8) .clampNumElements(0, v4s16, v8s16) .clampNumElements(0, v2s32, v4s32) @@ -247,11 +238,8 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) {G_FADD, G_FSUB, G_FMUL, G_FDIV, G_FMA, G_FSQRT, G_FMAXNUM, G_FMINNUM, G_FMAXIMUM, G_FMINIMUM, G_FCEIL, G_FFLOOR, G_FRINT, G_FNEARBYINT, G_INTRINSIC_TRUNC, G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN}) - .legalFor({MinFPScalar, s32, s64, v2s32, v4s32, v2s64}) - .legalIf([=](const LegalityQuery &Query) { - const auto &Ty = Query.Types[0]; - return (Ty == v8s16 || Ty == v4s16) && HasFP16; - }) + .legalFor({s32, s64, v2s32, v4s32, v2s64}) + .legalFor(HasFP16, {s16, v4s16, v8s16}) .libcallFor({s128}) .scalarizeIf(scalarOrEltWiderThan(0, 64), 0) .minScalarOrElt(0, MinFPScalar) @@ -261,11 +249,8 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) .moreElementsToNextPow2(0); getActionDefinitionsBuilder({G_FABS, G_FNEG}) - .legalFor({MinFPScalar, s32, s64, v2s32, v4s32, v2s64}) - .legalIf([=](const LegalityQuery &Query) { - const auto &Ty = Query.Types[0]; - return (Ty == v8s16 || Ty == v4s16) && HasFP16; - }) + .legalFor({s32, s64, v2s32, v4s32, v2s64}) + .legalFor(HasFP16, {s16, v4s16, v8s16}) .scalarizeIf(scalarOrEltWiderThan(0, 64), 0) .lowerIf(scalarOrEltWiderThan(0, 64)) .clampNumElements(0, v4s16, v8s16) @@ -350,31 +335,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) return ValTy.isPointerVector() && ValTy.getAddressSpace() == 0; }; - auto &LoadActions = getActionDefinitionsBuilder(G_LOAD); - auto &StoreActions = getActionDefinitionsBuilder(G_STORE); - - if (ST.hasSVE()) { - LoadActions.legalForTypesWithMemDesc({ - // 128 bit base sizes - {nxv16s8, p0, nxv16s8, 8}, - {nxv8s16, p0, nxv8s16, 8}, - {nxv4s32, p0, nxv4s32, 8}, - {nxv2s64, p0, nxv2s64, 8}, - }); - - // TODO: Add nxv2p0. Consider bitcastIf. 
- // See #92130 - // https://github.com/llvm/llvm-project/pull/92130#discussion_r1616888461 - StoreActions.legalForTypesWithMemDesc({ - // 128 bit base sizes - {nxv16s8, p0, nxv16s8, 8}, - {nxv8s16, p0, nxv8s16, 8}, - {nxv4s32, p0, nxv4s32, 8}, - {nxv2s64, p0, nxv2s64, 8}, - }); - } - - LoadActions + getActionDefinitionsBuilder(G_LOAD) .customIf([=](const LegalityQuery &Query) { return HasRCPC3 && Query.Types[0] == s128 && Query.MMODescrs[0].Ordering == AtomicOrdering::Acquire; @@ -399,6 +360,13 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) // These extends are also legal .legalForTypesWithMemDesc( {{s32, p0, s8, 8}, {s32, p0, s16, 8}, {s64, p0, s32, 8}}) + .legalForTypesWithMemDesc({ + // SVE vscale x 128 bit base sizes + {nxv16s8, p0, nxv16s8, 8}, + {nxv8s16, p0, nxv8s16, 8}, + {nxv4s32, p0, nxv4s32, 8}, + {nxv2s64, p0, nxv2s64, 8}, + }) .widenScalarToNextPow2(0, /* MinSize = */ 8) .clampMaxNumElements(0, s8, 16) .clampMaxNumElements(0, s16, 8) @@ -425,7 +393,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) .scalarizeIf(typeInSet(0, {v2s16, v2s8}), 0) .scalarizeIf(scalarOrEltWiderThan(0, 64), 0); - StoreActions + getActionDefinitionsBuilder(G_STORE) .customIf([=](const LegalityQuery &Query) { return HasRCPC3 && Query.Types[0] == s128 && Query.MMODescrs[0].Ordering == AtomicOrdering::Release; @@ -445,6 +413,16 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) {p0, p0, s64, 8}, {s128, p0, s128, 8}, {v16s8, p0, s128, 8}, {v8s8, p0, s64, 8}, {v4s16, p0, s64, 8}, {v8s16, p0, s128, 8}, {v2s32, p0, s64, 8}, {v4s32, p0, s128, 8}, {v2s64, p0, s128, 8}}) + .legalForTypesWithMemDesc({ + // SVE vscale x 128 bit base sizes + // TODO: Add nxv2p0. Consider bitcastIf. + // See #92130 + // https://github.com/llvm/llvm-project/pull/92130#discussion_r1616888461 + {nxv16s8, p0, nxv16s8, 8}, + {nxv8s16, p0, nxv8s16, 8}, + {nxv4s32, p0, nxv4s32, 8}, + {nxv2s64, p0, nxv2s64, 8}, + }) .clampScalar(0, s8, s64) .lowerIf([=](const LegalityQuery &Query) { return Query.Types[0].isScalar() && @@ -532,12 +510,8 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) .widenScalarToNextPow2(0) .clampScalar(0, s8, s64); getActionDefinitionsBuilder(G_FCONSTANT) - .legalIf([=](const LegalityQuery &Query) { - const auto &Ty = Query.Types[0]; - if (HasFP16 && Ty == s16) - return true; - return Ty == s32 || Ty == s64 || Ty == s128; - }) + .legalFor({s32, s64, s128}) + .legalFor(HasFP16, {s16}) .clampScalar(0, MinFPScalar, s128); // FIXME: fix moreElementsToNextPow2 @@ -569,16 +543,12 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) .customIf(isVector(0)); getActionDefinitionsBuilder(G_FCMP) - .legalFor({{s32, MinFPScalar}, - {s32, s32}, + .legalFor({{s32, s32}, {s32, s64}, {v4s32, v4s32}, {v2s32, v2s32}, {v2s64, v2s64}}) - .legalIf([=](const LegalityQuery &Query) { - const auto &Ty = Query.Types[1]; - return (Ty == v8s16 || Ty == v4s16) && Ty == Query.Types[0] && HasFP16; - }) + .legalFor(HasFP16, {{s32, s16}, {v4s16, v4s16}, {v8s16, v8s16}}) .widenScalarOrEltToNextPow2(1) .clampScalar(0, s32, s32) .minScalarOrElt(1, MinFPScalar) @@ -693,13 +663,8 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) {v2s64, v2s64}, {v4s32, v4s32}, {v2s32, v2s32}}) - .legalIf([=](const LegalityQuery &Query) { - return HasFP16 && - (Query.Types[1] == s16 || Query.Types[1] == v4s16 || - Query.Types[1] == v8s16) && - (Query.Types[0] == s32 || Query.Types[0] == s64 || - Query.Types[0] == v4s16 || 
Query.Types[0] == v8s16); - }) + .legalFor(HasFP16, + {{s32, s16}, {s64, s16}, {v4s16, v4s16}, {v8s16, v8s16}}) .scalarizeIf(scalarOrEltWiderThan(0, 64), 0) .scalarizeIf(scalarOrEltWiderThan(1, 64), 1) // The range of a fp16 value fits into an i17, so we can lower the width @@ -741,13 +706,8 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) {v2s64, v2s64}, {v4s32, v4s32}, {v2s32, v2s32}}) - .legalIf([=](const LegalityQuery &Query) { - return HasFP16 && - (Query.Types[1] == s16 || Query.Types[1] == v4s16 || - Query.Types[1] == v8s16) && - (Query.Types[0] == s32 || Query.Types[0] == s64 || - Query.Types[0] == v4s16 || Query.Types[0] == v8s16); - }) + .legalFor(HasFP16, + {{s32, s16}, {s64, s16}, {v4s16, v4s16}, {v8s16, v8s16}}) // Handle types larger than i64 by scalarizing/lowering. .scalarizeIf(scalarOrEltWiderThan(0, 64), 0) .scalarizeIf(scalarOrEltWiderThan(1, 64), 1) @@ -790,13 +750,8 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) {v2s64, v2s64}, {v4s32, v4s32}, {v2s32, v2s32}}) - .legalIf([=](const LegalityQuery &Query) { - return HasFP16 && - (Query.Types[0] == s16 || Query.Types[0] == v4s16 || - Query.Types[0] == v8s16) && - (Query.Types[1] == s32 || Query.Types[1] == s64 || - Query.Types[1] == v4s16 || Query.Types[1] == v8s16); - }) + .legalFor(HasFP16, + {{s16, s32}, {s16, s64}, {v4s16, v4s16}, {v8s16, v8s16}}) .scalarizeIf(scalarOrEltWiderThan(1, 64), 1) .scalarizeIf(scalarOrEltWiderThan(0, 64), 0) .moreElementsToNextPow2(1) @@ -893,29 +848,21 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) .lowerIf( all(typeInSet(0, {s8, s16, s32, s64, s128}), typeIs(2, p0))); - LegalityPredicate UseOutlineAtomics = [&ST](const LegalityQuery &Query) { - return ST.outlineAtomics() && !ST.hasLSE(); - }; + bool UseOutlineAtomics = ST.outlineAtomics() && !ST.hasLSE(); getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG) - .legalIf(all(typeInSet(0, {s32, s64}), typeIs(1, p0), - predNot(UseOutlineAtomics))) - .customIf(all(typeIs(0, s128), predNot(UseOutlineAtomics))) - .customIf([UseOutlineAtomics](const LegalityQuery &Query) { - return Query.Types[0].getSizeInBits() == 128 && - !UseOutlineAtomics(Query); - }) - .libcallIf(all(typeInSet(0, {s8, s16, s32, s64, s128}), typeIs(1, p0), - UseOutlineAtomics)) + .legalFor(!UseOutlineAtomics, {{s32, p0}, {s64, p0}}) + .customFor(!UseOutlineAtomics, {{s128, p0}}) + .libcallFor(UseOutlineAtomics, + {{s8, p0}, {s16, p0}, {s32, p0}, {s64, p0}, {s128, p0}}) .clampScalar(0, s32, s64); getActionDefinitionsBuilder({G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB, G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR}) - .legalIf(all(typeInSet(0, {s32, s64}), typeIs(1, p0), - predNot(UseOutlineAtomics))) - .libcallIf(all(typeInSet(0, {s8, s16, s32, s64}), typeIs(1, p0), - UseOutlineAtomics)) + .legalFor(!UseOutlineAtomics, {{s32, p0}, {s64, p0}}) + .libcallFor(UseOutlineAtomics, + {{s8, p0}, {s16, p0}, {s32, p0}, {s64, p0}}) .clampScalar(0, s32, s64); // Do not outline these atomics operations, as per comment in @@ -1050,12 +997,8 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) .widenScalarToNextPow2(1, /*Min=*/32) .clampScalar(1, s32, s64) .scalarSameSizeAs(0, 1) - .legalIf([=](const LegalityQuery &Query) { - return (HasCSSC && typeInSet(0, {s32, s64})(Query)); - }) - .customIf([=](const LegalityQuery &Query) { - return (!HasCSSC && typeInSet(0, {s32, s64})(Query)); - }); + .legalFor(HasCSSC, {s32, s64}) + .customFor(!HasCSSC, {s32, s64}); 
getActionDefinitionsBuilder(G_SHUFFLE_VECTOR) .legalIf([=](const LegalityQuery &Query) { @@ -1143,11 +1086,9 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) } // FIXME: Legal vector types are only legal with NEON. - auto &ABSActions = getActionDefinitionsBuilder(G_ABS); - if (HasCSSC) - ABSActions - .legalFor({s32, s64}); - ABSActions.legalFor(PackedVectorAllTypeList) + getActionDefinitionsBuilder(G_ABS) + .legalFor(HasCSSC, {s32, s64}) + .legalFor(PackedVectorAllTypeList) .customIf([=](const LegalityQuery &Q) { // TODO: Fix suboptimal codegen for 128+ bit types. LLT SrcTy = Q.Types[0]; @@ -1171,10 +1112,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) // later. getActionDefinitionsBuilder(G_VECREDUCE_FADD) .legalFor({{s32, v2s32}, {s32, v4s32}, {s64, v2s64}}) - .legalIf([=](const LegalityQuery &Query) { - const auto &Ty = Query.Types[1]; - return (Ty == v4s16 || Ty == v8s16) && HasFP16; - }) + .legalFor(HasFP16, {{s16, v4s16}, {s16, v8s16}}) .minScalarOrElt(0, MinFPScalar) .clampMaxNumElements(1, s64, 2) .clampMaxNumElements(1, s32, 4) @@ -1215,10 +1153,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) getActionDefinitionsBuilder({G_VECREDUCE_FMIN, G_VECREDUCE_FMAX, G_VECREDUCE_FMINIMUM, G_VECREDUCE_FMAXIMUM}) .legalFor({{s32, v4s32}, {s32, v2s32}, {s64, v2s64}}) - .legalIf([=](const LegalityQuery &Query) { - const auto &Ty = Query.Types[1]; - return Query.Types[0] == s16 && (Ty == v8s16 || Ty == v4s16) && HasFP16; - }) + .legalFor(HasFP16, {{s16, v4s16}, {s16, v8s16}}) .minScalarOrElt(0, MinFPScalar) .clampMaxNumElements(1, s64, 2) .clampMaxNumElements(1, s32, 4) @@ -1295,32 +1230,16 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) .customFor({{s32, s32}, {s64, s64}}); auto always = [=](const LegalityQuery &Q) { return true; }; - auto &CTPOPActions = getActionDefinitionsBuilder(G_CTPOP); - if (HasCSSC) - CTPOPActions - .legalFor({{s32, s32}, - {s64, s64}, - {v8s8, v8s8}, - {v16s8, v16s8}}) - .customFor({{s128, s128}, - {v2s64, v2s64}, - {v2s32, v2s32}, - {v4s32, v4s32}, - {v4s16, v4s16}, - {v8s16, v8s16}}); - else - CTPOPActions - .legalFor({{v8s8, v8s8}, - {v16s8, v16s8}}) - .customFor({{s32, s32}, - {s64, s64}, - {s128, s128}, - {v2s64, v2s64}, - {v2s32, v2s32}, - {v4s32, v4s32}, - {v4s16, v4s16}, - {v8s16, v8s16}}); - CTPOPActions + getActionDefinitionsBuilder(G_CTPOP) + .legalFor(HasCSSC, {{s32, s32}, {s64, s64}}) + .legalFor({{v8s8, v8s8}, {v16s8, v16s8}}) + .customFor(!HasCSSC, {{s32, s32}, {s64, s64}}) + .customFor({{s128, s128}, + {v2s64, v2s64}, + {v2s32, v2s32}, + {v4s32, v4s32}, + {v4s16, v4s16}, + {v8s16, v8s16}}) .clampScalar(0, s32, s128) .widenScalarToNextPow2(0) .minScalarEltSameAsIf(always, 1, 0) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp index ea88ed424dc597e55f67e70efebeea3088a46829..ee5e75955cd459f89a2069695de85bdd756e0b78 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp @@ -407,16 +407,14 @@ Value *AMDGPUAtomicOptimizerImpl::buildReduction(IRBuilder<> &B, Value *const Identity) const { Type *AtomicTy = V->getType(); Module *M = B.GetInsertBlock()->getModule(); - Function *UpdateDPP = Intrinsic::getOrInsertDeclaration( - M, Intrinsic::amdgcn_update_dpp, AtomicTy); // Reduce within each row of 16 lanes. 
for (unsigned Idx = 0; Idx < 4; Idx++) { V = buildNonAtomicBinOp( B, Op, V, - B.CreateCall(UpdateDPP, - {Identity, V, B.getInt32(DPP::ROW_XMASK0 | 1 << Idx), - B.getInt32(0xf), B.getInt32(0xf), B.getFalse()})); + B.CreateIntrinsic(Intrinsic::amdgcn_update_dpp, AtomicTy, + {Identity, V, B.getInt32(DPP::ROW_XMASK0 | 1 << Idx), + B.getInt32(0xf), B.getInt32(0xf), B.getFalse()})); } // Reduce within each pair of rows (i.e. 32 lanes). diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp index 7d3164c79089e0e7fca04d935dd87696436592a1..c49aab823b44a4ca3e902070dfadebb4f0cf4682 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp @@ -576,10 +576,9 @@ bool AMDGPUCodeGenPrepareImpl::promoteUniformBitreverseToI32( Builder.SetCurrentDebugLocation(I.getDebugLoc()); Type *I32Ty = getI32Ty(Builder, I.getType()); - Function *I32 = - Intrinsic::getOrInsertDeclaration(Mod, Intrinsic::bitreverse, {I32Ty}); Value *ExtOp = Builder.CreateZExt(I.getOperand(0), I32Ty); - Value *ExtRes = Builder.CreateCall(I32, { ExtOp }); + Value *ExtRes = + Builder.CreateIntrinsic(Intrinsic::bitreverse, {I32Ty}, {ExtOp}); Value *LShrOp = Builder.CreateLShr(ExtRes, 32 - getBaseElementBitWidth(I.getType())); Value *TruncRes = @@ -1260,9 +1259,8 @@ Value *AMDGPUCodeGenPrepareImpl::expandDivRem24Impl( Value *FB = IsSigned ? Builder.CreateSIToFP(IB,F32Ty) : Builder.CreateUIToFP(IB,F32Ty); - Function *RcpDecl = Intrinsic::getOrInsertDeclaration( - Mod, Intrinsic::amdgcn_rcp, Builder.getFloatTy()); - Value *RCP = Builder.CreateCall(RcpDecl, { FB }); + Value *RCP = Builder.CreateIntrinsic(Intrinsic::amdgcn_rcp, + Builder.getFloatTy(), {FB}); Value *FQM = Builder.CreateFMul(FA, RCP); // fq = trunc(fqm); @@ -1455,9 +1453,7 @@ Value *AMDGPUCodeGenPrepareImpl::expandDivRem32(IRBuilder<> &Builder, // Initial estimate of inv(y). 
Value *FloatY = Builder.CreateUIToFP(Y, F32Ty); - Function *Rcp = - Intrinsic::getOrInsertDeclaration(Mod, Intrinsic::amdgcn_rcp, F32Ty); - Value *RcpY = Builder.CreateCall(Rcp, {FloatY}); + Value *RcpY = Builder.CreateIntrinsic(Intrinsic::amdgcn_rcp, F32Ty, {FloatY}); Constant *Scale = ConstantFP::get(F32Ty, llvm::bit_cast<float>(0x4F7FFFFE)); Value *ScaledY = Builder.CreateFMul(RcpY, Scale); Value *Z = Builder.CreateFPToUI(ScaledY, I32Ty); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td index b2a3f9392157d1695c0e242756a2514d8e43e887..985fa8f1deff944401b99409875bf06efed981bd 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td @@ -169,5 +169,6 @@ def AMDGPURegBankCombiner : GICombiner< "AMDGPURegBankCombinerImpl", [unmerge_merge, unmerge_cst, unmerge_undef, zext_trunc_fold, int_minmax_to_med3, ptr_add_immed_chain, - fp_minmax_to_clamp, fp_minmax_to_med3, fmed3_intrinsic_to_clamp]> { + fp_minmax_to_clamp, fp_minmax_to_med3, fmed3_intrinsic_to_clamp, + redundant_and]> { } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUImageIntrinsicOptimizer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUImageIntrinsicOptimizer.cpp index e48fed025857fa0250a9c28b79f0f18b3534d83b..179d8aa46f802ca07fb4bb77ce1f61100946d733 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUImageIntrinsicOptimizer.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUImageIntrinsicOptimizer.cpp @@ -237,12 +237,10 @@ bool optimizeSection(ArrayRef<SmallVector<IntrinsicInst *, 4>> MergeableInsts) { else NewIntrinID = Intrinsic::amdgcn_image_msaa_load_2darraymsaa; - Function *NewIntrin = Intrinsic::getOrInsertDeclaration( - IIList.front()->getModule(), NewIntrinID, OverloadTys); Args[ImageDimIntr->DMaskIndex] = ConstantInt::get(DMask->getType(), NewMaskVal); Args[FragIdIndex] = ConstantInt::get(FragId->getType(), NewFragIdVal); - CallInst *NewCall = B.CreateCall(NewIntrin, Args); + CallInst *NewCall = B.CreateIntrinsic(NewIntrinID, OverloadTys, Args); LLVM_DEBUG(dbgs() << "Optimize: " << *NewCall << "\n"); NewCalls.push_back(NewCall); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp index 6a5a48778197e47f0529f6a114d1275fa9984956..8beb9defee66a04e2f66461e7ffffc32e70115a8 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp @@ -130,10 +130,7 @@ static std::optional<Instruction *> modifyIntrinsicCall( // Modify arguments and types Func(Args, ArgTys); - Function *I = - Intrinsic::getOrInsertDeclaration(OldIntr.getModule(), NewIntr, ArgTys); - - CallInst *NewCall = IC.Builder.CreateCall(I, Args); + CallInst *NewCall = IC.Builder.CreateIntrinsic(NewIntr, ArgTys, Args); NewCall->takeName(&OldIntr); NewCall->copyMetadata(OldIntr); if (isa<FPMathOperator>(NewCall)) @@ -891,12 +888,11 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { // register (which contains the bitmask of live threads). So a // comparison that always returns true is the same as a read of the // EXEC register.
- Function *NewF = Intrinsic::getOrInsertDeclaration( - II.getModule(), Intrinsic::read_register, II.getType()); Metadata *MDArgs[] = {MDString::get(II.getContext(), "exec")}; MDNode *MD = MDNode::get(II.getContext(), MDArgs); Value *Args[] = {MetadataAsValue::get(II.getContext(), MD)}; - CallInst *NewCall = IC.Builder.CreateCall(NewF, Args); + CallInst *NewCall = IC.Builder.CreateIntrinsic(Intrinsic::read_register, + II.getType(), Args); NewCall->addFnAttr(Attribute::Convergent); NewCall->takeName(&II); return IC.replaceInstUsesWith(II, NewCall); @@ -990,11 +986,10 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { } else if (!Ty->isFloatTy() && !Ty->isDoubleTy() && !Ty->isHalfTy()) break; - Function *NewF = Intrinsic::getOrInsertDeclaration( - II.getModule(), NewIID, {II.getType(), SrcLHS->getType()}); Value *Args[] = {SrcLHS, SrcRHS, ConstantInt::get(CC->getType(), SrcPred)}; - CallInst *NewCall = IC.Builder.CreateCall(NewF, Args); + CallInst *NewCall = IC.Builder.CreateIntrinsic( + NewIID, {II.getType(), SrcLHS->getType()}, Args); NewCall->takeName(&II); return IC.replaceInstUsesWith(II, NewCall); } @@ -1402,9 +1397,8 @@ static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC, Args[0] = IC.Builder.CreateShuffleVector(II.getOperand(0), EltMask); } - Function *NewIntrin = Intrinsic::getOrInsertDeclaration( - II.getModule(), II.getIntrinsicID(), OverloadTys); - CallInst *NewCall = IC.Builder.CreateCall(NewIntrin, Args); + CallInst *NewCall = + IC.Builder.CreateIntrinsic(II.getIntrinsicID(), OverloadTys, Args); NewCall->takeName(&II); NewCall->copyMetadata(II); diff --git a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp index eb553ae4eb80ff975870456b4326d79f98a44792..26d8ce77d9a9b2e26e3398862e3be01eed76cd63 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp @@ -788,7 +788,8 @@ bool AMDGPULibCalls::fold(CallInst *CI) { B.CreateFPToSI(FPOp->getOperand(1), PownType->getParamType(1)); // Have to drop any nofpclass attributes on the original call site. Call->removeParamAttrs( - 1, AttributeFuncs::typeIncompatible(CastedArg->getType())); + 1, AttributeFuncs::typeIncompatible(CastedArg->getType(), + Call->getParamAttributes(1))); Call->setCalledFunction(PownFunc); Call->setArgOperand(1, CastedArg); return fold_pow(FPOp, B, PownInfo) || true; diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp index d16c96f88e7b1e7e46319c530dd470dd8e55397e..6573176492b7f3c5cbc07e7d92b648b2acb83706 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp @@ -171,8 +171,8 @@ public: // Try to allocate SGPRs to preload implicit kernel arguments. 
void tryAllocImplicitArgPreloadSGPRs(uint64_t ImplicitArgsBaseOffset, IRBuilder<> &Builder) { - StringRef Name = Intrinsic::getName(Intrinsic::amdgcn_implicitarg_ptr); - Function *ImplicitArgPtr = F.getParent()->getFunction(Name); + Function *ImplicitArgPtr = Intrinsic::getDeclarationIfExists( + F.getParent(), Intrinsic::amdgcn_implicitarg_ptr); if (!ImplicitArgPtr) return; diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp index 7d66d07c9d0fb7b4e1256be65fc9664b41870823..1bb5e794da7dd6fba037b9d39a5740a4b2026888 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp @@ -78,8 +78,7 @@ public: Function *getBasePtrIntrinsic(Module &M, bool IsV5OrAbove) { auto IntrinsicId = IsV5OrAbove ? Intrinsic::amdgcn_implicitarg_ptr : Intrinsic::amdgcn_dispatch_ptr; - StringRef Name = Intrinsic::getName(IntrinsicId); - return M.getFunction(Name); + return Intrinsic::getDeclarationIfExists(&M, IntrinsicId); } } // end anonymous namespace diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp index ff5eb81490106f8dfdf18addccf14a1167f93db4..5791daed00651fc89741b7bf14dac837c476d686 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp @@ -529,13 +529,11 @@ public: // block to spare deduplicating it later. auto [It, Inserted] = tableKernelIndexCache.try_emplace(F); if (Inserted) { - Function *Decl = Intrinsic::getOrInsertDeclaration( - &M, Intrinsic::amdgcn_lds_kernel_id, {}); - auto InsertAt = F->getEntryBlock().getFirstNonPHIOrDbgOrAlloca(); IRBuilder<> Builder(&*InsertAt); - It->second = Builder.CreateCall(Decl, {}); + It->second = + Builder.CreateIntrinsic(Intrinsic::amdgcn_lds_kernel_id, {}, {}); } return It->second; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp index 63da3443479be3bb5aa0e9722f2cd7ed5aae44dc..f8744d6a483cffec269a2d96a78b1ea4be62c558 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp @@ -973,13 +973,10 @@ AMDGPUPromoteAllocaImpl::getLocalSizeYZ(IRBuilder<> &Builder) { const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F); if (!IsAMDHSA) { - Function *LocalSizeYFn = Intrinsic::getOrInsertDeclaration( - Mod, Intrinsic::r600_read_local_size_y); - Function *LocalSizeZFn = Intrinsic::getOrInsertDeclaration( - Mod, Intrinsic::r600_read_local_size_z); - - CallInst *LocalSizeY = Builder.CreateCall(LocalSizeYFn, {}); - CallInst *LocalSizeZ = Builder.CreateCall(LocalSizeZFn, {}); + CallInst *LocalSizeY = + Builder.CreateIntrinsic(Intrinsic::r600_read_local_size_y, {}, {}); + CallInst *LocalSizeZ = + Builder.CreateIntrinsic(Intrinsic::r600_read_local_size_z, {}, {}); ST.makeLIDRangeMetadata(LocalSizeY); ST.makeLIDRangeMetadata(LocalSizeZ); @@ -1021,10 +1018,8 @@ AMDGPUPromoteAllocaImpl::getLocalSizeYZ(IRBuilder<> &Builder) { // hsa_signal_t completion_signal; // uint64_t wrapper // } hsa_kernel_dispatch_packet_t // - Function *DispatchPtrFn = - Intrinsic::getOrInsertDeclaration(Mod, Intrinsic::amdgcn_dispatch_ptr); - - CallInst *DispatchPtr = Builder.CreateCall(DispatchPtrFn, {}); + CallInst *DispatchPtr = + Builder.CreateIntrinsic(Intrinsic::amdgcn_dispatch_ptr, {}, {}); DispatchPtr->addRetAttr(Attribute::NoAlias); DispatchPtr->addRetAttr(Attribute::NonNull); F.removeFnAttr("amdgpu-no-dispatch-ptr"); 
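The lookups rewritten above move from name-based search to the new Intrinsic::getDeclarationIfExists, which returns nullptr when the intrinsic was never declared in the module and, unlike getOrInsertDeclaration, never creates a declaration as a side effect. A short sketch under those assumptions (the helper name is illustrative):

    #include "llvm/IR/Intrinsics.h"
    #include "llvm/IR/Module.h"
    using namespace llvm;

    bool moduleDeclaresIntrinsic(Module &M, Intrinsic::ID ID) {
      // Old idiom: M.getFunction(Intrinsic::getName(ID)), which builds the
      // mangled name and then searches the symbol table.
      Function *F = Intrinsic::getDeclarationIfExists(&M, ID);
      return F != nullptr;
    }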
@@ -1564,13 +1559,10 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToLDS(AllocaInst &I, continue; case Intrinsic::objectsize: { Value *Src = Intr->getOperand(0); - Function *ObjectSize = Intrinsic::getOrInsertDeclaration( - Mod, Intrinsic::objectsize, - {Intr->getType(), - PointerType::get(Context, AMDGPUAS::LOCAL_ADDRESS)}); - CallInst *NewCall = Builder.CreateCall( - ObjectSize, + CallInst *NewCall = Builder.CreateIntrinsic( + Intrinsic::objectsize, + {Intr->getType(), PointerType::get(Context, AMDGPUAS::LOCAL_ADDRESS)}, {Src, Intr->getOperand(1), Intr->getOperand(2), Intr->getOperand(3)}); Intr->replaceAllUsesWith(NewCall); Intr->eraseFromParent(); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp index 1ee3c40d69a3b35f5e35f3c6ade673bf5395eff0..9087442caf8d0be9bb840abcafd9ec5fc3684b4a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp @@ -146,44 +146,10 @@ AMDGPUResourceUsageAnalysis::analyzeResourceUsage( // count easily. // A tail call isn't considered a call for MachineFrameInfo's purposes. if (!FrameInfo.hasCalls() && !FrameInfo.hasTailCall()) { - MCPhysReg HighestVGPRReg = AMDGPU::NoRegister; - for (MCPhysReg Reg : reverse(AMDGPU::VGPR_32RegClass.getRegisters())) { - if (MRI.isPhysRegUsed(Reg)) { - HighestVGPRReg = Reg; - break; - } - } - - if (ST.hasMAIInsts()) { - MCPhysReg HighestAGPRReg = AMDGPU::NoRegister; - for (MCPhysReg Reg : reverse(AMDGPU::AGPR_32RegClass.getRegisters())) { - if (MRI.isPhysRegUsed(Reg)) { - HighestAGPRReg = Reg; - break; - } - } - Info.NumAGPR = HighestAGPRReg == AMDGPU::NoRegister - ? 0 - : TRI.getHWRegIndex(HighestAGPRReg) + 1; - } - - MCPhysReg HighestSGPRReg = AMDGPU::NoRegister; - for (MCPhysReg Reg : reverse(AMDGPU::SGPR_32RegClass.getRegisters())) { - if (MRI.isPhysRegUsed(Reg)) { - HighestSGPRReg = Reg; - break; - } - } - - // We found the maximum register index. They start at 0, so add one to get - // the number of registers. - Info.NumVGPR = HighestVGPRReg == AMDGPU::NoRegister - ? 0 - : TRI.getHWRegIndex(HighestVGPRReg) + 1; - Info.NumExplicitSGPR = HighestSGPRReg == AMDGPU::NoRegister - ? 
0 - : TRI.getHWRegIndex(HighestSGPRReg) + 1; - + Info.NumVGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::VGPR_32RegClass); + Info.NumExplicitSGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::SGPR_32RegClass); + if (ST.hasMAIInsts()) + Info.NumAGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::AGPR_32RegClass); return Info; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp index cfce56f0bfe968571e31908099c570437117aac8..dfa91904a734d0cde9b340df02b99a89258c9bb8 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp @@ -921,9 +921,8 @@ void AMDGPUSwLowerLDS::lowerKernelLDSAccesses(Function *Func, FunctionCallee AsanFreeFunc = M.getOrInsertFunction( StringRef("__asan_free_impl"), FunctionType::get(IRB.getVoidTy(), {Int64Ty, Int64Ty}, false)); - Value *ReturnAddr = IRB.CreateCall( - Intrinsic::getOrInsertDeclaration(&M, Intrinsic::returnaddress), - IRB.getInt32(0)); + Value *ReturnAddr = + IRB.CreateIntrinsic(Intrinsic::returnaddress, {}, IRB.getInt32(0)); Value *RAPToInt = IRB.CreatePtrToInt(ReturnAddr, Int64Ty); Value *MallocPtrToInt = IRB.CreatePtrToInt(LoadMallocPtr, Int64Ty); IRB.CreateCall(AsanFreeFunc, {MallocPtrToInt, RAPToInt}); @@ -1056,9 +1055,7 @@ void AMDGPUSwLowerLDS::lowerNonKernelLDSAccesses( SetVector<Instruction *> LDSInstructions; getLDSMemoryInstructions(Func, LDSInstructions); - Function *Decl = Intrinsic::getOrInsertDeclaration( - &M, Intrinsic::amdgcn_lds_kernel_id, {}); - auto *KernelId = IRB.CreateCall(Decl, {}); + auto *KernelId = IRB.CreateIntrinsic(Intrinsic::amdgcn_lds_kernel_id, {}, {}); GlobalVariable *LDSBaseTable = NKLDSParams.LDSBaseTable; GlobalVariable *LDSOffsetTable = NKLDSParams.LDSOffsetTable; auto &OrdereLDSGlobals = NKLDSParams.OrdereLDSGlobals; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 16e23879cd735c89ce62b224a28419d735e919d1..e4cc522194f2a92130aa3acf3fcf6fdcac1b88e4 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -1335,7 +1335,7 @@ void GCNPassConfig::addMachineSSAOptimization() { bool GCNPassConfig::addILPOpts() { if (EnableEarlyIfConversion) - addPass(&EarlyIfConverterID); + addPass(&EarlyIfConverterLegacyID); TargetPassConfig::addILPOpts(); return false; @@ -1718,17 +1718,6 @@ bool GCNTargetMachine::parseMachineFunctionInfo( MFI->reserveWWMRegister(ParsedReg); } - for (const auto &[_, Info] : PFS.VRegInfosNamed) { - for (uint8_t Flag : Info->Flags) { - MFI->setFlag(Info->VReg, Flag); - } - } - for (const auto &[_, Info] : PFS.VRegInfos) { - for (uint8_t Flag : Info->Flags) { - MFI->setFlag(Info->VReg, Flag); - } - } - auto parseAndCheckArgument = [&](const std::optional<yaml::SIArgument> &A, const TargetRegisterClass &RC, ArgDescriptor &Arg, unsigned UserSGPRs, @@ -1994,6 +1983,13 @@ void AMDGPUCodeGenPassBuilder::addPreISel(AddIRPass &addPass) const { addPass(RequireAnalysisPass<UniformityInfoAnalysis, Function>()); } +void AMDGPUCodeGenPassBuilder::addILPOpts(AddMachinePass &addPass) const { + if (EnableEarlyIfConversion) + addPass(EarlyIfConverterPass()); + + Base::addILPOpts(addPass); +} + void AMDGPUCodeGenPassBuilder::addAsmPrinter(AddMachinePass &addPass, CreateMCStreamer) const { // TODO: Add AsmPrinter.
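The resource-usage hunk above replaces three hand-rolled highest-register scans with TRI.getNumUsedPhysRegs(); its definition lands in the SIRegisterInfo.cpp hunk further down. A sketch of the scan it centralizes, mirroring that hunk (the wrapper name is illustrative):

    // Walk the class from the highest register down; the first used register
    // yields the count, since hardware indices start at 0.
    unsigned countUsedRegs(const SIRegisterInfo &TRI,
                           const MachineRegisterInfo &MRI,
                           const TargetRegisterClass &RC) {
      for (MCPhysReg Reg : llvm::reverse(RC.getRegisters()))
        if (MRI.isPhysRegUsed(Reg))
          return TRI.getHWRegIndex(Reg) + 1;
      return 0; // no register of this class is used
    }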
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h index af8476bc21ec615670f3ee21c959a793137b2518..d8a5111e5898d72db21a917f1843aefa98de8ebe 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h @@ -172,6 +172,7 @@ public: void addIRPasses(AddIRPass &) const; void addCodeGenPrepare(AddIRPass &) const; void addPreISel(AddIRPass &addPass) const; + void addILPOpts(AddMachinePass &) const; void addAsmPrinter(AddMachinePass &, CreateMCStreamer) const; Error addInstSelector(AddMachinePass &) const; void addMachineSSAOptimization(AddMachinePass &) const; diff --git a/llvm/lib/Target/AMDGPU/R600FrameLowering.h b/llvm/lib/Target/AMDGPU/R600FrameLowering.h index f171bc4fea781f79d22ac7a15d45c8a4f2fd1a91..c4621174acaba1375f7163260f65c08ac2c6d044 100644 --- a/llvm/lib/Target/AMDGPU/R600FrameLowering.h +++ b/llvm/lib/Target/AMDGPU/R600FrameLowering.h @@ -27,9 +27,8 @@ public: StackOffset getFrameIndexReference(const MachineFunction &MF, int FI, Register &FrameReg) const override; - bool hasFP(const MachineFunction &MF) const override { - return false; - } +protected: + bool hasFPImpl(const MachineFunction &MF) const override { return false; } }; } // end namespace llvm diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp index bc162b0953a7beccbf207224241d4465496eaed0..13a2db7a87b4373020fc2f39839e52a7552a5191 100644 --- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -1805,7 +1805,7 @@ static bool frameTriviallyRequiresSP(const MachineFrameInfo &MFI) { // The FP for kernels is always known 0, so we never really need to setup an // explicit register for it. However, DisableFramePointerElim will force us to // use a register for it. 
-bool SIFrameLowering::hasFP(const MachineFunction &MF) const { +bool SIFrameLowering::hasFPImpl(const MachineFunction &MF) const { const MachineFrameInfo &MFI = MF.getFrameInfo(); // For entry & chain functions we can use an immediate offset in most cases, diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.h b/llvm/lib/Target/AMDGPU/SIFrameLowering.h index b3feb759ed811fbec1a49c11a9346520ad05eab6..938c75099a3bc30efb722eaa6c94518685e00036 100644 --- a/llvm/lib/Target/AMDGPU/SIFrameLowering.h +++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.h @@ -66,6 +66,9 @@ public: MachineBasicBlock &MBB, MachineBasicBlock::iterator MI) const override; +protected: + bool hasFPImpl(const MachineFunction &MF) const override; + private: void emitEntryFunctionFlatScratchInit(MachineFunction &MF, MachineBasicBlock &MBB, @@ -82,8 +85,6 @@ private: Register ScratchWaveOffsetReg) const; public: - bool hasFP(const MachineFunction &MF) const override; - bool requiresStackPointerReference(const MachineFunction &MF) const; }; diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 8c197f231496128da1fa065e010278d5c8872b84..de9173e923ab5cea674668f8a42c49e4bdb49f34 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -8786,7 +8786,7 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, const Module *M = MF.getFunction().getParent(); const GlobalValue *GV = - M->getNamedValue(Intrinsic::getName(Intrinsic::amdgcn_groupstaticsize)); + Intrinsic::getDeclarationIfExists(M, Intrinsic::amdgcn_groupstaticsize); SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, 0, SIInstrInfo::MO_ABS32_LO); return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0}; diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index d676d561d081807a1556c81c3d3b69465c11e593..76c1ea4e74207ac272b7fb84907023d5e0e7e0eb 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -3441,7 +3441,8 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, : AMDGPU::V_MOV_B32_e32 : Is64Bit ? AMDGPU::S_MOV_B64_IMM_PSEUDO : AMDGPU::S_MOV_B32; - APInt Imm(Is64Bit ? 64 : 32, getImmFor(UseMI.getOperand(1))); + APInt Imm(Is64Bit ? 
64 : 32, getImmFor(UseMI.getOperand(1)), + /*isSigned=*/true, /*implicitTrunc=*/true); if (RI.isAGPR(*MRI, DstReg)) { if (Is64Bit || !isInlineConstant(Imm)) @@ -7366,14 +7367,25 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist, const DebugLoc &DL = Inst.getDebugLoc(); Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); Register NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); - BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg) - .addImm(16) - .add(Inst.getOperand(1)); - BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst) - .addImm(0) // src0_modifiers - .addReg(TmpReg) - .addImm(0) // clamp - .addImm(0); // omod + if (ST.useRealTrue16Insts()) { + BuildMI(*MBB, Inst, DL, get(AMDGPU::COPY), TmpReg) + .add(Inst.getOperand(1)); + BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst) + .addImm(0) // src0_modifiers + .addReg(TmpReg, 0, AMDGPU::hi16) + .addImm(0) // clamp + .addImm(0) // omod + .addImm(0); // op_sel0 + } else { + BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg) + .addImm(16) + .add(Inst.getOperand(1)); + BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst) + .addImm(0) // src0_modifiers + .addReg(TmpReg) + .addImm(0) // clamp + .addImm(0); // omod + } MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst); addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist); diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td index 087ca1f954464de232f8a20ca49dd1a795ca5a1c..42a1ffb8a26d4ad296e9733b35d5d070cb3e6851 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -2149,6 +2149,8 @@ class getAsmVOP3P { @@ -2733,6 +2736,7 @@ def VOP_F32_F32_F16_F16 : VOPProfile <[f32, f32, f16, f16]>; def VOP_F32_F32_F32_F32 : VOPProfile <[f32, f32, f32, f32]>; def VOP_F64_F64_F64_F64 : VOPProfile <[f64, f64, f64, f64]>; def VOP_I32_I32_I32_I32 : VOPProfile <[i32, i32, i32, i32]>; +def VOP_I32_I32_I32_I16 : VOPProfile <[i32, i32, i32, i16]>; def VOP_I64_I32_I32_I64 : VOPProfile <[i64, i32, i32, i64]>; def VOP_I32_F32_I32_I32 : VOPProfile <[i32, f32, i32, i32]>; def VOP_I64_I64_I32_I64 : VOPProfile <[i64, i64, i32, i64]>; diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index 8073aca7f197fb690da30a620f374b51b6a03200..faa0b6d6c3f506327558c46988b715ff55cd1a49 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -1094,7 +1094,7 @@ def : Pat < // VOP1 Patterns //===----------------------------------------------------------------------===// -multiclass f16_fp_Pats { +multiclass f16_to_fp_Pats { // f16_to_fp patterns def : GCNPat < (f32 (any_f16_to_fp i32:$src0)), @@ -1121,25 +1121,42 @@ multiclass f16_fp_Pats; + // fp_to_fp16 patterns def : GCNPat < - (f64 (any_fpextend f16:$src)), - (V_CVT_F64_F32_e32 (cvt_f32_f16_inst_e64 SRCMODS.NONE, $src)) + (i32 (AMDGPUfp_to_f16 (f32 (VOP3Mods f32:$src0, i32:$src0_modifiers)))), + (cvt_f16_f32_inst_e64 $src0_modifiers, f32:$src0) >; - // fp_to_fp16 patterns + // This is only used on targets without half support + // TODO: Introduce strict variant of AMDGPUfp_to_f16 and share custom lowering def : GCNPat < - (i32 (AMDGPUfp_to_f16 (f32 (VOP3Mods f32:$src0, i32:$src0_modifiers)))), + (i32 (strict_fp_to_f16 (f32 (VOP3Mods f32:$src0, i32:$src0_modifiers)))), (cvt_f16_f32_inst_e64 $src0_modifiers, f32:$src0) >; +} + +let True16Predicate = NotHasTrue16BitInsts in +defm : f16_to_fp_Pats; + +let True16Predicate = UseFakeTrue16Insts in +defm : 
f16_to_fp_Pats; + +multiclass f16_fp_Pats { + def : GCNPat < + (f64 (any_fpextend f16:$src)), + (V_CVT_F64_F32_e32 (cvt_f32_f16_inst_e64 SRCMODS.NONE, $src)) + >; def : GCNPat < (i32 (fp_to_sint f16:$src)), - (V_CVT_I32_F32_e32 (cvt_f32_f16_inst_e64 SRCMODS.NONE, VSrc_b32:$src)) + (V_CVT_I32_F32_e32 (cvt_f32_f16_inst_e64 SRCMODS.NONE, VSrc:$src)) >; def : GCNPat < (i32 (fp_to_uint f16:$src)), - (V_CVT_U32_F32_e32 (cvt_f32_f16_inst_e64 SRCMODS.NONE, VSrc_b32:$src)) + (V_CVT_U32_F32_e32 (cvt_f32_f16_inst_e64 SRCMODS.NONE, VSrc:$src)) >; def : GCNPat < @@ -1151,20 +1168,16 @@ multiclass f16_fp_Pats; - - // This is only used on targets without half support - // TODO: Introduce strict variant of AMDGPUfp_to_f16 and share custom lowering - def : GCNPat < - (i32 (strict_fp_to_f16 (f32 (VOP3Mods f32:$src0, i32:$src0_modifiers)))), - (cvt_f16_f32_inst_e64 $src0_modifiers, f32:$src0) - >; } let True16Predicate = NotHasTrue16BitInsts in -defm : f16_fp_Pats; +defm : f16_fp_Pats; + +let True16Predicate = UseRealTrue16Insts in +defm : f16_fp_Pats; let True16Predicate = UseFakeTrue16Insts in -defm : f16_fp_Pats; +defm : f16_fp_Pats; //===----------------------------------------------------------------------===// // VOP2 Patterns @@ -2774,16 +2787,27 @@ def : GCNPat < SSrc_i1:$src)) >; -let SubtargetPredicate = HasTrue16BitInsts in +let True16Predicate = UseRealTrue16Insts in def : GCNPat < (f16 (sint_to_fp i1:$src)), - (V_CVT_F16_F32_fake16_e32 ( - V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0), + (V_CVT_F16_F32_t16_e64 /*src0_modifiers*/ 0, + (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0), /*src1mod*/(i32 0), /*src1*/(i32 CONST.FP32_NEG_ONE), - SSrc_i1:$src)) + SSrc_i1:$src), + /*clamp*/ 0, /*omod*/ 0, /*op_sel*/ 0) >; -let SubtargetPredicate = NotHasTrue16BitInsts in +let True16Predicate = UseFakeTrue16Insts in +def : GCNPat < + (f16 (sint_to_fp i1:$src)), + (V_CVT_F16_F32_fake16_e64 /*src0_modifiers*/ 0, + (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0), + /*src1mod*/(i32 0), /*src1*/(i32 CONST.FP32_NEG_ONE), + SSrc_i1:$src), + /*clamp*/ 0, /*omod*/ 0) +>; + +let True16Predicate = NotHasTrue16BitInsts in def : GCNPat < (f16 (uint_to_fp i1:$src)), (V_CVT_F16_F32_e32 ( @@ -2791,13 +2815,25 @@ def : GCNPat < /*src1mod*/(i32 0), /*src1*/(i32 CONST.FP32_ONE), SSrc_i1:$src)) >; -let SubtargetPredicate = HasTrue16BitInsts in + +let True16Predicate = UseRealTrue16Insts in def : GCNPat < (f16 (uint_to_fp i1:$src)), - (V_CVT_F16_F32_fake16_e32 ( - V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0), + (V_CVT_F16_F32_t16_e64 /*src0_modifiers*/ 0, + (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0), /*src1mod*/(i32 0), /*src1*/(i32 CONST.FP32_ONE), - SSrc_i1:$src)) + SSrc_i1:$src), + /*clamp*/ 0, /*omod*/ 0, /*op_sel*/ 0) +>; + +let True16Predicate = UseFakeTrue16Insts in +def : GCNPat < + (f16 (uint_to_fp i1:$src)), + (V_CVT_F16_F32_fake16_e64 /*src0_modifiers*/ 0, + (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0), + /*src1mod*/(i32 0), /*src1*/(i32 CONST.FP32_ONE), + SSrc_i1:$src), + /*clamp*/ 0, /*omod*/ 0) >; def : GCNPat < diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp index 20d48aa57adbdfa84b957d8b24e62804df61eba4..8de16974a3e79b64184d6532f89aab0c7fe340fd 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -3852,12 +3852,11 @@ SIRegisterInfo::getSubRegAlignmentNumBits(const TargetRegisterClass *RC, return 0; } -SmallVector -SIRegisterInfo::getVRegFlagsOfReg(Register 
Reg, - const MachineFunction &MF) const { - SmallVector RegFlags; - const SIMachineFunctionInfo *FuncInfo = MF.getInfo(); - if (FuncInfo->checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG)) - RegFlags.push_back("WWM_REG"); - return RegFlags; +unsigned +SIRegisterInfo::getNumUsedPhysRegs(const MachineRegisterInfo &MRI, + const TargetRegisterClass &RC) const { + for (MCPhysReg Reg : reverse(RC.getRegisters())) + if (MRI.isPhysRegUsed(Reg)) + return getHWRegIndex(Reg) + 1; + return 0; } diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h index fe0b66f75bbaa229b5a2e27eb5e846da65fa7566..e12a41371c7fd7882ef7f72729a299266d1658bd 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h @@ -458,13 +458,10 @@ public: unsigned getSubRegAlignmentNumBits(const TargetRegisterClass *RC, unsigned SubReg) const; - std::optional getVRegFlagValue(StringRef Name) const override { - return Name == "WWM_REG" ? AMDGPU::VirtRegFlag::WWM_REG - : std::optional{}; - } - - SmallVector - getVRegFlagsOfReg(Register Reg, const MachineFunction &MF) const override; + // \returns the number of registers of the given \p RC used in the function. + // Registers used by called functions are not counted. + unsigned getNumUsedPhysRegs(const MachineRegisterInfo &MRI, + const TargetRegisterClass &RC) const; }; namespace AMDGPU { diff --git a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp index 78267d402b6c9e430a4f08ac86f967a72374cf3c..f0b0e378ad668daf9b2438ad452c1a50230e9964 100644 --- a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp +++ b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp @@ -213,12 +213,12 @@ static unsigned canModifyToInlineImmOp32(const SIInstrInfo *TII, // that SCC is not live as S_NOT_B32 clobbers it. It's probably not worth // it, as the reasonable values are already covered by s_movk_i32. ModifiedImm = ~SrcImm; - if (TII->isInlineConstant(APInt(32, ModifiedImm))) + if (TII->isInlineConstant(APInt(32, ModifiedImm, true))) return AMDGPU::V_NOT_B32_e32; } ModifiedImm = reverseBits(SrcImm); - if (TII->isInlineConstant(APInt(32, ModifiedImm))) + if (TII->isInlineConstant(APInt(32, ModifiedImm, true))) return Scalar ?
AMDGPU::S_BREV_B32 : AMDGPU::V_BFREV_B32_e32; return 0; diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td index be98d201a64a70e59f7f19a1e950786d349d7841..701aeda82c91ed33289cde39247df403509fb84e 100644 --- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td @@ -503,7 +503,7 @@ let FPDPRounding = 1 in { defm V_FRACT_F16 : VOP1Inst_t16 <"v_fract_f16", VOP_F16_F16, AMDGPUfract>; } // End FPDPRounding = 1 -let OtherPredicates = [Has16BitInsts, NotHasTrue16BitInsts] in { +let OtherPredicates = [Has16BitInsts], True16Predicate = NotHasTrue16BitInsts in { def : GCNPat< (f32 (f16_to_fp i16:$src)), (V_CVT_F32_F16_e32 $src) @@ -513,7 +513,7 @@ def : GCNPat< (V_CVT_F16_F32_e32 $src) >; } -let OtherPredicates = [HasTrue16BitInsts] in { +let True16Predicate = UseRealTrue16Insts in { def : GCNPat< (f32 (f16_to_fp i16:$src)), (V_CVT_F32_F16_t16_e32 $src) @@ -523,6 +523,16 @@ def : GCNPat< (V_CVT_F16_F32_t16_e32 $src) >; } +let True16Predicate = UseFakeTrue16Insts in { +def : GCNPat< + (f32 (f16_to_fp i16:$src)), + (V_CVT_F32_F16_fake16_e32 $src) +>; +def : GCNPat< + (i16 (AMDGPUfp_to_f16 f32:$src)), + (V_CVT_F16_F32_fake16_e32 $src) +>; +} def VOP_SWAP_I32 : VOPProfile<[i32, i32, untyped, untyped]> { let Outs32 = (outs VGPR_32:$vdst, VRegSrc_32:$vdst1); @@ -1417,15 +1427,14 @@ def : GCNPat < } // End OtherPredicates = [isGFX8Plus, p] -let OtherPredicates = [UseFakeTrue16Insts] in { +let True16Predicate = UseFakeTrue16Insts in { def : GCNPat< (i32 (DivergentUnaryFrag i16:$src)), (COPY $src) >; -} // End OtherPredicates = [UseFakeTrue16Insts] - +} // End True16Predicate = UseFakeTrue16Insts -let OtherPredicates = [UseRealTrue16Insts] in { +let True16Predicate = UseRealTrue16Insts in { def : GCNPat< (i32 (UniformUnaryFrag (i16 SReg_32:$src))), (COPY $src) diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td index 639f9189cbe7e34c2b8f1aeec25b272dbd497420..e83ea57c61df13dbd47520c1dcb0114ebcbc29d9 100644 --- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td @@ -1664,8 +1664,8 @@ multiclass VOP3Only_Realtriple_gfx11_gfx12 op> : VOP3Only_Realtriple, VOP3Only_Realtriple; multiclass VOP3Only_Realtriple_t16_gfx11_gfx12 op, string asmName, string OpName = NAME> : - VOP3Only_Realtriple_t16, - VOP3Only_Realtriple_t16; + VOP3_Realtriple_t16_gfx11, + VOP3_Realtriple_t16_gfx12; multiclass VOP3Only_Realtriple_t16_and_fake16_gfx11_gfx12 op, string asmName, string OpName = NAME> { defm OpName#"_t16": VOP3Only_Realtriple_t16_gfx11_gfx12; diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td index 78ca7a2f258cb34ea2203455822e37c728e0c608..34ecdb56e8689d63d0d705fbff2e02495fa23bc9 100644 --- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td @@ -569,16 +569,10 @@ def VOP3_CVT_SR_F8_F32_Profile : VOP3_Profile, getAsmVOP3OpSel<3, HasClamp, HasOMod, HasSrc0FloatMods, HasSrc1FloatMods, HasSrc2FloatMods>.ret); - let AsmVOP3DPP16 = !subst(", $src2_modifiers", "", - getAsmVOP3DPP16.ret>.ret); - let AsmVOP3DPP8 = !subst(", $src2_modifiers", "", - getAsmVOP3DPP8.ret>.ret); + let AsmVOP3Base = !subst(", $src2_modifiers", "", + getAsmVOP3Base.ret); } class VOP3_CVT_SR_F8_ByteSel_Profile : @@ -636,8 +630,8 @@ let SubtargetPredicate = isGFX12Plus, ReadsModeReg = 0 in { defm V_MAXIMUM3_F16 : VOP3Inst <"v_maximum3_f16", VOP3_Profile, AMDGPUfmaximum3>; } // End 
SubtargetPredicate = isGFX12Plus, ReadsModeReg = 0 -defm V_ADD_I16 : VOP3Inst <"v_add_i16", VOP3_Profile>; -defm V_SUB_I16 : VOP3Inst <"v_sub_i16", VOP3_Profile>; +defm V_ADD_I16 : VOP3Inst_t16 <"v_add_i16", VOP_I16_I16_I16>; +defm V_SUB_I16 : VOP3Inst_t16 <"v_sub_i16", VOP_I16_I16_I16>; defm V_MAD_U32_U16 : VOP3Inst <"v_mad_u32_u16", VOP3_Profile>; defm V_MAD_I32_I16 : VOP3Inst <"v_mad_i32_i16", VOP3_Profile>; @@ -752,6 +746,8 @@ def : GCNPat<(DivergentBinFrag (or_oneuse i64:$src0, i64:$src1), i64:$src2), (i32 (EXTRACT_SUBREG $src1, sub1)), (i32 (EXTRACT_SUBREG $src2, sub1))), sub1)>; +} // End SubtargetPredicate = isGFX9Plus + // FIXME: Probably should hardcode clamp bit in pseudo and avoid this. class OpSelBinOpClampPat : GCNPat< @@ -760,9 +756,14 @@ class OpSelBinOpClampPat; -def : OpSelBinOpClampPat; -def : OpSelBinOpClampPat; -} // End SubtargetPredicate = isGFX9Plus +let SubtargetPredicate = isGFX9Plus, True16Predicate = NotHasTrue16BitInsts in { + def : OpSelBinOpClampPat; + def : OpSelBinOpClampPat; +} // End SubtargetPredicate = isGFX9Plus, True16Predicate = NotHasTrue16BitInsts +let True16Predicate = UseFakeTrue16Insts in { + def : OpSelBinOpClampPat; + def : OpSelBinOpClampPat; +} // End True16Predicate = UseFakeTrue16Insts multiclass IMAD32_Pats { def : GCNPat < @@ -871,21 +872,31 @@ let SubtargetPredicate = isGFX10Plus in { def : PermlanePat; } - defm V_ADD_NC_U16 : VOP3Inst <"v_add_nc_u16", VOP3_Profile, add>; - defm V_SUB_NC_U16 : VOP3Inst <"v_sub_nc_u16", VOP3_Profile, sub>; - - def : OpSelBinOpClampPat; - def : OpSelBinOpClampPat; - - // Undo sub x, c -> add x, -c canonicalization since c is more likely - // an inline immediate than -c. - def : GCNPat< - (add i16:$src0, (i16 NegSubInlineIntConst16:$src1)), - (V_SUB_NC_U16_e64 0, VSrc_b16:$src0, 0, NegSubInlineIntConst16:$src1, 0, 0) - >; + defm V_ADD_NC_U16 : VOP3Inst_t16 <"v_add_nc_u16", VOP_I16_I16_I16, add>; + defm V_SUB_NC_U16 : VOP3Inst_t16 <"v_sub_nc_u16", VOP_I16_I16_I16, sub>; } // End SubtargetPredicate = isGFX10Plus +let True16Predicate = NotHasTrue16BitInsts, SubtargetPredicate = isGFX10Plus in { + def : OpSelBinOpClampPat; + def : OpSelBinOpClampPat; + // Undo sub x, c -> add x, -c canonicalization since c is more likely + // an inline immediate than -c. 
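+ // Illustration (not part of this patch): the integer inline-immediate range + // here is -16..64, so for c = 48 the sub form keeps a free inline operand, + // while the canonical add x, -48 would need an extra 32-bit literal.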
+ def : GCNPat< + (add i16:$src0, (i16 NegSubInlineIntConst16:$src1)), + (V_SUB_NC_U16_e64 0, VSrc_b16:$src0, 0, NegSubInlineIntConst16:$src1, 0, 0) + >; +} // End True16Predicate = NotHasTrue16BitInsts, SubtargetPredicate = isGFX10Plus + +let True16Predicate = UseFakeTrue16Insts in { + def : OpSelBinOpClampPat; + def : OpSelBinOpClampPat; + def : GCNPat< + (add i16:$src0, (i16 NegSubInlineIntConst16:$src1)), + (V_SUB_NC_U16_fake16_e64 0, VSrc_b16:$src0, 0, NegSubInlineIntConst16:$src1, 0, 0) + >; +} // End True16Predicate = UseFakeTrue16Insts + let SubtargetPredicate = isGFX12Plus in { let Constraints = "$vdst = $vdst_in", DisableEncoding="$vdst_in" in { defm V_PERMLANE16_VAR_B32 : VOP3Inst<"v_permlane16_var_b32", VOP3_PERMLANE_VAR_Profile>; @@ -1104,6 +1115,17 @@ multiclass VOP3_Realtriple_with_name_gfx11_gfx12 op, string opName, multiclass VOP3Dot_Realtriple_gfx11_gfx12 op> : VOP3Dot_Realtriple, VOP3Dot_Realtriple; +multiclass VOP3_Realtriple_t16_gfx11_gfx12 op, string asmName, string opName = NAME, + string pseudo_mnemonic = "", bit isSingle = 0> : + VOP3_Realtriple_with_name, + VOP3_Realtriple_with_name; + +multiclass VOP3_Realtriple_t16_and_fake16_gfx11_gfx12 op, string asmName, string opName = NAME, + string pseudo_mnemonic = "", bit isSingle = 0> { + defm opName#"_t16": VOP3_Realtriple_t16_gfx11_gfx12; + defm opName#"_fake16": VOP3_Realtriple_t16_gfx11_gfx12; +} + multiclass VOP3be_Real_gfx11_gfx12 op, string opName, string asmName> : VOP3be_Real, VOP3be_Real; @@ -1189,8 +1211,8 @@ defm V_DIV_SCALE_F32 : VOP3be_Real_gfx11_gfx12<0x2fc, "V_DIV_SCALE_F32", " defm V_DIV_SCALE_F64 : VOP3be_Real_gfx11_gfx12<0x2fd, "V_DIV_SCALE_F64", "v_div_scale_f64">; defm V_MAD_U64_U32_gfx11 : VOP3be_Real_gfx11<0x2fe, "V_MAD_U64_U32_gfx11", "v_mad_u64_u32">; defm V_MAD_I64_I32_gfx11 : VOP3be_Real_gfx11<0x2ff, "V_MAD_I64_I32_gfx11", "v_mad_i64_i32">; -defm V_ADD_NC_U16 : VOP3Only_Realtriple_gfx11_gfx12<0x303>; -defm V_SUB_NC_U16 : VOP3Only_Realtriple_gfx11_gfx12<0x304>; +defm V_ADD_NC_U16 : VOP3Only_Realtriple_t16_and_fake16_gfx11_gfx12<0x303, "v_add_nc_u16">; +defm V_SUB_NC_U16 : VOP3Only_Realtriple_t16_and_fake16_gfx11_gfx12<0x304, "v_sub_nc_u16">; defm V_MUL_LO_U16 : VOP3Only_Realtriple_t16_and_fake16_gfx11_gfx12<0x305, "v_mul_lo_u16">; defm V_CVT_PK_I16_F32 : VOP3_Realtriple_gfx11_gfx12<0x306>; defm V_CVT_PK_U16_F32 : VOP3_Realtriple_gfx11_gfx12<0x307>; @@ -1198,8 +1220,8 @@ defm V_MAX_U16 : VOP3Only_Realtriple_t16_and_fake16_gfx11_gfx12<0x30 defm V_MAX_I16 : VOP3Only_Realtriple_t16_and_fake16_gfx11_gfx12<0x30a, "v_max_i16">; defm V_MIN_U16 : VOP3Only_Realtriple_t16_and_fake16_gfx11_gfx12<0x30b, "v_min_u16">; defm V_MIN_I16 : VOP3Only_Realtriple_t16_and_fake16_gfx11_gfx12<0x30c, "v_min_i16">; -defm V_ADD_NC_I16 : VOP3_Realtriple_with_name_gfx11_gfx12<0x30d, "V_ADD_I16", "v_add_nc_i16">; -defm V_SUB_NC_I16 : VOP3_Realtriple_with_name_gfx11_gfx12<0x30e, "V_SUB_I16", "v_sub_nc_i16">; +defm V_ADD_NC_I16 : VOP3_Realtriple_t16_and_fake16_gfx11_gfx12<0x30d, "v_add_nc_i16", "V_ADD_I16">; +defm V_SUB_NC_I16 : VOP3_Realtriple_t16_and_fake16_gfx11_gfx12<0x30e, "v_sub_nc_i16", "V_SUB_I16">; defm V_PACK_B32_F16 : VOP3_Realtriple_gfx11_gfx12<0x311>; defm V_CVT_PK_NORM_I16_F16 : VOP3_Realtriple_with_name_gfx11_gfx12<0x312, "V_CVT_PKNORM_I16_F16" , "v_cvt_pk_norm_i16_f16" >; defm V_CVT_PK_NORM_U16_F16 : VOP3_Realtriple_with_name_gfx11_gfx12<0x313, "V_CVT_PKNORM_U16_F16" , "v_cvt_pk_norm_u16_f16" >; diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td index 
05a7d907d237ae806a489bea1d073cfc1173872b..aab5dc7465d938e3da4dbeda0980b3c53a3fe1aa 100644 --- a/llvm/lib/Target/AMDGPU/VOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td @@ -111,7 +111,7 @@ class VOP3_Pseudo pattern = [], bit HasFP8DstByteSel = P.HasFP8DstByteSel; - let AsmOperands = !if(isVop3OpSel, + let AsmOperands = !if(!and(!not(P.IsTrue16), isVop3OpSel), P.AsmVOP3OpSel, !if(!and(isVOP3P, P.IsPacked), P.AsmVOP3P, P.Asm64)); @@ -178,6 +178,7 @@ class VOP3_Real : Enc64 { let Inst{63} = !if(P.HasSrc2Mods, src2_modifiers{0}, 0); } +// To avoid having different version of every type of operand depending on if +// they are part of a True16 instruction or not, the operand encoding should be +// the same for SGPR, imm, and VGPR_32 whether the instruction is True16 or not. +class VOP3a_t16 : Enc64 { + bits<11> vdst; + bits<4> src0_modifiers; + bits<11> src0; + bits<3> src1_modifiers; + bits<11> src1; + bits<3> src2_modifiers; + bits<11> src2; + bits<1> clamp; + bits<2> omod; + + let Inst{7-0} = !if(P.EmitDst, vdst{7-0}, 0); + let Inst{8} = !if(P.HasSrc0Mods, src0_modifiers{1}, 0); + let Inst{9} = !if(P.HasSrc1Mods, src1_modifiers{1}, 0); + let Inst{10} = !if(P.HasSrc2Mods, src2_modifiers{1}, 0); + // 16-bit select fields which can be interpreted as OpSel or hi/lo suffix + let Inst{11} = !if(P.HasSrc0Mods, src0_modifiers{2}, 0); + let Inst{12} = !if(P.HasSrc1Mods, src1_modifiers{2}, 0); + let Inst{13} = !if(P.HasSrc2Mods, src2_modifiers{2}, 0); + let Inst{14} = !if(!and(P.HasDst, P.HasSrc0Mods), src0_modifiers{3}, 0); + let Inst{15} = !if(P.HasClamp, clamp{0}, 0); + + let Inst{31-26} = 0x35; + let Inst{40-32} = !if(P.HasSrc0, src0{8-0}, 0); + let Inst{49-41} = !if(P.HasSrc1, src1{8-0}, 0); + let Inst{58-50} = !if(P.HasSrc2, src2{8-0}, 0); + let Inst{60-59} = !if(P.HasOMod, omod, 0); + let Inst{61} = !if(P.HasSrc0Mods, src0_modifiers{0}, 0); + let Inst{62} = !if(P.HasSrc1Mods, src1_modifiers{0}, 0); + let Inst{63} = !if(P.HasSrc2Mods, src2_modifiers{0}, 0); +} + class VOP3a_gfx6_gfx7 op, VOPProfile p> : VOP3a
{ let Inst{11} = !if(p.HasClamp, clamp{0}, 0); let Inst{25-17} = op; @@ -272,6 +308,10 @@ class VOP3e_gfx10 op, VOPProfile p> : VOP3a_gfx10 { class VOP3e_gfx11_gfx12 op, VOPProfile p> : VOP3e_gfx10; +class VOP3e_t16_gfx11_gfx12 op, VOPProfile p> : VOP3a_t16
{ + let Inst{25-16} = op; +} + class VOP3e_vi op, VOPProfile P> : VOP3a_vi { bits<8> vdst; let Inst{7-0} = !if(P.EmitDst, vdst{7-0}, 0); @@ -736,7 +776,12 @@ class VOP3_DPPe_Fields : VOP3_DPPe_Fields_Base { bits<8> src0; } +class VOP3_DPPe_Fields_t16 : VOP3_DPPe_Fields_Base { + bits<11> src0; +} + // Common refers to common between DPP and DPP8 +// Base refers to a shared base between T16 and regular instructions class VOP3_DPPe_Common_Base op, VOPProfile P> : Enc96 { bits<4> src0_modifiers; bits<3> src1_modifiers; @@ -748,7 +793,7 @@ class VOP3_DPPe_Common_Base op, VOPProfile P> : Enc96 { let Inst{8} = !if(P.HasSrc0Mods, src0_modifiers{1}, 0); let Inst{9} = !if(P.HasSrc1Mods, src1_modifiers{1}, 0); let Inst{10} = !if(P.HasSrc2Mods, src2_modifiers{1}, 0); - // OPSEL must be set such that the low result only uses low inputs, and the high result only uses high inputs. + // 16-bit select fields which can be interpreted as OpSel or hi/lo suffix let Inst{11} = !if(P.HasOpSel, !if(P.HasSrc0Mods, src0_modifiers{2}, 0), !if(P.IsFP8SrcByteSel, byte_sel{1}, ?)); let Inst{12} = !if(P.HasOpSel, !if(P.HasSrc1Mods, src1_modifiers{2}, 0), @@ -777,6 +822,16 @@ class VOP3_DPPe_Common op, VOPProfile P> : VOP3_DPPe_Common_Base let Inst{58-50} = !if(P.HasSrc2, src2, 0); } +class VOP3_DPPe_Common_t16 op, VOPProfile P> : VOP3_DPPe_Common_Base { + bits<11> vdst; + bits<11> src1; + bits<11> src2; + + let Inst{7-0} = !if(P.EmitDst, vdst{7-0}, 0); + let Inst{49-41} = !if(P.HasSrc1, src1{8-0}, 0); + let Inst{58-50} = !if(P.HasSrc2, src2{8-0}, 0); +} + class VOP3P_DPPe_Common_Base op, VOPProfile P> : Enc96 { bits<4> src0_modifiers; bits<4> src1_modifiers; @@ -786,6 +841,7 @@ class VOP3P_DPPe_Common_Base op, VOPProfile P> : Enc96 { let Inst{8} = !if(P.HasSrc0Mods, src0_modifiers{1}, 0); // neg_hi src0 let Inst{9} = !if(P.HasSrc1Mods, src1_modifiers{1}, 0); // neg_hi src1 let Inst{10} = !if(P.HasSrc2Mods, src2_modifiers{1}, 0); // neg_hi src2 + // OPSEL must be set such that the low result only uses low inputs, and the high result only uses high inputs. 
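+ // (For a True16 operand this is the .l/.h half suffix in the assembly + // syntax, i.e. it selects which 16-bit half of the 32-bit VGPR is accessed.)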
let Inst{11} = !if(!and(P.HasSrc0, P.HasOpSel), src0_modifiers{2}, 0); // op_sel(0) let Inst{12} = !if(!and(P.HasSrc1, P.HasOpSel), src1_modifiers{2}, 0); // op_sel(1) let Inst{13} = !if(!and(P.HasSrc2, P.HasOpSel), src2_modifiers{2}, 0); // op_sel(2) @@ -810,6 +866,16 @@ class VOP3P_DPPe_Common op, VOPProfile P> : VOP3P_DPPe_Common_Base op, VOPProfile P> : VOP3P_DPPe_Common_Base { + bits<11> vdst; + bits<11> src1; + bits<11> src2; + + let Inst{7-0} = vdst{7-0}; + let Inst{49-41} = !if(P.HasSrc1, src1{8-0}, 0); + let Inst{58-50} = !if(P.HasSrc2, src2{8-0}, 0); +} + class VOP_DPP_Pseudo pattern=[], dag Ins = P.InsDPP, string asmOps = P.AsmDPP> : VOP_Pseudo { @@ -870,6 +936,7 @@ class VOP_DPP_Real : // Copy relevant pseudo op flags let isConvergent = ps.isConvergent; let SubtargetPredicate = ps.SubtargetPredicate; + let True16Predicate = ps.True16Predicate; let AssemblerPredicate = ps.AssemblerPredicate; let OtherPredicates = ps.OtherPredicates; let AsmMatchConverter = ps.AsmMatchConverter; @@ -928,11 +995,29 @@ class VOP3_DPP_Base op, VOPProfile P, bit IsDPP16> : + VOP3_DPPe_Common, + VOP3_DPPe_Fields { + + let Inst{40-32} = 0xfa; + let Inst{71-64} = !if(P.HasSrc0, src0{7-0}, 0); + let Inst{80-72} = dpp_ctrl; + let Inst{82} = !if(IsDPP16, fi, ?); + let Inst{83} = bound_ctrl; + + // Inst{87-84} ignored by hw + let Inst{91-88} = bank_mask; + let Inst{95-92} = row_mask; +} + class VOP3_DPP op, string OpName, VOPProfile P, bit IsDPP16, dag InsDPP = !if(IsDPP16, P.InsVOP3DPP16, P.InsVOP3DPP), string AsmDPP = !if(IsDPP16, P.AsmVOP3DPP16, P.AsmVOP3DPP)> : - VOP3_DPP_Base, VOP3_DPPe_Common, - VOP3_DPPe_Fields { + VOP3_DPP_Base, VOP3_DPP_Enc; + +class VOP3_DPP_Enc_t16 op, VOPProfile P, bit IsDPP16 > + : VOP3_DPPe_Common_t16, + VOP3_DPPe_Fields_t16 { let Inst{40-32} = 0xfa; let Inst{71-64} = !if(P.HasSrc0, src0{7-0}, 0); @@ -945,6 +1030,13 @@ class VOP3_DPP op, string OpName, VOPProfile P, bit IsDPP16, let Inst{95-92} = row_mask; } +class VOP3_DPP_t16 op, string OpName, VOPProfile P, bit IsDPP16, + dag InsDPP = !if (IsDPP16, P.InsVOP3DPP16, P.InsVOP3DPP), + string AsmDPP = !if (IsDPP16, P.AsmVOP3DPP16, P.AsmVOP3DPP)> + : VOP3_DPP_Base, + VOP3_DPP_Enc_t16 { +} + class VOP3P_DPP op, string OpName, VOPProfile P, bit IsDPP16, dag InsDPP = !if(IsDPP16, P.InsVOP3DPP16, P.InsVOP3DPP), string AsmDPP = !if(IsDPP16, P.AsmVOP3DPP16, P.AsmVOP3DPP)> : @@ -979,6 +1071,12 @@ class VOP3_DPP8e_Fields { bits<9> fi; } +class VOP3_DPP8e_Fields_t16 { + bits<11> src0; + bits<24> dpp8; + bits<9> fi; +} + class VOP_DPP8_Base : InstSI { @@ -1011,16 +1109,28 @@ class VOP3_DPP8_Base : let Size = 12; } +class VOP3_DPP8_Enc op, VOPProfile P> : + VOP3_DPPe_Common, + VOP3_DPP8e_Fields { + let Inst{40-32} = fi; + let Inst{71-64} = !if(P.HasSrc0, src0{7-0}, 0); + let Inst{95-72} = dpp8{23-0}; +} class VOP3_DPP8 op, string OpName, VOPProfile P> : - VOP3_DPP8_Base, VOP3_DPPe_Common, - VOP3_DPP8e_Fields { + VOP3_DPP8_Base, VOP3_DPP8_Enc; +class VOP3_DPP8_Enc_t16 op, VOPProfile P> : + VOP3_DPPe_Common_t16, + VOP3_DPP8e_Fields_t16 { let Inst{40-32} = fi; let Inst{71-64} = !if(P.HasSrc0, src0{7-0}, 0); let Inst{95-72} = dpp8{23-0}; } +class VOP3_DPP8_t16 op, string OpName, VOPProfile P> : + VOP3_DPP8_Base, VOP3_DPP8_Enc_t16; + class VOP3P_DPP8 op, string OpName, VOPProfile P> : VOP3_DPP8_Base, VOP3P_DPPe_Common, VOP3_DPP8e_Fields { @@ -1273,6 +1383,30 @@ class VOP3_Profile : VOP3_Pr } +class VOP3_Profile_True16 : VOPProfile_True16
{ + let HasClamp = !if(Features.HasClamp, 1, P.HasClamp); + let HasOpSel = !if(Features.HasOpSel, 1, P.HasOpSel); + let IsMAI = !if(Features.IsMAI, 1, P.IsMAI); + let IsPacked = !if(Features.IsPacked, 1, P.IsPacked); + + let HasModifiers = + !if (Features.IsMAI, 0, + !or(Features.IsPacked, Features.HasOpSel, P.HasModifiers)); + let IsSingle = 1; +} + +class VOP3_Profile_Fake16 : VOPProfile_Fake16
{ + let HasClamp = !if(Features.HasClamp, 1, P.HasClamp); + let HasOpSel = !if(Features.HasOpSel, 1, P.HasOpSel); + let IsMAI = !if(Features.IsMAI, 1, P.IsMAI); + let IsPacked = !if(Features.IsPacked, 1, P.IsPacked); + + let HasModifiers = + !if (Features.IsMAI, 0, + !or(Features.IsPacked, Features.HasOpSel, P.HasModifiers)); + let IsSingle = 1; +} + // consistently gives instructions a _e64 suffix multiclass VOP3Inst_Pseudo_Wrapper pattern = [], bit VOP3Only = 0> { def _e64 : VOP3_Pseudo; @@ -1325,11 +1459,33 @@ multiclass VOP3PseudoScalarInst; } +multiclass VOP3Inst_t16_with_profiles { + let True16Predicate = NotHasTrue16BitInsts in { + defm NAME : VOP3Inst; + } + let True16Predicate = UseRealTrue16Insts in { + defm _t16 : VOP3Inst; + } + let True16Predicate = UseFakeTrue16Insts in { + defm _fake16 : VOP3Inst; + } +} + +multiclass VOP3Inst_t16 + : VOP3Inst_t16_with_profiles, + VOP3_Profile_True16, VOP3_Profile_Fake16, + node, node_t16>; + //===----------------------------------------------------------------------===// // VOP3 DPP //===----------------------------------------------------------------------===// -class Base_VOP3_DPP16 op, VOP_DPP_Pseudo ps, string opName = ps.OpName> +class VOP3_DPP16_Helper op, VOP_DPP_Pseudo ps, string opName = ps.OpName> : VOP3_DPP { let VOP3_OPSEL = ps.Pfl.HasOpSel; let IsDOT = ps.IsDOT; @@ -1342,17 +1498,43 @@ class Base_VOP3_DPP16 op, VOP_DPP_Pseudo ps, string opName = ps.OpName> let OtherPredicates = ps.OtherPredicates; } +class VOP3_DPP16_t16_Helper op, VOP_DPP_Pseudo ps, + string opName = ps.OpName> + : VOP3_DPP_t16 { + let VOP3_OPSEL = ps.Pfl.HasOpSel; + let IsDOT = ps.IsDOT; + let hasSideEffects = ps.hasSideEffects; + let Defs = ps.Defs; + let SchedRW = ps.SchedRW; + let Uses = ps.Uses; + let AssemblerPredicate = HasDPP16; + let SubtargetPredicate = HasDPP16; + let OtherPredicates = ps.OtherPredicates; +} + class VOP3_DPP16 op, VOP_DPP_Pseudo ps, int subtarget, string opName = ps.OpName> - : Base_VOP3_DPP16, SIMCInstr; + : VOP3_DPP16_Helper, SIMCInstr; + +class VOP3_DPP16_t16 op, VOP_DPP_Pseudo ps, int subtarget, + string opName = ps.OpName> + : VOP3_DPP16_t16_Helper, SIMCInstr; class VOP3_DPP16_Gen op, VOP_DPP_Pseudo ps, GFXGen Gen, - string opName = ps.OpName> : - VOP3_DPP16 { + string opName = ps.OpName> + : VOP3_DPP16 { let AssemblerPredicate = Gen.AssemblerPredicate; - let True16Predicate = !if(ps.Pfl.IsRealTrue16, UseRealTrue16Insts, NoTrue16Predicate); - let DecoderNamespace = Gen.DecoderNamespace# - !if(ps.Pfl.IsRealTrue16, "", "_FAKE16"); + let DecoderNamespace = Gen.DecoderNamespace; +} + +class VOP3_DPP16_Gen_t16 op, VOP_DPP_Pseudo ps, GFXGen Gen, + string opName = ps.OpName> + : VOP3_DPP16_t16 { + let True16Predicate = + !if (ps.Pfl.IsRealTrue16, UseRealTrue16Insts, NoTrue16Predicate); + let AssemblerPredicate = Gen.AssemblerPredicate; + let DecoderNamespace = + Gen.DecoderNamespace #!if (ps.Pfl.IsRealTrue16, "", "_FAKE16"); } class Base_VOP3_DPP8 op, VOP_Pseudo ps, string opName = ps.OpName> @@ -1366,11 +1548,25 @@ class Base_VOP3_DPP8 op, VOP_Pseudo ps, string opName = ps.OpName> let SubtargetPredicate = ps.SubtargetPredicate; let OtherPredicates = ps.OtherPredicates; + let True16Predicate = ps.True16Predicate; +} + +class Base_VOP3_DPP8_t16 op, VOP_Pseudo ps, string opName = ps.OpName> + : VOP3_DPP8_t16 { + let VOP3_OPSEL = ps.Pfl.HasOpSel; + let IsDOT = ps.IsDOT; + let hasSideEffects = ps.hasSideEffects; + let Defs = ps.Defs; + let SchedRW = ps.SchedRW; + let Uses = ps.Uses; + + let OtherPredicates = ps.OtherPredicates; + let 
True16Predicate = ps.True16Predicate; } class Base_VOP3b_DPP16 op, VOP_DPP_Pseudo ps, string opName = ps.OpName> - : Base_VOP3_DPP16 { + : VOP3_DPP16_Helper { bits<7> sdst; let Inst{14 - 8} = sdst; } @@ -1381,6 +1577,12 @@ class VOP3b_DPP8_Base op, VOP_Pseudo ps, string opName = ps.OpName> let Inst{14 - 8} = sdst; } +class VOP3b_DPP8_Base_t16 op, VOP_Pseudo ps, string opName = ps.OpName> + : Base_VOP3_DPP8 { + bits<8> sdst; + let Inst{14 - 8} = sdst{7-1}; +} + //===----------------------------------------------------------------------===// // VOP3 GFX11, GFX12 //===----------------------------------------------------------------------===// @@ -1420,10 +1622,11 @@ multiclass VOP3Dot_Real_Base op, string opName = NAME, } multiclass VOP3_Real_with_name op, string opName, - string asmName, bit isSingle = 0> { + string asmName, string pseudo_mnemonic = "", bit isSingle = 0> { defvar ps = !cast(opName#"_e64"); let AsmString = asmName # ps.AsmOperands, IsSingle = !or(isSingle, ps.Pfl.IsSingle) in { + // FIXME-TRUE16 support FP8 instructions properly if ps.Pfl.IsFP8SrcByteSel then { def _e64#Gen.Suffix : VOP3_Real_Gen, @@ -1432,17 +1635,27 @@ multiclass VOP3_Real_with_name op, string opName, def _e64#Gen.Suffix : VOP3_Real_Gen, VOP3FP8OpSel_dst_bytesel_gfx11_gfx12; - } else if ps.Pfl.HasOpSel then { - def _e64#Gen.Suffix : - VOP3_Real_Gen, - VOP3OpSel_gfx11_gfx12; } else { - def _e64#Gen.Suffix : - VOP3_Real_Gen, - VOP3e_gfx11_gfx12; + if ps.Pfl.IsRealTrue16 then { + def _e64#Gen.Suffix : + VOP3_Real_Gen, + VOP3e_t16_gfx11_gfx12; + } else { + if ps.Pfl.HasOpSel then { + def _e64#Gen.Suffix : + VOP3_Real_Gen, + VOP3OpSel_gfx11_gfx12; + } else { + def _e64#Gen.Suffix : + VOP3_Real_Gen, + VOP3e_gfx11_gfx12; + } + } } } - def Gen.Suffix#"_VOP3_alias" : LetDummies, AMDGPUMnemonicAlias { + def Gen.Suffix#"_VOP3_alias" : LetDummies, + AMDGPUMnemonicAlias { let AssemblerPredicate = Gen.AssemblerPredicate; } } @@ -1456,8 +1669,13 @@ multiclass VOP3_Real_No_Suffix op, string opName = NAME> { } multiclass VOP3_Real_dpp_Base op, string opName = NAME> { - def _e64_dpp#Gen.Suffix : - VOP3_DPP16_Gen(opName#"_e64"#"_dpp"), Gen>; + defvar ps = !cast(opName#"_e64"#"_dpp"); + if ps.Pfl.IsTrue16 then + def _e64_dpp#Gen.Suffix : + VOP3_DPP16_Gen_t16; + else + def _e64_dpp#Gen.Suffix : + VOP3_DPP16_Gen; } multiclass VOP3Dot_Real_dpp_Base op, string opName = NAME> { @@ -1552,18 +1770,14 @@ multiclass VOP3Only_Realtriple op> : VOP3_Realtriple; multiclass VOP3_Realtriple_with_name op, string opName, - string asmName, bit isSingle = 0> : - VOP3_Real_with_name, + string asmName, string pseudo_mnemonic = "", bit isSingle = 0> : + VOP3_Real_with_name, VOP3_Real_dpp_with_name, VOP3_Real_dpp8_with_name; multiclass VOP3Only_Realtriple_with_name op, string opName, string asmName> : - VOP3_Realtriple_with_name; - -multiclass VOP3Only_Realtriple_t16 op, string asmName, - string opName = NAME> - : VOP3Only_Realtriple_with_name; + VOP3_Realtriple_with_name; multiclass VOP3be_Realtriple< GFXGen Gen, bits<10> op, bit isSingle = 0, string opName = NAME, @@ -1579,6 +1793,16 @@ multiclass VOP3beOnly_Realtriple op> : // VOP3 GFX11 //===----------------------------------------------------------------------===// +// VOP1 and VOP2 depend on these triple defs + +multiclass VOP3_Realtriple_t16_gfx11 op, string asmName, string opName = NAME, + string pseudo_mnemonic = "", bit isSingle = 0> : + VOP3_Realtriple_with_name; + +multiclass VOP3Only_Realtriple_t16_gfx11 op, string asmName, + string opName = NAME, string pseudo_mnemonic = ""> + : 
VOP3_Realtriple_t16_gfx11; + multiclass VOP3be_Real_gfx11 op, string opName, string asmName, bit isSingle = 0> : VOP3be_Real; @@ -1591,10 +1815,6 @@ multiclass VOP3_Realtriple_gfx11 op, bit isSingle = 0, string opName = NAME> : VOP3_Realtriple; -multiclass VOP3Only_Realtriple_t16_gfx11 op, string asmName, - string opName = NAME> - : VOP3Only_Realtriple_with_name; - //===----------------------------------------------------------------------===// // VOP3 GFX12 //===----------------------------------------------------------------------===// @@ -1610,6 +1830,16 @@ multiclass VOP3Only_Real_Base_gfx12 op> : multiclass VOP3Only_Realtriple_t16_gfx12 op> : VOP3Only_Realtriple; +multiclass VOP3_Realtriple_t16_gfx12 op, string asmName, string opName = NAME, + string pseudo_mnemonic = "", bit isSingle = 0> : + VOP3_Realtriple_with_name; + +multiclass VOP3_Realtriple_t16_and_fake16_gfx12 op, string asmName, string opName = NAME, + string pseudo_mnemonic = "", bit isSingle = 0> { + defm opName#"_t16":VOP3_Realtriple_t16_gfx12; + defm opName#"_fake16":VOP3_Realtriple_t16_gfx12; +} + multiclass VOP3be_Real_with_name_gfx12 op, string opName, string asmName, bit isSingle = 0> { defvar ps = !cast(opName#"_e64"); @@ -1624,18 +1854,14 @@ multiclass VOP3be_Real_with_name_gfx12 op, string opName, } multiclass VOP3_Realtriple_with_name_gfx12 op, string opName, - string asmName, bit isSingle = 0> : - VOP3_Realtriple_with_name; + string asmName, string pseudo_mnemonic = "", bit isSingle = 0> : + VOP3_Realtriple_with_name; multiclass VOP3Only_Realtriple_with_name_gfx11_gfx12 op, string opName, string asmName> : VOP3Only_Realtriple_with_name, VOP3Only_Realtriple_with_name; -multiclass VOP3Only_Realtriple_with_name_t16_gfx12 op, string asmName, - string opName = NAME> - : VOP3Only_Realtriple_with_name; - //===----------------------------------------------------------------------===// include "VOPCInstructions.td" @@ -1705,4 +1931,4 @@ def VOPTrue16Table : GenericTable { let PrimaryKey = ["Opcode"]; let PrimaryKeyName = "getTrue16OpcodeHelper"; -} \ No newline at end of file +} diff --git a/llvm/lib/Target/ARC/ARCFrameLowering.cpp b/llvm/lib/Target/ARC/ARCFrameLowering.cpp index 1227fae13211a80441ee0ae0e5ae152add5372f4..472f1c13f362e53fc54ef192005d6d38c15c27d8 100644 --- a/llvm/lib/Target/ARC/ARCFrameLowering.cpp +++ b/llvm/lib/Target/ARC/ARCFrameLowering.cpp @@ -487,7 +487,7 @@ MachineBasicBlock::iterator ARCFrameLowering::eliminateCallFramePseudoInstr( return MBB.erase(I); } -bool ARCFrameLowering::hasFP(const MachineFunction &MF) const { +bool ARCFrameLowering::hasFPImpl(const MachineFunction &MF) const { const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo(); bool HasFP = MF.getTarget().Options.DisableFramePointerElim(MF) || MF.getFrameInfo().hasVarSizedObjects() || diff --git a/llvm/lib/Target/ARC/ARCFrameLowering.h b/llvm/lib/Target/ARC/ARCFrameLowering.h index 9951a09842c57fe96ff6b9a15067ed2591ad5ad5..089326fe32057ec13221f0bb12dc73c3d3465c90 100644 --- a/llvm/lib/Target/ARC/ARCFrameLowering.h +++ b/llvm/lib/Target/ARC/ARCFrameLowering.h @@ -54,8 +54,6 @@ public: void processFunctionBeforeFrameFinalized(MachineFunction &MF, RegScavenger *RS) const override; - bool hasFP(const MachineFunction &MF) const override; - MachineBasicBlock::iterator eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const override; @@ -64,6 +62,9 @@ public: llvm::MachineFunction &, const llvm::TargetRegisterInfo *, std::vector &) const override; +protected: + bool 
hasFPImpl(const MachineFunction &MF) const override; + private: void adjustStackToMatchRecords(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, diff --git a/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp b/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp index 3f28ce8ca4b559a9e115a0aba44301b44554eb2e..aad305cce0396198efa8b5425ff7ffd888d766fb 100644 --- a/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp +++ b/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp @@ -116,9 +116,12 @@ ARMBaseRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { return CSR_iOS_SaveList; if (PushPopSplit == ARMSubtarget::SplitR7) - return STI.createAAPCSFrameChain() ? CSR_AAPCS_SplitPush_SaveList + return STI.createAAPCSFrameChain() ? CSR_AAPCS_SplitPush_R7_SaveList : CSR_ATPCS_SplitPush_SaveList; + if (PushPopSplit == ARMSubtarget::SplitR11AAPCSSignRA) + return CSR_AAPCS_SplitPush_R11_SaveList; + return CSR_AAPCS_SaveList; } diff --git a/llvm/lib/Target/ARM/ARMCallingConv.td b/llvm/lib/Target/ARM/ARMCallingConv.td index d14424c2decac3057a2a0c0e63e0385698a57206..27f175a700336689ced368fafddb17c2a49c51a2 100644 --- a/llvm/lib/Target/ARM/ARMCallingConv.td +++ b/llvm/lib/Target/ARM/ARMCallingConv.td @@ -301,14 +301,17 @@ def CSR_ATPCS_SplitPush_SwiftError : CalleeSavedRegs<(sub CSR_ATPCS_SplitPush, def CSR_ATPCS_SplitPush_SwiftTail : CalleeSavedRegs<(sub CSR_ATPCS_SplitPush, R10)>; -// When enforcing an AAPCS compliant frame chain, R11 is used as the frame -// pointer even for Thumb targets, where split pushes are necessary. -// This AAPCS alternative makes sure the frame index slots match the push -// order in that case. -def CSR_AAPCS_SplitPush : CalleeSavedRegs<(add LR, R11, - R7, R6, R5, R4, - R10, R9, R8, - (sequence "D%u", 15, 8))>; +// Sometimes we need to split the push of the callee-saved GPRs into two +// regions, to ensure that the frame chain record is set up correctly. These +// list the callee-saved registers in the order they end up on the stack, which +// depends on whether the frame pointer is r7 or r11. +def CSR_AAPCS_SplitPush_R11 : CalleeSavedRegs<(add R10, R9, R8, R7, R6, R5, R4, + LR, R11, + (sequence "D%u", 15, 8))>; +def CSR_AAPCS_SplitPush_R7 : CalleeSavedRegs<(add LR, R11, + R7, R6, R5, R4, + R10, R9, R8, + (sequence "D%u", 15, 8))>; // Constructors and destructors return 'this' in the ARM C++ ABI; since 'this' // and the pointer return value are both passed in R0 in these cases, this can diff --git a/llvm/lib/Target/ARM/ARMFeatures.td b/llvm/lib/Target/ARM/ARMFeatures.td index 3a2188adbec33b6c5f2e1605f73257887ee75226..bb437698296ce823be1c6898fe54c204473a5abd 100644 --- a/llvm/lib/Target/ARM/ARMFeatures.td +++ b/llvm/lib/Target/ARM/ARMFeatures.td @@ -398,6 +398,13 @@ def FeatureAvoidPartialCPSR : SubtargetFeature<"avoid-partial-cpsr", "AvoidCPSRPartialUpdate", "true", "Avoid CPSR partial update for OOO execution">; +/// FeatureAvoidMULS - If true, codegen will avoid using the MULS instruction, +/// preferring the Thumb2 MUL, which doesn't set flags. +def FeatureAvoidMULS : SubtargetFeature<"avoid-muls", + "AvoidMULS", "true", + "Avoid MULS instructions for M class cores">; + + /// Disable +1 predication cost for instructions updating CPSR. /// Enabled for Cortex-A57. /// True if disable +1 predication cost for instructions updating CPSR. Enabled for Cortex-A57.
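The two CalleeSavedRegs lists above (CSR_AAPCS_SplitPush_R11 and CSR_AAPCS_SplitPush_R7) are easiest to read as stack layouts. A minimal standalone sketch (plain C++, not LLVM code; the register names are copied from the definitions) that prints the order each list encodes, first entry nearest the caller's frame:

```cpp
#include <cstdio>

int main() {
  // SplitPush_R11: {r11, lr} are pushed on their own so the frame record
  // stays contiguous; the D-registers end up below them.
  const char *R11Order[] = {"r10", "r9", "r8", "r7", "r6", "r5", "r4",
                            "lr", "r11", "d15..d8"};
  // SplitPush_R7: {lr, r11, r7-r4} go first, then {r10-r8}.
  const char *R7Order[] = {"lr", "r11", "r7", "r6", "r5", "r4",
                           "r10", "r9", "r8", "d15..d8"};
  puts("CSR_AAPCS_SplitPush_R11 (top of frame first):");
  for (const char *R : R11Order)
    printf("  %s\n", R);
  puts("CSR_AAPCS_SplitPush_R7 (top of frame first):");
  for (const char *R : R7Order)
    printf("  %s\n", R);
}
```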
diff --git a/llvm/lib/Target/ARM/ARMFrameLowering.cpp b/llvm/lib/Target/ARM/ARMFrameLowering.cpp index 60062a2422e4848114cfe64cc39268b439c29002..82b6f808688eb0da57a71eee7c686c3a1014b66d 100644 --- a/llvm/lib/Target/ARM/ARMFrameLowering.cpp +++ b/llvm/lib/Target/ARM/ARMFrameLowering.cpp @@ -199,6 +199,11 @@ SpillArea getSpillArea(Register Reg, // push {r0-r10, r12} GPRCS1 // vpush {d8-d15} DPRCS1 // push {r11, lr} GPRCS2 + // + // SplitR11AAPCSSignRA: + // push {r0-r10, r12} GPRCS1 + // push {r11, lr} GPRCS2 + // vpush {d8-d15} DPRCS1 // If FPCXTNS is spilled (for CMSE secure entry functions), it is always at // the top of the stack frame. @@ -246,7 +251,8 @@ SpillArea getSpillArea(Register Reg, return SpillArea::GPRCS1; case ARM::LR: - if (Variation == ARMSubtarget::SplitR11WindowsSEH) + if (Variation == ARMSubtarget::SplitR11WindowsSEH || + Variation == ARMSubtarget::SplitR11AAPCSSignRA) return SpillArea::GPRCS2; else return SpillArea::GPRCS1; @@ -317,13 +323,17 @@ bool ARMFrameLowering::enableCalleeSaveSkip(const MachineFunction &MF) const { return true; } -/// hasFP - Return true if the specified function should have a dedicated frame -/// pointer register. This is true if the function has variable sized allocas -/// or if frame pointer elimination is disabled. -bool ARMFrameLowering::hasFP(const MachineFunction &MF) const { +/// hasFPImpl - Return true if the specified function should have a dedicated +/// frame pointer register. This is true if the function has variable sized +/// allocas or if frame pointer elimination is disabled. +bool ARMFrameLowering::hasFPImpl(const MachineFunction &MF) const { const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo(); const MachineFrameInfo &MFI = MF.getFrameInfo(); + // Check to see if the target wants to forcibly keep the frame pointer. + if (keepFramePointer(MF)) + return true; + // ABI-required frame pointer. if (MF.getTarget().Options.DisableFramePointerElim(MF)) return true; @@ -859,6 +869,9 @@ static int getMaxFPOffset(const ARMSubtarget &STI, const ARMFunctionInfo &AFI, // This is a conservative estimation: Assume the frame pointer being r7 and // pc("r15") up to r8 getting spilled before (= 8 registers). int MaxRegBytes = 8 * 4; + if (PushPopSplit == ARMSubtarget::SplitR11AAPCSSignRA) + // Here, r11 can be stored below all of r4-r15. + MaxRegBytes = 11 * 4; if (PushPopSplit == ARMSubtarget::SplitR11WindowsSEH) { // Here, r11 can be stored below all of r4-r15 plus d8-d15. MaxRegBytes = 11 * 4 + 8 * 8; @@ -931,17 +944,23 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF, } // Determine spill area sizes, and some important frame indices. + SpillArea FramePtrSpillArea; + bool BeforeFPPush = true; for (const CalleeSavedInfo &I : CSI) { Register Reg = I.getReg(); int FI = I.getFrameIdx(); - if (Reg == FramePtr) + SpillArea Area = getSpillArea(Reg, PushPopSplit, + AFI->getNumAlignedDPRCS2Regs(), RegInfo); + + if (Reg == FramePtr) { FramePtrSpillFI = FI; + FramePtrSpillArea = Area; + } if (Reg == ARM::D8) D8SpillFI = FI; - switch (getSpillArea(Reg, PushPopSplit, AFI->getNumAlignedDPRCS2Regs(), - RegInfo)) { + switch (Area) { case SpillArea::FPCXT: FPCXTSaveSize += 4; break; @@ -968,7 +987,7 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF, // Move past FPCXT area. if (FPCXTSaveSize > 0) { LastPush = MBBI++; - DefCFAOffsetCandidates.addInst(LastPush, FPCXTSaveSize, true); + DefCFAOffsetCandidates.addInst(LastPush, FPCXTSaveSize, BeforeFPPush); } // Allocate the vararg register save area.
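The SplitR11AAPCSSignRA layout described at the top of this hunk boils down to a three-way classification. A standalone model (hypothetical names, not the real getSpillArea signature), covering only the new variation:

```cpp
#include <cstdio>
#include <string>

enum class SpillArea { GPRCS1, GPRCS2, DPRCS1 };

// SplitR11AAPCSSignRA: push {r0-r10, r12} (GPRCS1), then push {r11, lr}
// (GPRCS2), then vpush {d8-d15} (DPRCS1).
SpillArea classifySignRA(const std::string &Reg) {
  if (Reg == "r11" || Reg == "lr")
    return SpillArea::GPRCS2; // the frame record, pushed separately
  if (!Reg.empty() && Reg[0] == 'd')
    return SpillArea::DPRCS1; // D-registers sit below the frame record
  return SpillArea::GPRCS1;   // everything else, including r12 (the PAC)
}

int main() {
  for (const char *R : {"r4", "r12", "r11", "lr", "d8"})
    printf("%-3s -> area %d\n", R, static_cast<int>(classifySignRA(R)));
}
```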
@@ -976,13 +995,15 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF, emitSPUpdate(isARM, MBB, MBBI, dl, TII, -ArgRegsSaveSize, MachineInstr::FrameSetup); LastPush = std::prev(MBBI); - DefCFAOffsetCandidates.addInst(LastPush, ArgRegsSaveSize, true); + DefCFAOffsetCandidates.addInst(LastPush, ArgRegsSaveSize, BeforeFPPush); } // Move past area 1. if (GPRCS1Size > 0) { GPRCS1Push = LastPush = MBBI++; - DefCFAOffsetCandidates.addInst(LastPush, GPRCS1Size, true); + DefCFAOffsetCandidates.addInst(LastPush, GPRCS1Size, BeforeFPPush); + if (FramePtrSpillArea == SpillArea::GPRCS1) + BeforeFPPush = false; } // Determine starting offsets of spill areas. These offsets are all positive @@ -1006,21 +1027,13 @@ } else { DPRCSOffset = GPRCS2Offset - DPRGapSize - DPRCSSize; } - int FramePtrOffsetInPush = 0; if (HasFP) { // Offset from the CFA to the saved frame pointer, will be negative. - int FPOffset = MFI.getObjectOffset(FramePtrSpillFI); + [[maybe_unused]] int FPOffset = MFI.getObjectOffset(FramePtrSpillFI); LLVM_DEBUG(dbgs() << "FramePtrSpillFI: " << FramePtrSpillFI << ", FPOffset: " << FPOffset << "\n"); assert(getMaxFPOffset(STI, *AFI, MF) <= FPOffset && "Max FP estimation is wrong"); - // Offset from the top of the GPRCS1 area to the saved frame pointer, will - // be negative. - FramePtrOffsetInPush = FPOffset + ArgRegsSaveSize + FPCXTSaveSize; - LLVM_DEBUG(dbgs() << "FramePtrOffsetInPush=" << FramePtrOffsetInPush - << ", FramePtrSpillOffset=" - << (MFI.getObjectOffset(FramePtrSpillFI) + NumBytes) - << "\n"); AFI->setFramePtrSpillOffset(MFI.getObjectOffset(FramePtrSpillFI) + NumBytes); } @@ -1032,7 +1045,9 @@ // after DPRCS1. if (GPRCS2Size > 0 && PushPopSplit != ARMSubtarget::SplitR11WindowsSEH) { GPRCS2Push = LastPush = MBBI++; - DefCFAOffsetCandidates.addInst(LastPush, GPRCS2Size); + DefCFAOffsetCandidates.addInst(LastPush, GPRCS2Size, BeforeFPPush); + if (FramePtrSpillArea == SpillArea::GPRCS2) + BeforeFPPush = false; } // Prolog/epilog inserter assumes we correctly align DPRs on the stack, so our @@ -1045,7 +1060,7 @@ else { emitSPUpdate(isARM, MBB, MBBI, dl, TII, -DPRGapSize, MachineInstr::FrameSetup); - DefCFAOffsetCandidates.addInst(std::prev(MBBI), DPRGapSize); + DefCFAOffsetCandidates.addInst(std::prev(MBBI), DPRGapSize, BeforeFPPush); } } @@ -1054,7 +1069,8 @@ // Since vpush register list cannot have gaps, there may be multiple vpush // instructions in the prologue. while (MBBI != MBB.end() && MBBI->getOpcode() == ARM::VSTMDDB_UPD) { - DefCFAOffsetCandidates.addInst(MBBI, sizeOfSPAdjustment(*MBBI)); + DefCFAOffsetCandidates.addInst(MBBI, sizeOfSPAdjustment(*MBBI), + BeforeFPPush); LastPush = MBBI++; } } @@ -1073,7 +1089,9 @@ // Move GPRCS2, if using SplitR11WindowsSEH. if (GPRCS2Size > 0 && PushPopSplit == ARMSubtarget::SplitR11WindowsSEH) { GPRCS2Push = LastPush = MBBI++; - DefCFAOffsetCandidates.addInst(LastPush, GPRCS2Size); + DefCFAOffsetCandidates.addInst(LastPush, GPRCS2Size, BeforeFPPush); + if (FramePtrSpillArea == SpillArea::GPRCS2) + BeforeFPPush = false; } bool NeedsWinCFIStackAlloc = NeedsWinCFI; @@ -1174,28 +1192,51 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF, // into spill area 1, including the FP in R11.
In either case, it // is in area one and the adjustment needs to take place just after // that push. - // FIXME: The above is not necessary true when PACBTI is enabled. - // AAPCS requires use of R11, and PACBTI gets in the way of regular pushes, - // so FP ends up on area two. MachineBasicBlock::iterator AfterPush; if (HasFP) { - AfterPush = std::next(GPRCS1Push); - unsigned PushSize = sizeOfSPAdjustment(*GPRCS1Push); - int FPOffset = PushSize + FramePtrOffsetInPush; - if (PushPopSplit == ARMSubtarget::SplitR11WindowsSEH) { - AfterPush = std::next(GPRCS2Push); - emitRegPlusImmediate(!AFI->isThumbFunction(), MBB, AfterPush, dl, TII, - FramePtr, ARM::SP, 0, MachineInstr::FrameSetup); - } else { - emitRegPlusImmediate(!AFI->isThumbFunction(), MBB, AfterPush, dl, TII, - FramePtr, ARM::SP, FPOffset, - MachineInstr::FrameSetup); + MachineBasicBlock::iterator FPPushInst; + // Offset from SP, immediately after the push that saved the FP, to the FP + // save slot. + int64_t FPOffsetAfterPush; + switch (FramePtrSpillArea) { + case SpillArea::GPRCS1: + FPPushInst = GPRCS1Push; + FPOffsetAfterPush = MFI.getObjectOffset(FramePtrSpillFI) + + ArgRegsSaveSize + FPCXTSaveSize + + sizeOfSPAdjustment(*FPPushInst); + LLVM_DEBUG(dbgs() << "Frame pointer in GPRCS1, offset " + << FPOffsetAfterPush << " after that push\n"); + break; + case SpillArea::GPRCS2: + FPPushInst = GPRCS2Push; + FPOffsetAfterPush = MFI.getObjectOffset(FramePtrSpillFI) + + ArgRegsSaveSize + FPCXTSaveSize + GPRCS1Size + + sizeOfSPAdjustment(*FPPushInst); + if (PushPopSplit == ARMSubtarget::SplitR11WindowsSEH) + FPOffsetAfterPush += DPRCSSize + DPRGapSize; + LLVM_DEBUG(dbgs() << "Frame pointer in GPRCS2, offset " + << FPOffsetAfterPush << " after that push\n"); + break; + default: + llvm_unreachable("frame pointer in unknown spill area"); + break; } + AfterPush = std::next(FPPushInst); + if (PushPopSplit == ARMSubtarget::SplitR11WindowsSEH) + assert(FPOffsetAfterPush == 0); + + // Emit the MOV or ADD to set up the frame pointer register. + emitRegPlusImmediate(!AFI->isThumbFunction(), MBB, AfterPush, dl, TII, + FramePtr, ARM::SP, FPOffsetAfterPush, + MachineInstr::FrameSetup); + if (!NeedsWinCFI) { - if (FramePtrOffsetInPush + PushSize != 0) { + // Emit DWARF info to find the CFA using the frame pointer from this + // point onward. + if (FPOffsetAfterPush != 0) { unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::cfiDefCfa( nullptr, MRI->getDwarfRegNum(FramePtr, true), - FPCXTSaveSize + ArgRegsSaveSize - FramePtrOffsetInPush)); + -MFI.getObjectOffset(FramePtrSpillFI))); BuildMI(MBB, AfterPush, dl, TII.get(TargetOpcode::CFI_INSTRUCTION)) .addCFIIndex(CFIIndex) .setMIFlags(MachineInstr::FrameSetup); @@ -1708,7 +1749,8 @@ void ARMFrameLowering::emitPopInst(MachineBasicBlock &MBB, if (Reg == ARM::LR && !isTailCall && !isVarArg && !isInterrupt && !isCmseEntry && !isTrap && AFI->getArgumentStackToRestore() == 0 && STI.hasV5TOps() && MBB.succ_empty() && !hasPAC && - PushPopSplit != ARMSubtarget::SplitR11WindowsSEH) { + (PushPopSplit != ARMSubtarget::SplitR11WindowsSEH && + PushPopSplit != ARMSubtarget::SplitR11AAPCSSignRA)) { Reg = ARM::PC; // Fold the return instruction into the LDM. DeleteRet = true; @@ -2365,7 +2407,8 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF, // to take advantage of the eliminateFrameIndex machinery. This also ensures it // is spilled in the order specified by getCalleeSavedRegs() to make it easier // to combine multiple loads / stores.
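For the FPOffsetAfterPush computation above, a worked example with made-up sizes for the GPRCS1 case (r7 frame pointer; object offsets are CFA-relative, hence negative):

```cpp
#include <cstdio>

int main() {
  // Assume CSR slots lr@-4, r11@-8, r7@-12, r6@-16, r5@-20, r4@-24, so the
  // single push {r4-r7, r11, lr} adjusts SP by 6 * 4 = 24 bytes.
  int FPObjectOffset = -12; // MFI.getObjectOffset(FramePtrSpillFI) for r7
  int ArgRegsSaveSize = 0, FPCXTSaveSize = 0;
  int PushSize = 24;        // sizeOfSPAdjustment(*FPPushInst)
  int FPOffsetAfterPush =
      FPObjectOffset + ArgRegsSaveSize + FPCXTSaveSize + PushSize;
  printf("add r7, sp, #%d\n", FPOffsetAfterPush); // emitRegPlusImmediate
  // The CFI rule uses the CFA-relative slot directly: -(-12) = 12.
  printf(".cfi_def_cfa r7, %d\n", -FPObjectOffset);
}
```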
- bool CanEliminateFrame = !(requiresAAPCSFrameRecord(MF) && hasFP(MF)); + bool CanEliminateFrame = !(requiresAAPCSFrameRecord(MF) && hasFP(MF)) && + !MF.getTarget().Options.DisableFramePointerElim(MF); bool CS1Spilled = false; bool LRSpilled = false; unsigned NumGPRSpills = 0; @@ -2940,18 +2983,29 @@ bool ARMFrameLowering::assignCalleeSavedSpillSlots( const auto &AFI = *MF.getInfo(); if (AFI.shouldSignReturnAddress()) { // The order of registers must match the order we push them, because the - // PEI assigns frame indices in that order. When compiling for return - // address sign and authenication, we use split push, therefore the orders - // we want are: - // LR, R7, R6, R5, R4, <R12>, R11, R10, R9, R8, D15-D8 - CSI.insert(find_if(CSI, - [=](const auto &CS) { - Register Reg = CS.getReg(); - return Reg == ARM::R10 || Reg == ARM::R11 || - Reg == ARM::R8 || Reg == ARM::R9 || - ARM::DPRRegClass.contains(Reg); - }), - CalleeSavedInfo(ARM::R12)); + // PEI assigns frame indices in that order. That order depends on the + // PushPopSplitVariation; there are only two cases we use with return + // address signing: + switch (STI.getPushPopSplitVariation(MF)) { + case ARMSubtarget::SplitR7: + // LR, R7, R6, R5, R4, <R12>, R11, R10, R9, R8, D15-D8 + CSI.insert(find_if(CSI, + [=](const auto &CS) { + Register Reg = CS.getReg(); + return Reg == ARM::R10 || Reg == ARM::R11 || + Reg == ARM::R8 || Reg == ARM::R9 || + ARM::DPRRegClass.contains(Reg); + }), + CalleeSavedInfo(ARM::R12)); + break; + case ARMSubtarget::SplitR11AAPCSSignRA: + // With SplitR11AAPCSSignRA, R12 will always be the highest-addressed CSR + // on the stack. + CSI.insert(CSI.begin(), CalleeSavedInfo(ARM::R12)); + break; + default: + llvm_unreachable("Unexpected CSR split with return address signing"); + } } return false; diff --git a/llvm/lib/Target/ARM/ARMFrameLowering.h b/llvm/lib/Target/ARM/ARMFrameLowering.h index 3f55884f80a43d77bc816f4207f5e1d70872f339..ff51f1a7af02293c2fbf16925f6a880afb1f87fa 100644 --- a/llvm/lib/Target/ARM/ARMFrameLowering.h +++ b/llvm/lib/Target/ARM/ARMFrameLowering.h @@ -41,11 +41,10 @@ public: MutableArrayRef CSI, const TargetRegisterInfo *TRI) const override; - bool keepFramePointer(const MachineFunction &MF) const override; + bool keepFramePointer(const MachineFunction &MF) const; bool enableCalleeSaveSkip(const MachineFunction &MF) const override; - bool hasFP(const MachineFunction &MF) const override; bool isFPReserved(const MachineFunction &MF) const; bool requiresAAPCSFrameRecord(const MachineFunction &MF) const; bool hasReservedCallFrame(const MachineFunction &MF) const override; @@ -87,6 +86,9 @@ public: const SpillSlot * getCalleeSavedSpillSlots(unsigned &NumEntries) const override; +protected: + bool hasFPImpl(const MachineFunction &MF) const override; + private: void emitPushInst(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, ArrayRef CSI, unsigned StmOpc, diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index a35582bebb08a322bd33be75cbd00d5f3a805dde..5d679a1a916dc4a411bd0b8d069e2e5fd6afba73 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -21141,30 +21141,26 @@ bool ARMTargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, Instruction *ARMTargetLowering::makeDMB(IRBuilderBase &Builder, ARM_MB::MemBOpt Domain) const { - Module *M = Builder.GetInsertBlock()->getParent()->getParent(); - // First, if the target has no DMB, see what fallback we can use.
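The mechanical change running through ARMISelLowering.cpp replaces the two-step "materialize the declaration, then call it" pattern with IRBuilder's one-step CreateIntrinsic, which inserts the declaration on demand. Schematically (lines adapted from the hunks themselves, shown out of context rather than as a compilable unit):

```cpp
// Before: look up or create the declaration, then emit the call.
Function *DMB = Intrinsic::getOrInsertDeclaration(M, Intrinsic::arm_dmb);
Builder.CreateCall(DMB, CDomain);

// After: one call; the {} is the (empty) list of overloaded types for
// arm_dmb, and a name or FMF source can be supplied for value intrinsics.
Builder.CreateIntrinsic(Intrinsic::arm_dmb, {}, CDomain);
```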
if (!Subtarget->hasDataBarrier()) { // Some ARMv6 cpus can support data barriers with an mcr instruction. // Thumb1 and pre-v6 ARM mode use a libcall instead and should never get // here. if (Subtarget->hasV6Ops() && !Subtarget->isThumb()) { - Function *MCR = Intrinsic::getOrInsertDeclaration(M, Intrinsic::arm_mcr); Value* args[6] = {Builder.getInt32(15), Builder.getInt32(0), Builder.getInt32(0), Builder.getInt32(7), Builder.getInt32(10), Builder.getInt32(5)}; - return Builder.CreateCall(MCR, args); + return Builder.CreateIntrinsic(Intrinsic::arm_mcr, {}, args); } else { // Instead of using barriers, atomic accesses on these subtargets use // libcalls. llvm_unreachable("makeDMB on a target so old that it has no barriers"); } } else { - Function *DMB = Intrinsic::getOrInsertDeclaration(M, Intrinsic::arm_dmb); // Only a full system barrier exists in the M-class architectures. Domain = Subtarget->isMClass() ? ARM_MB::SY : Domain; Constant *CDomain = Builder.getInt32(Domain); - return Builder.CreateCall(DMB, CDomain); + return Builder.CreateIntrinsic(Intrinsic::arm_dmb, {}, CDomain); } } @@ -21309,7 +21305,7 @@ bool ARMTargetLowering::shouldInsertFencesForAtomic( return InsertFencesForAtomic; } -bool ARMTargetLowering::useLoadStackGuardNode() const { +bool ARMTargetLowering::useLoadStackGuardNode(const Module &M) const { // ROPI/RWPI are not supported currently. return !Subtarget->isROPI() && !Subtarget->isRWPI(); } @@ -21417,9 +21413,9 @@ Value *ARMTargetLowering::emitLoadLinked(IRBuilderBase &Builder, Type *ValueTy, if (ValueTy->getPrimitiveSizeInBits() == 64) { Intrinsic::ID Int = IsAcquire ? Intrinsic::arm_ldaexd : Intrinsic::arm_ldrexd; - Function *Ldrex = Intrinsic::getOrInsertDeclaration(M, Int); - Value *LoHi = Builder.CreateCall(Ldrex, Addr, "lohi"); + Value *LoHi = + Builder.CreateIntrinsic(Int, {}, Addr, /*FMFSource=*/nullptr, "lohi"); Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo"); Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi"); @@ -21433,8 +21429,7 @@ Value *ARMTargetLowering::emitLoadLinked(IRBuilderBase &Builder, Type *ValueTy, Type *Tys[] = { Addr->getType() }; Intrinsic::ID Int = IsAcquire ? Intrinsic::arm_ldaex : Intrinsic::arm_ldrex; - Function *Ldrex = Intrinsic::getOrInsertDeclaration(M, Int, Tys); - CallInst *CI = Builder.CreateCall(Ldrex, Addr); + CallInst *CI = Builder.CreateIntrinsic(Int, Tys, Addr); CI->addParamAttr( 0, Attribute::get(M->getContext(), Attribute::ElementType, ValueTy)); @@ -21445,9 +21440,7 @@ void ARMTargetLowering::emitAtomicCmpXchgNoStoreLLBalance( IRBuilderBase &Builder) const { if (!Subtarget->hasV7Ops()) return; - Module *M = Builder.GetInsertBlock()->getParent()->getParent(); - Builder.CreateCall( - Intrinsic::getOrInsertDeclaration(M, Intrinsic::arm_clrex)); + Builder.CreateIntrinsic(Intrinsic::arm_clrex, {}, {}); } Value *ARMTargetLowering::emitStoreConditional(IRBuilderBase &Builder, @@ -21462,14 +21455,13 @@ Value *ARMTargetLowering::emitStoreConditional(IRBuilderBase &Builder, if (Val->getType()->getPrimitiveSizeInBits() == 64) { Intrinsic::ID Int = IsRelease ? 
Intrinsic::arm_stlexd : Intrinsic::arm_strexd; - Function *Strex = Intrinsic::getOrInsertDeclaration(M, Int); Type *Int32Ty = Type::getInt32Ty(M->getContext()); Value *Lo = Builder.CreateTrunc(Val, Int32Ty, "lo"); Value *Hi = Builder.CreateTrunc(Builder.CreateLShr(Val, 32), Int32Ty, "hi"); if (!Subtarget->isLittle()) std::swap(Lo, Hi); - return Builder.CreateCall(Strex, {Lo, Hi, Addr}); + return Builder.CreateIntrinsic(Int, {}, {Lo, Hi, Addr}); } Intrinsic::ID Int = IsRelease ? Intrinsic::arm_stlex : Intrinsic::arm_strex; @@ -21602,14 +21594,13 @@ bool ARMTargetLowering::lowerInterleavedLoad( static const Intrinsic::ID LoadInts[3] = {Intrinsic::arm_neon_vld2, Intrinsic::arm_neon_vld3, Intrinsic::arm_neon_vld4}; - Function *VldnFunc = Intrinsic::getOrInsertDeclaration( - LI->getModule(), LoadInts[Factor - 2], Tys); SmallVector Ops; Ops.push_back(BaseAddr); Ops.push_back(Builder.getInt32(LI->getAlign().value())); - return Builder.CreateCall(VldnFunc, Ops, "vldN"); + return Builder.CreateIntrinsic(LoadInts[Factor - 2], Tys, Ops, + /*FMFSource=*/nullptr, "vldN"); } else { assert((Factor == 2 || Factor == 4) && "expected interleave factor of 2 or 4 for MVE"); @@ -21617,12 +21608,11 @@ bool ARMTargetLowering::lowerInterleavedLoad( Factor == 2 ? Intrinsic::arm_mve_vld2q : Intrinsic::arm_mve_vld4q; Type *PtrTy = Builder.getPtrTy(LI->getPointerAddressSpace()); Type *Tys[] = {VecTy, PtrTy}; - Function *VldnFunc = - Intrinsic::getOrInsertDeclaration(LI->getModule(), LoadInts, Tys); SmallVector Ops; Ops.push_back(BaseAddr); - return Builder.CreateCall(VldnFunc, Ops, "vldN"); + return Builder.CreateIntrinsic(LoadInts, Tys, Ops, /*FMFSource=*/nullptr, + "vldN"); } }; @@ -21763,14 +21753,11 @@ bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI, Type *PtrTy = Builder.getPtrTy(SI->getPointerAddressSpace()); Type *Tys[] = {PtrTy, SubVecTy}; - Function *VstNFunc = Intrinsic::getOrInsertDeclaration( - SI->getModule(), StoreInts[Factor - 2], Tys); - SmallVector Ops; Ops.push_back(BaseAddr); append_range(Ops, Shuffles); Ops.push_back(Builder.getInt32(SI->getAlign().value())); - Builder.CreateCall(VstNFunc, Ops); + Builder.CreateIntrinsic(StoreInts[Factor - 2], Tys, Ops); } else { assert((Factor == 2 || Factor == 4) && "expected interleave factor of 2 or 4 for MVE"); @@ -21778,15 +21765,13 @@ bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI, Factor == 2 ? 
Intrinsic::arm_mve_vst2q : Intrinsic::arm_mve_vst4q; Type *PtrTy = Builder.getPtrTy(SI->getPointerAddressSpace()); Type *Tys[] = {PtrTy, SubVecTy}; - Function *VstNFunc = - Intrinsic::getOrInsertDeclaration(SI->getModule(), StoreInts, Tys); SmallVector Ops; Ops.push_back(BaseAddr); append_range(Ops, Shuffles); for (unsigned F = 0; F < Factor; F++) { Ops.push_back(Builder.getInt32(F)); - Builder.CreateCall(VstNFunc, Ops); + Builder.CreateIntrinsic(StoreInts, Tys, Ops); Ops.pop_back(); } } diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h index 316f7d3b9bce5d96ebe02090edf88fb12e89c645..ef651bc3d84c0fa0f952bd7933355deca378c229 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.h +++ b/llvm/lib/Target/ARM/ARMISelLowering.h @@ -675,7 +675,7 @@ class VectorType; TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override; - bool useLoadStackGuardNode() const override; + bool useLoadStackGuardNode(const Module &M) const override; void insertSSPDeclarations(Module &M) const override; Value *getSDagStackGuard(const Module &M) const override; diff --git a/llvm/lib/Target/ARM/ARMProcessors.td b/llvm/lib/Target/ARM/ARMProcessors.td index 08f62d12f4a9f1b6256c82b37dbdb0387a39cbd6..b94a5fc1614697c2fdcb546c85ed13408fe5cdee 100644 --- a/llvm/lib/Target/ARM/ARMProcessors.td +++ b/llvm/lib/Target/ARM/ARMProcessors.td @@ -360,6 +360,7 @@ def : ProcessorModel<"cortex-m33", CortexM4Model, [ARMv8mMainline, FeatureHasSlowFPVFMx, FeatureUseMISched, FeatureHasNoBranchPredictor, + FeatureAvoidMULS, FeatureFixCMSE_CVE_2021_35465]>; def : ProcessorModel<"star-mc1", CortexM4Model, [ARMv8mMainline, diff --git a/llvm/lib/Target/ARM/ARMRegisterInfo.td b/llvm/lib/Target/ARM/ARMRegisterInfo.td index 212f22651f9f9456555bf7bd6c474747ad4ef47b..f37d0fe542b4f7ed4f332655fce30b59ab22b910 100644 --- a/llvm/lib/Target/ARM/ARMRegisterInfo.td +++ b/llvm/lib/Target/ARM/ARMRegisterInfo.td @@ -200,9 +200,9 @@ def FPEXC : ARMReg<8, "fpexc">; def FPINST : ARMReg<9, "fpinst">; def FPINST2 : ARMReg<10, "fpinst2">; // These encodings aren't actual instruction encodings; their encoding depends -// on the instruction they are used in and for VPR 32 was chosen such that it +// on the instruction they are used in and for VPR 64 was chosen such that it // always comes last in spr_reglist_with_vpr. -def VPR : ARMReg<32, "vpr">; +def VPR : ARMReg<64, "vpr">; def FPSCR_NZCVQC : ARMReg<2, "fpscr_nzcvqc">; def P0 : ARMReg<13, "p0">; diff --git a/llvm/lib/Target/ARM/ARMSubtarget.cpp b/llvm/lib/Target/ARM/ARMSubtarget.cpp index c4a782bc40910a6d4034b533e84f0a6428ed5d79..9adfb1fab5f08473c9f0aa3026f2aedca23dc8e0 100644 --- a/llvm/lib/Target/ARM/ARMSubtarget.cpp +++ b/llvm/lib/Target/ARM/ARMSubtarget.cpp @@ -514,5 +514,12 @@ ARMSubtarget::getPushPopSplitVariation(const MachineFunction &MF) const { F.needsUnwindTableEntry() && (MFI.hasVarSizedObjects() || getRegisterInfo()->hasStackRealignment(MF)) return SplitR11WindowsSEH; + + // Return SplitR11AAPCSSignRA if R11 and LR are not adjacent to each + // other in the list of callee-saved registers in a frame, and return + // address signing is enabled.
+ if (MF.getInfo()->shouldSignReturnAddress() && + getFramePointerReg() == ARM::R11) + return SplitR11AAPCSSignRA; return NoSplit; } diff --git a/llvm/lib/Target/ARM/ARMSubtarget.h b/llvm/lib/Target/ARM/ARMSubtarget.h index 7917ddc17bdb812d97606bcb092c013a8046735c..214c5f1b45e556cf7390046a05bc9b2a10ac1370 100644 --- a/llvm/lib/Target/ARM/ARMSubtarget.h +++ b/llvm/lib/Target/ARM/ARMSubtarget.h @@ -105,6 +105,18 @@ public: /// vpush {d8-d15} /// push {r11, lr} SplitR11WindowsSEH, + + /// When generating AAPCS-compliant frame chains, R11 is the frame pointer, + /// and must be pushed adjacent to the return address (LR). Normally this + /// isn't a problem, because the only register between them is r12, which is + /// the intra-procedure-call scratch register, so doesn't need to be saved. + /// However, when PACBTI is in use, r12 contains the authentication code, so + /// does need to be saved. This means that we need a separate push for R11 + /// and LR. + /// push {r0-r10, r12} + /// push {r11, lr} + /// vpush {d8-d15} + SplitR11AAPCSSignRA, }; protected: diff --git a/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp index 75fb90477f8854903155739bbbc3d165ccef6782..54eb0118d77887afe697dc17c3f0749badb03e0c 100644 --- a/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp +++ b/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp @@ -446,8 +446,8 @@ class ARMAsmParser : public MCTargetAsmParser { int tryParseShiftRegister(OperandVector &); std::optional tryParseShiftToken(); bool parseRegisterList(OperandVector &, bool EnforceOrder = true, - bool AllowRAAC = false, - bool AllowOutOfBoundReg = false); + bool AllowRAAC = false, bool IsLazyLoadStore = false, + bool IsVSCCLRM = false); bool parseMemory(OperandVector &); bool parseOperand(OperandVector &, StringRef Mnemonic); bool parseImmExpr(int64_t &Out); @@ -1158,7 +1158,8 @@ public: bool isFPImm() const { if (!isImm()) return false; const MCConstantExpr *CE = dyn_cast(getImm()); - if (!CE) return false; + if (!CE || !isUInt<32>(CE->getValue())) + return false; int Val = ARM_AM::getFP32Imm(APInt(32, CE->getValue())); return Val != -1; } @@ -2532,14 +2533,14 @@ public: void addCondCodeOperands(MCInst &Inst, unsigned N) const { assert(N == 2 && "Invalid number of operands!"); Inst.addOperand(MCOperand::createImm(unsigned(getCondCode()))); - unsigned RegNum = getCondCode() == ARMCC::AL ? 0: ARM::CPSR; + unsigned RegNum = getCondCode() == ARMCC::AL ? ARM::NoRegister : ARM::CPSR; Inst.addOperand(MCOperand::createReg(RegNum)); } void addVPTPredNOperands(MCInst &Inst, unsigned N) const { assert(N == 3 && "Invalid number of operands!"); Inst.addOperand(MCOperand::createImm(unsigned(getVPTPred()))); - unsigned RegNum = getVPTPred() == ARMVCC::None ? 0: ARM::P0; + unsigned RegNum = getVPTPred() == ARMVCC::None ? ARM::NoRegister : ARM::P0; Inst.addOperand(MCOperand::createReg(RegNum)); Inst.addOperand(MCOperand::createReg(0)); } @@ -3810,6 +3811,10 @@ public: Kind = k_FPSRegisterListWithVPR; else Kind = k_SPRRegisterList; + } else if (Regs.front().second == ARM::VPR) { + assert(Regs.size() == 1 && + "Register list starting with VPR expected to only contain VPR"); + Kind = k_FPSRegisterListWithVPR; } if (Kind == k_RegisterList && Regs.back().second == ARM::APSR) @@ -4607,7 +4612,8 @@ insertNoDuplicates(SmallVectorImpl> &Regs, /// Parse a register list.
bool ARMAsmParser::parseRegisterList(OperandVector &Operands, bool EnforceOrder, - bool AllowRAAC, bool AllowOutOfBoundReg) { + bool AllowRAAC, bool IsLazyLoadStore, + bool IsVSCCLRM) { MCAsmParser &Parser = getParser(); if (Parser.getTok().isNot(AsmToken::LCurly)) return TokError("Token is not a Left Curly Brace"); @@ -4617,15 +4623,23 @@ bool ARMAsmParser::parseRegisterList(OperandVector &Operands, bool EnforceOrder, // Check the first register in the list to see what register class // this is a list of. - MCRegister Reg = tryParseRegister(); + bool AllowOutOfBoundReg = IsLazyLoadStore || IsVSCCLRM; + MCRegister Reg = tryParseRegister(AllowOutOfBoundReg); if (!Reg) return Error(RegLoc, "register expected"); if (!AllowRAAC && Reg == ARM::RA_AUTH_CODE) return Error(RegLoc, "pseudo-register not allowed"); - // The reglist instructions have at most 16 registers, so reserve + // The reglist instructions have at most 32 registers, so reserve // space for that many. int EReg = 0; - SmallVector, 16> Registers; + SmallVector, 32> Registers; + + // Single-precision VSCCLRM can have double-precision registers in the + // register list. When VSCCLRMAdjustEncoding is true then we've switched from + // single-precision to double-precision and we pretend that these registers + // are encoded as S32 onwards, which we can do by adding 16 to the encoding + // value. + bool VSCCLRMAdjustEncoding = false; // Allow Q regs and just interpret them as the two D sub-registers. if (ARMMCRegisterClasses[ARM::QPRRegClassID].contains(Reg)) { @@ -4644,6 +4658,8 @@ bool ARMAsmParser::parseRegisterList(OperandVector &Operands, bool EnforceOrder, RC = &ARMMCRegisterClasses[ARM::SPRRegClassID]; else if (ARMMCRegisterClasses[ARM::GPRwithAPSRnospRegClassID].contains(Reg)) RC = &ARMMCRegisterClasses[ARM::GPRwithAPSRnospRegClassID]; + else if (Reg == ARM::VPR) + RC = &ARMMCRegisterClasses[ARM::FPWithVPRRegClassID]; else return Error(RegLoc, "invalid register in register list"); @@ -4684,6 +4700,8 @@ bool ARMAsmParser::parseRegisterList(OperandVector &Operands, bool EnforceOrder, while (Reg != EndReg) { Reg = getNextRegister(Reg); EReg = MRI->getEncodingValue(Reg); + if (VSCCLRMAdjustEncoding) + EReg += 16; if (!insertNoDuplicates(Registers, EReg, Reg)) { Warning(AfterMinusLoc, StringRef("duplicated register (") + ARMInstPrinter::getRegisterName(Reg) + @@ -4695,6 +4713,7 @@ bool ARMAsmParser::parseRegisterList(OperandVector &Operands, bool EnforceOrder, Parser.Lex(); // Eat the comma. RegLoc = Parser.getTok().getLoc(); MCRegister OldReg = Reg; + int EOldReg = EReg; const AsmToken RegTok = Parser.getTok(); Reg = tryParseRegister(AllowOutOfBoundReg); if (!Reg) @@ -4726,6 +4745,12 @@ bool ARMAsmParser::parseRegisterList(OperandVector &Operands, bool EnforceOrder, } continue; } + // VSCCLRM can switch from single-precision to double-precision only when + // S31 is followed by D16. + if (IsVSCCLRM && OldReg == ARM::S31 && Reg == ARM::D16) { + VSCCLRMAdjustEncoding = true; + RC = &ARMMCRegisterClasses[ARM::FPWithVPRRegClassID]; + } // The register must be in the same register class as the first. if ((Reg == ARM::RA_AUTH_CODE && RC != &ARMMCRegisterClasses[ARM::GPRRegClassID]) || @@ -4735,8 +4760,10 @@ bool ARMAsmParser::parseRegisterList(OperandVector &Operands, bool EnforceOrder, // exception is CLRM, which is order-independent anyway, so // there's no potential for confusion if you write clrm {r2,r1} // instead of clrm {r1,r2}. 
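To make the encoding adjustment described above concrete, here is a minimal standalone sketch (assuming the usual encoding values, S0-S31 as 0-31 and D16-D31 as 16-31; the helper name is illustrative and not part of the parser):

#include <cassert>

// After the S31 -> D16 switch in a VSCCLRM list, D registers are treated as
// if they were encoded as S32 onwards, i.e. their encoding value plus 16.
unsigned vscclrmEncoding(unsigned EncVal, bool VSCCLRMAdjustEncoding) {
  return VSCCLRMAdjustEncoding ? EncVal + 16 : EncVal;
}

void example() {
  // vscclrm {s30, s31, d16, d17, vpr}: the adjusted values stay contiguous,
  // so the ordering and contiguity checks below still apply unchanged.
  assert(vscclrmEncoding(30, false) == 30); // s30
  assert(vscclrmEncoding(31, false) == 31); // s31
  assert(vscclrmEncoding(16, true) == 32);  // d16, treated as "S32"
  assert(vscclrmEncoding(17, true) == 33);  // d17, treated as "S33"
}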
- if (EnforceOrder && - MRI->getEncodingValue(Reg) < MRI->getEncodingValue(OldReg)) { + EReg = MRI->getEncodingValue(Reg); + if (VSCCLRMAdjustEncoding) + EReg += 16; + if (EnforceOrder && EReg < EOldReg) { if (ARMMCRegisterClasses[ARM::GPRRegClassID].contains(Reg)) Warning(RegLoc, "register list not in ascending order"); else if (!ARMMCRegisterClasses[ARM::GPRwithAPSRnospRegClassID].contains(Reg)) @@ -4745,9 +4772,9 @@ bool ARMAsmParser::parseRegisterList(OperandVector &Operands, bool EnforceOrder, // VFP register lists must also be contiguous. if (RC != &ARMMCRegisterClasses[ARM::GPRRegClassID] && RC != &ARMMCRegisterClasses[ARM::GPRwithAPSRnospRegClassID] && - Reg != OldReg + 1) + EReg != EOldReg + 1) return Error(RegLoc, "non-contiguous register range"); - EReg = MRI->getEncodingValue(Reg); + if (!insertNoDuplicates(Registers, EReg, Reg)) { Warning(RegLoc, "duplicated register (" + RegTok.getString() + ") in register list"); @@ -6335,9 +6362,10 @@ bool ARMAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic) { case AsmToken::LBrac: return parseMemory(Operands); case AsmToken::LCurly: { - bool AllowOutOfBoundReg = Mnemonic == "vlldm" || Mnemonic == "vlstm"; + bool IsLazyLoadStore = Mnemonic == "vlldm" || Mnemonic == "vlstm"; + bool IsVSCCLRM = Mnemonic == "vscclrm"; return parseRegisterList(Operands, !Mnemonic.starts_with("clr"), false, - AllowOutOfBoundReg); + IsLazyLoadStore, IsVSCCLRM); } case AsmToken::Dollar: case AsmToken::Hash: { @@ -7164,8 +7192,8 @@ bool ARMAsmParser::parseInstruction(ParseInstructionInfo &Info, StringRef Name, // Add the carry setting operand, if necessary. if (CanAcceptCarrySet && CarrySetting) { SMLoc Loc = SMLoc::getFromPointer(NameLoc.getPointer() + Mnemonic.size()); - Operands.push_back( - ARMOperand::CreateCCOut(CarrySetting ? ARM::CPSR : 0, Loc, *this)); + Operands.push_back(ARMOperand::CreateCCOut( + CarrySetting ? ARM::CPSR : ARM::NoRegister, Loc, *this)); } // Add the predication code operand, if necessary. @@ -10372,7 +10400,8 @@ bool ARMAsmParser::processInstruction(MCInst &Inst, case ARM::t2ASRri: if (isARMLowRegister(Inst.getOperand(0).getReg()) && isARMLowRegister(Inst.getOperand(1).getReg()) && - Inst.getOperand(5).getReg() == (inITBlock() ? 0 : ARM::CPSR) && + Inst.getOperand(5).getReg() == + (inITBlock() ? ARM::NoRegister : ARM::CPSR) && !HasWideQualifier) { unsigned NewOpc; switch (Inst.getOpcode()) { @@ -10422,14 +10451,14 @@ bool ARMAsmParser::processInstruction(MCInst &Inst, TmpInst.addOperand(Inst.getOperand(0)); // Rd if (isNarrow) TmpInst.addOperand(MCOperand::createReg( - Inst.getOpcode() == ARM::t2MOVSsr ? ARM::CPSR : 0)); + Inst.getOpcode() == ARM::t2MOVSsr ? ARM::CPSR : ARM::NoRegister)); TmpInst.addOperand(Inst.getOperand(1)); // Rn TmpInst.addOperand(Inst.getOperand(2)); // Rm TmpInst.addOperand(Inst.getOperand(4)); // CondCode TmpInst.addOperand(Inst.getOperand(5)); if (!isNarrow) TmpInst.addOperand(MCOperand::createReg( - Inst.getOpcode() == ARM::t2MOVSsr ? ARM::CPSR : 0)); + Inst.getOpcode() == ARM::t2MOVSsr ? ARM::CPSR : ARM::NoRegister)); Inst = TmpInst; return true; } @@ -10475,7 +10504,7 @@ bool ARMAsmParser::processInstruction(MCInst &Inst, TmpInst.addOperand(Inst.getOperand(0)); // Rd if (isNarrow && !isMov) TmpInst.addOperand(MCOperand::createReg( - Inst.getOpcode() == ARM::t2MOVSsi ? ARM::CPSR : 0)); + Inst.getOpcode() == ARM::t2MOVSsi ? 
ARM::CPSR : ARM::NoRegister)); TmpInst.addOperand(Inst.getOperand(1)); // Rn if (newOpc != ARM::t2RRX && !isMov) TmpInst.addOperand(MCOperand::createImm(Amount)); @@ -10483,7 +10512,7 @@ bool ARMAsmParser::processInstruction(MCInst &Inst, TmpInst.addOperand(Inst.getOperand(4)); if (!isNarrow) TmpInst.addOperand(MCOperand::createReg( - Inst.getOpcode() == ARM::t2MOVSsi ? ARM::CPSR : 0)); + Inst.getOpcode() == ARM::t2MOVSsi ? ARM::CPSR : ARM::NoRegister)); Inst = TmpInst; return true; } @@ -10684,7 +10713,8 @@ bool ARMAsmParser::processInstruction(MCInst &Inst, !isARMLowRegister(Inst.getOperand(0).getReg()) || (Inst.getOperand(2).isImm() && (unsigned)Inst.getOperand(2).getImm() > 255) || - Inst.getOperand(5).getReg() != (inITBlock() ? 0 : ARM::CPSR) || + Inst.getOperand(5).getReg() != + (inITBlock() ? ARM::NoRegister : ARM::CPSR) || HasWideQualifier) break; MCInst TmpInst; @@ -10852,7 +10882,8 @@ bool ARMAsmParser::processInstruction(MCInst &Inst, if (isARMLowRegister(Inst.getOperand(0).getReg()) && (Inst.getOperand(1).isImm() && (unsigned)Inst.getOperand(1).getImm() <= 255) && - Inst.getOperand(4).getReg() == (inITBlock() ? 0 : ARM::CPSR) && + Inst.getOperand(4).getReg() == + (inITBlock() ? ARM::NoRegister : ARM::CPSR) && !HasWideQualifier) { // The operands aren't in the same order for tMOVi8... MCInst TmpInst; @@ -10993,7 +11024,8 @@ bool ARMAsmParser::processInstruction(MCInst &Inst, if ((isARMLowRegister(Inst.getOperand(1).getReg()) && isARMLowRegister(Inst.getOperand(2).getReg())) && Inst.getOperand(0).getReg() == Inst.getOperand(1).getReg() && - Inst.getOperand(5).getReg() == (inITBlock() ? 0 : ARM::CPSR) && + Inst.getOperand(5).getReg() == + (inITBlock() ? ARM::NoRegister : ARM::CPSR) && !HasWideQualifier) { unsigned NewOpc; switch (Inst.getOpcode()) { @@ -11029,7 +11061,8 @@ bool ARMAsmParser::processInstruction(MCInst &Inst, isARMLowRegister(Inst.getOperand(2).getReg())) && (Inst.getOperand(0).getReg() == Inst.getOperand(1).getReg() || Inst.getOperand(0).getReg() == Inst.getOperand(2).getReg()) && - Inst.getOperand(5).getReg() == (inITBlock() ? 0 : ARM::CPSR) && + Inst.getOperand(5).getReg() == + (inITBlock() ? ARM::NoRegister : ARM::CPSR) && !HasWideQualifier) { unsigned NewOpc; switch (Inst.getOpcode()) { diff --git a/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp b/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp index 93b74905fc59fc32e172d15acd6f04fb82bedfd4..814b71d17319885230d77955fb3030be0b152e99 100644 --- a/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp +++ b/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp @@ -894,12 +894,13 @@ void ARMDisassembler::AddThumb1SBit(MCInst &MI, bool InITBlock) const { MCID.operands()[i].RegClass == ARM::CCRRegClassID) { if (i > 0 && MCID.operands()[i - 1].isPredicate()) continue; - MI.insert(I, MCOperand::createReg(InITBlock ? 0 : ARM::CPSR)); + MI.insert(I, + MCOperand::createReg(InITBlock ? ARM::NoRegister : ARM::CPSR)); return; } } - MI.insert(I, MCOperand::createReg(InITBlock ? 0 : ARM::CPSR)); + MI.insert(I, MCOperand::createReg(InITBlock ? ARM::NoRegister : ARM::CPSR)); } bool ARMDisassembler::isVectorPredicable(const MCInst &MI) const { @@ -1528,15 +1529,19 @@ static const uint16_t DPRDecoderTable[] = { ARM::D28, ARM::D29, ARM::D30, ARM::D31 }; -static DecodeStatus DecodeDPRRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Address, - const MCDisassembler *Decoder) { +// Does this instruction/subtarget permit use of registers d16-d31? 
+static bool PermitsD32(const MCInst &Inst, const MCDisassembler *Decoder) { + if (Inst.getOpcode() == ARM::VSCCLRMD || Inst.getOpcode() == ARM::VSCCLRMS) + return true; const FeatureBitset &featureBits = ((const MCDisassembler*)Decoder)->getSubtargetInfo().getFeatureBits(); + return featureBits[ARM::FeatureD32]; +} - bool hasD32 = featureBits[ARM::FeatureD32]; - - if (RegNo > 31 || (!hasD32 && RegNo > 15)) +static DecodeStatus DecodeDPRRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Address, + const MCDisassembler *Decoder) { + if (RegNo > (PermitsD32(Inst, Decoder) ? 31u : 15u)) return MCDisassembler::Fail; unsigned Register = DPRDecoderTable[RegNo]; @@ -1815,10 +1820,11 @@ static DecodeStatus DecodeDPRRegListOperand(MCInst &Inst, unsigned Val, unsigned regs = fieldFromInstruction(Val, 1, 7); // In case of unpredictable encoding, tweak the operands. - if (regs == 0 || regs > 16 || (Vd + regs) > 32) { - regs = Vd + regs > 32 ? 32 - Vd : regs; + unsigned MaxReg = PermitsD32(Inst, Decoder) ? 32 : 16; + if (regs == 0 || (Vd + regs) > MaxReg) { + regs = Vd + regs > MaxReg ? MaxReg - Vd : regs; regs = std::max( 1u, regs); - regs = std::min(16u, regs); + regs = std::min(MaxReg, regs); S = MCDisassembler::SoftFail; } @@ -6446,20 +6452,31 @@ static DecodeStatus DecodeVSCCLRM(MCInst &Inst, unsigned Insn, uint64_t Address, Inst.addOperand(MCOperand::createImm(ARMCC::AL)); Inst.addOperand(MCOperand::createReg(0)); - if (Inst.getOpcode() == ARM::VSCCLRMD) { - unsigned reglist = (fieldFromInstruction(Insn, 1, 7) << 1) | - (fieldFromInstruction(Insn, 12, 4) << 8) | + unsigned regs = fieldFromInstruction(Insn, 0, 8); + if (regs == 0) { + // Register list contains only VPR + } else if (Inst.getOpcode() == ARM::VSCCLRMD) { + unsigned reglist = regs | (fieldFromInstruction(Insn, 12, 4) << 8) | (fieldFromInstruction(Insn, 22, 1) << 12); if (!Check(S, DecodeDPRRegListOperand(Inst, reglist, Address, Decoder))) { return MCDisassembler::Fail; } } else { - unsigned reglist = fieldFromInstruction(Insn, 0, 8) | - (fieldFromInstruction(Insn, 22, 1) << 8) | - (fieldFromInstruction(Insn, 12, 4) << 9); - if (!Check(S, DecodeSPRRegListOperand(Inst, reglist, Address, Decoder))) { - return MCDisassembler::Fail; - } + unsigned Vd = (fieldFromInstruction(Insn, 12, 4) << 1) | + fieldFromInstruction(Insn, 22, 1); + // Registers past s31 are permitted and treated as being half of a d + // register, though both halves of each d register must be present. 
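As a worked example of the list expansion described above, a standalone sketch of the same arithmetic (the helper is hypothetical; the real decoder emits operands through DecodeSPRRegisterClass and DecodeDPRRegisterClass and additionally soft-fails on malformed counts):

#include <algorithm>
#include <cstdio>

// Mirror of the VSCCLRMS decode arithmetic: Vd is the first S register and
// regs counts S-sized halves; halves past S31 pair up into D16 onwards.
void expandVSCCLRMSList(unsigned Vd, unsigned regs) {
  unsigned max_reg = Vd + regs;                   // one past the last S half
  unsigned max_sreg = std::min(32u, max_reg);     // real S registers
  unsigned max_dreg = std::min(32u, max_reg / 2); // D regs covering S32+
  for (unsigned i = Vd; i < max_sreg; ++i)
    std::printf("s%u ", i);
  for (unsigned i = 16; i < max_dreg; ++i)
    std::printf("d%u ", i);
  std::printf("\n"); // e.g. Vd=30, regs=6 -> "s30 s31 d16 d17"
}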
+ unsigned max_reg = Vd + regs; + if (max_reg > 64 || (max_reg > 32 && (max_reg & 1))) + S = MCDisassembler::SoftFail; + unsigned max_sreg = std::min(32u, max_reg); + unsigned max_dreg = std::min(32u, max_reg / 2); + for (unsigned i = Vd; i < max_sreg; ++i) + if (!Check(S, DecodeSPRRegisterClass(Inst, i, Address, Decoder))) + return MCDisassembler::Fail; + for (unsigned i = 16; i < max_dreg; ++i) + if (!Check(S, DecodeDPRRegisterClass(Inst, i, Address, Decoder))) + return MCDisassembler::Fail; } Inst.addOperand(MCOperand::createReg(ARM::VPR)); diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.cpp index 5636cc6287ac46015a9b7b4eb5abd80e5abffb35..e4a2f8c8f2ea0ceb75b5f61ff458eba6ba3425c1 100644 --- a/llvm/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.cpp +++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.cpp @@ -851,7 +851,7 @@ void ARMInstPrinter::printPKHASRShiftImm(const MCInst *MI, unsigned OpNum, void ARMInstPrinter::printRegisterList(const MCInst *MI, unsigned OpNum, const MCSubtargetInfo &STI, raw_ostream &O) { - if (MI->getOpcode() != ARM::t2CLRM) { + if (MI->getOpcode() != ARM::t2CLRM && MI->getOpcode() != ARM::VSCCLRMS) { assert(is_sorted(drop_begin(*MI, OpNum), [&](const MCOperand &LHS, const MCOperand &RHS) { return MRI.getEncodingValue(LHS.getReg()) < diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp index 92427b41f0bb3d02d76ecdadab4d7317830d859b..f24ac799b2ddae135318d7b8b7286434776359a4 100644 --- a/llvm/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp +++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp @@ -1743,15 +1743,28 @@ getRegisterListOpValue(const MCInst &MI, unsigned Op, unsigned Binary = 0; - if (SPRRegs || DPRRegs) { + if (SPRRegs || DPRRegs || Reg == ARM::VPR) { // VLDM/VSTM/VSCCLRM unsigned RegNo = CTX.getRegisterInfo()->getEncodingValue(Reg); unsigned NumRegs = (MI.getNumOperands() - Op) & 0xff; Binary |= (RegNo & 0x1f) << 8; - // Ignore VPR - if (MI.getOpcode() == ARM::VSCCLRMD || MI.getOpcode() == ARM::VSCCLRMS) + if (MI.getOpcode() == ARM::VSCCLRMD) + // Ignore VPR --NumRegs; + else if (MI.getOpcode() == ARM::VSCCLRMS) { + // The register list can contain both S registers and D registers, with D + // registers counting as two registers. VPR doesn't count towards the + // number of registers. + NumRegs = 0; + for (unsigned I = Op, E = MI.getNumOperands(); I < E; ++I) { + Reg = MI.getOperand(I).getReg(); + if (ARMMCRegisterClasses[ARM::SPRRegClassID].contains(Reg)) + NumRegs += 1; + else if (ARMMCRegisterClasses[ARM::DPRRegClassID].contains(Reg)) + NumRegs += 2; + } + } if (SPRRegs) Binary |= NumRegs; else diff --git a/llvm/lib/Target/ARM/MVETailPredication.cpp b/llvm/lib/Target/ARM/MVETailPredication.cpp index 60211db8a61ae383e24af9094b3cd134c468b962..695eafff127093b5c4c7922c5cc30234d5d1fa5c 100644 --- a/llvm/lib/Target/ARM/MVETailPredication.cpp +++ b/llvm/lib/Target/ARM/MVETailPredication.cpp @@ -401,8 +401,7 @@ void MVETailPredication::InsertVCTPIntrinsic(IntrinsicInst *ActiveLaneMask, case 8: VCTPID = Intrinsic::arm_mve_vctp16; break; case 16: VCTPID = Intrinsic::arm_mve_vctp8; break; } - Function *VCTP = Intrinsic::getOrInsertDeclaration(M, VCTPID); - Value *VCTPCall = Builder.CreateCall(VCTP, Processed); + Value *VCTPCall = Builder.CreateIntrinsic(VCTPID, {}, Processed); ActiveLaneMask->replaceAllUsesWith(VCTPCall); // Add the incoming value to the new phi. 
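Several hunks in this patch, including the MVETailPredication change just above, make the same mechanical cleanup: building the intrinsic call in a single step with IRBuilder::CreateIntrinsic instead of materializing a declaration first. A minimal sketch of the before/after shape, using ctpop as a stand-in intrinsic (not one taken from this patch):

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Intrinsics.h"
using namespace llvm;

// Before: fetch (or insert) the declaration, then emit the call.
Value *emitCtpopOld(IRBuilder<> &Builder, Value *V) {
  Module *M = Builder.GetInsertBlock()->getModule();
  Function *Decl =
      Intrinsic::getOrInsertDeclaration(M, Intrinsic::ctpop, {V->getType()});
  return Builder.CreateCall(Decl, {V});
}

// After: one call; the builder resolves the declaration itself.
Value *emitCtpopNew(IRBuilder<> &Builder, Value *V) {
  return Builder.CreateIntrinsic(Intrinsic::ctpop, {V->getType()}, {V});
}

The two forms emit the same call instruction; the single-step form just removes the boilerplate of naming the declaration.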
diff --git a/llvm/lib/Target/ARM/Thumb2SizeReduction.cpp b/llvm/lib/Target/ARM/Thumb2SizeReduction.cpp index f572af98600738562bf8d028d9e3e78185fb17fd..f4a9915a78b99d3eaa6cc2d3f77b3aec6ad55290 100644 --- a/llvm/lib/Target/ARM/Thumb2SizeReduction.cpp +++ b/llvm/lib/Target/ARM/Thumb2SizeReduction.cpp @@ -755,6 +755,9 @@ Thumb2SizeReduce::ReduceTo2Addr(MachineBasicBlock &MBB, MachineInstr *MI, Register Reg1 = MI->getOperand(1).getReg(); // t2MUL is "special". The tied source operand is second, not first. if (MI->getOpcode() == ARM::t2MUL) { + // MULS can be slower than MUL + if (!MinimizeSize && STI->avoidMULS()) + return false; Register Reg2 = MI->getOperand(2).getReg(); // Early exit if the regs aren't all low regs. if (!isARMLowRegister(Reg0) || !isARMLowRegister(Reg1) diff --git a/llvm/lib/Target/AVR/AVRFrameLowering.cpp b/llvm/lib/Target/AVR/AVRFrameLowering.cpp index 64dd0338bf60edc5e3f57f99e46bdfb35b3f04ac..91b0f8c6b2df48f04016ac0ba21298176463bc76 100644 --- a/llvm/lib/Target/AVR/AVRFrameLowering.cpp +++ b/llvm/lib/Target/AVR/AVRFrameLowering.cpp @@ -232,7 +232,7 @@ void AVRFrameLowering::emitEpilogue(MachineFunction &MF, // // Notice that strictly this is not a frame pointer because it contains SP after // frame allocation instead of having the original SP in function entry. -bool AVRFrameLowering::hasFP(const MachineFunction &MF) const { +bool AVRFrameLowering::hasFPImpl(const MachineFunction &MF) const { const AVRMachineFunctionInfo *FuncInfo = MF.getInfo(); return (FuncInfo->getHasSpills() || FuncInfo->getHasAllocas() || diff --git a/llvm/lib/Target/AVR/AVRFrameLowering.h b/llvm/lib/Target/AVR/AVRFrameLowering.h index a550c0efbb8ef7fed6ec5a2a48e5d2d3a0c0357f..7baa5e9d62f60bd5423b4ba5490f90eb39a6133f 100644 --- a/llvm/lib/Target/AVR/AVRFrameLowering.h +++ b/llvm/lib/Target/AVR/AVRFrameLowering.h @@ -21,7 +21,6 @@ public: public: void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override; void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override; - bool hasFP(const MachineFunction &MF) const override; bool spillCalleeSavedRegisters(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, ArrayRef CSI, @@ -38,6 +37,9 @@ public: MachineBasicBlock::iterator eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI) const override; + +protected: + bool hasFPImpl(const MachineFunction &MF) const override; }; } // end namespace llvm diff --git a/llvm/lib/Target/BPF/BPFFrameLowering.cpp b/llvm/lib/Target/BPF/BPFFrameLowering.cpp index 8812cfdd86da43c88117e43ef2d7d920e6b73263..123b99f254234d5a5e1f56a251bf34ad8caeb567 100644 --- a/llvm/lib/Target/BPF/BPFFrameLowering.cpp +++ b/llvm/lib/Target/BPF/BPFFrameLowering.cpp @@ -20,7 +20,9 @@ using namespace llvm; -bool BPFFrameLowering::hasFP(const MachineFunction &MF) const { return true; } +bool BPFFrameLowering::hasFPImpl(const MachineFunction &MF) const { + return true; +} void BPFFrameLowering::emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const {} diff --git a/llvm/lib/Target/BPF/BPFFrameLowering.h b/llvm/lib/Target/BPF/BPFFrameLowering.h index a546351ec6cbbfb4f0d8561eb7e354817a1f75fe..6beffcbe69dd0b275d7bb0072da4fc487fe8c1f1 100644 --- a/llvm/lib/Target/BPF/BPFFrameLowering.h +++ b/llvm/lib/Target/BPF/BPFFrameLowering.h @@ -26,7 +26,6 @@ public: void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override; void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override; - bool hasFP(const MachineFunction &MF) const 
override; void determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs, RegScavenger *RS) const override; @@ -35,6 +34,9 @@ public: MachineBasicBlock::iterator MI) const override { return MBB.erase(MI); } + +protected: + bool hasFPImpl(const MachineFunction &MF) const override; }; } #endif diff --git a/llvm/lib/Target/CSKY/CSKYFrameLowering.cpp b/llvm/lib/Target/CSKY/CSKYFrameLowering.cpp index cedcbff1db24fc5339f68324ecdf25ce80eaed13..c023b5a0de5ad5f6d7d9fcb59af0f6fe8b66728a 100644 --- a/llvm/lib/Target/CSKY/CSKYFrameLowering.cpp +++ b/llvm/lib/Target/CSKY/CSKYFrameLowering.cpp @@ -33,7 +33,7 @@ static Register getFPReg(const CSKYSubtarget &STI) { return CSKY::R8; } // callee saved register to save the value. static Register getBPReg(const CSKYSubtarget &STI) { return CSKY::R7; } -bool CSKYFrameLowering::hasFP(const MachineFunction &MF) const { +bool CSKYFrameLowering::hasFPImpl(const MachineFunction &MF) const { const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo(); const MachineFrameInfo &MFI = MF.getFrameInfo(); diff --git a/llvm/lib/Target/CSKY/CSKYFrameLowering.h b/llvm/lib/Target/CSKY/CSKYFrameLowering.h index 69bf01cf1801e53c40044f54714f75a7b43bece9..0b3b287bb6a55b69eaeeaf9468c82ea77f741a85 100644 --- a/llvm/lib/Target/CSKY/CSKYFrameLowering.h +++ b/llvm/lib/Target/CSKY/CSKYFrameLowering.h @@ -61,7 +61,6 @@ public: MutableArrayRef CSI, const TargetRegisterInfo *TRI) const override; - bool hasFP(const MachineFunction &MF) const override; bool hasBP(const MachineFunction &MF) const; bool hasReservedCallFrame(const MachineFunction &MF) const override; @@ -69,6 +68,9 @@ public: MachineBasicBlock::iterator eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI) const override; + +protected: + bool hasFPImpl(const MachineFunction &MF) const override; }; } // namespace llvm #endif diff --git a/llvm/lib/Target/DirectX/DXIL.td b/llvm/lib/Target/DirectX/DXIL.td index e8f56b18730d7117fd2cecc262210165b5a5b9f8..147b32b1ca990303204835920a189f5de32f34e6 100644 --- a/llvm/lib/Target/DirectX/DXIL.td +++ b/llvm/lib/Target/DirectX/DXIL.td @@ -802,6 +802,16 @@ def WaveIsFirstLane : DXILOp<110, waveIsFirstLane> { let attributes = [Attributes]; } +def WaveReadLaneAt: DXILOp<117, waveReadLaneAt> { + let Doc = "returns the value from the specified lane"; + let LLVMIntrinsic = int_dx_wave_readlane; + let arguments = [OverloadTy, Int32Ty]; + let result = OverloadTy; + let overloads = [Overloads]; + let stages = [Stages]; + let attributes = [Attributes]; +} + def WaveGetLaneIndex : DXILOp<111, waveGetLaneIndex> { let Doc = "returns the index of the current lane in the wave"; let LLVMIntrinsic = int_dx_wave_getlaneindex; diff --git a/llvm/lib/Target/DirectX/DXILFinalizeLinkage.cpp b/llvm/lib/Target/DirectX/DXILFinalizeLinkage.cpp index d315d9bd16f4397476e99223df5c90900b010071..aa1f55c572df29971784f00b2ac9770a654b7e67 100644 --- a/llvm/lib/Target/DirectX/DXILFinalizeLinkage.cpp +++ b/llvm/lib/Target/DirectX/DXILFinalizeLinkage.cpp @@ -1,65 +1,65 @@ -//===- DXILFinalizeLinkage.cpp - Finalize linkage of functions ------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "DXILFinalizeLinkage.h" -#include "DirectX.h" -#include "llvm/Analysis/DXILResource.h" -#include "llvm/IR/Function.h" -#include "llvm/IR/GlobalValue.h" -#include "llvm/IR/Metadata.h" -#include "llvm/IR/Module.h" - -#define DEBUG_TYPE "dxil-finalize-linkage" - -using namespace llvm; - -static bool finalizeLinkage(Module &M) { - SmallPtrSet EntriesAndExports; - - // Find all entry points and export functions - for (Function &EF : M.functions()) { - if (!EF.hasFnAttribute("hlsl.shader") && !EF.hasFnAttribute("hlsl.export")) - continue; - EntriesAndExports.insert(&EF); - } - - for (Function &F : M.functions()) { - if (F.getLinkage() == GlobalValue::ExternalLinkage && - !EntriesAndExports.contains(&F)) { - F.setLinkage(GlobalValue::InternalLinkage); - } - } - - return false; -} - -PreservedAnalyses DXILFinalizeLinkage::run(Module &M, - ModuleAnalysisManager &AM) { - if (finalizeLinkage(M)) - return PreservedAnalyses::none(); - return PreservedAnalyses::all(); -} - -bool DXILFinalizeLinkageLegacy::runOnModule(Module &M) { - return finalizeLinkage(M); -} - -void DXILFinalizeLinkageLegacy::getAnalysisUsage(AnalysisUsage &AU) const { - AU.addPreserved(); -} - -char DXILFinalizeLinkageLegacy::ID = 0; - -INITIALIZE_PASS_BEGIN(DXILFinalizeLinkageLegacy, DEBUG_TYPE, - "DXIL Finalize Linkage", false, false) -INITIALIZE_PASS_END(DXILFinalizeLinkageLegacy, DEBUG_TYPE, - "DXIL Finalize Linkage", false, false) - -ModulePass *llvm::createDXILFinalizeLinkageLegacyPass() { - return new DXILFinalizeLinkageLegacy(); -} +//===- DXILFinalizeLinkage.cpp - Finalize linkage of functions ------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "DXILFinalizeLinkage.h" +#include "DirectX.h" +#include "llvm/Analysis/DXILResource.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/GlobalValue.h" +#include "llvm/IR/Metadata.h" +#include "llvm/IR/Module.h" + +#define DEBUG_TYPE "dxil-finalize-linkage" + +using namespace llvm; + +static bool finalizeLinkage(Module &M) { + SmallPtrSet Funcs; + + // Collect non-entry and non-exported functions to set to internal linkage. 
+ for (Function &EF : M.functions()) { + if (EF.hasFnAttribute("hlsl.shader") || EF.hasFnAttribute("hlsl.export")) + continue; + Funcs.insert(&EF); + } + + for (Function *F : Funcs) { + if (F->getLinkage() == GlobalValue::ExternalLinkage) + F->setLinkage(GlobalValue::InternalLinkage); + if (F->isDefTriviallyDead()) + M.getFunctionList().erase(F); + } + + return false; +} + +PreservedAnalyses DXILFinalizeLinkage::run(Module &M, + ModuleAnalysisManager &AM) { + if (finalizeLinkage(M)) + return PreservedAnalyses::none(); + return PreservedAnalyses::all(); +} + +bool DXILFinalizeLinkageLegacy::runOnModule(Module &M) { + return finalizeLinkage(M); +} + +void DXILFinalizeLinkageLegacy::getAnalysisUsage(AnalysisUsage &AU) const { + AU.addPreserved(); +} + +char DXILFinalizeLinkageLegacy::ID = 0; + +INITIALIZE_PASS_BEGIN(DXILFinalizeLinkageLegacy, DEBUG_TYPE, + "DXIL Finalize Linkage", false, false) +INITIALIZE_PASS_END(DXILFinalizeLinkageLegacy, DEBUG_TYPE, + "DXIL Finalize Linkage", false, false) + +ModulePass *llvm::createDXILFinalizeLinkageLegacyPass() { + return new DXILFinalizeLinkageLegacy(); +} diff --git a/llvm/lib/Target/DirectX/DXILOpLowering.cpp b/llvm/lib/Target/DirectX/DXILOpLowering.cpp index 99df485087207837a2a2961951ee42b07cf0d797..c62ba8c21d67917d729846e6d36d858b9bd37e4d 100644 --- a/llvm/lib/Target/DirectX/DXILOpLowering.cpp +++ b/llvm/lib/Target/DirectX/DXILOpLowering.cpp @@ -134,9 +134,8 @@ public: /// piecemeal way - we can add the casts in to avoid updating all of the uses /// or defs, and by the end all of the casts will be redundant. Value *createTmpHandleCast(Value *V, Type *Ty) { - Function *CastFn = Intrinsic::getOrInsertDeclaration( - &M, Intrinsic::dx_cast_handle, {Ty, V->getType()}); - CallInst *Cast = OpBuilder.getIRB().CreateCall(CastFn, {V}); + CallInst *Cast = OpBuilder.getIRB().CreateIntrinsic( + Intrinsic::dx_cast_handle, {Ty, V->getType()}, {V}); CleanupCasts.push_back(Cast); return Cast; } diff --git a/llvm/lib/Target/DirectX/DirectXFrameLowering.h b/llvm/lib/Target/DirectX/DirectXFrameLowering.h index 76a1450054be816331a018c10a1ed34444b2860c..85823556d555047b57a09f0673ded7e410588bd3 100644 --- a/llvm/lib/Target/DirectX/DirectXFrameLowering.h +++ b/llvm/lib/Target/DirectX/DirectXFrameLowering.h @@ -29,7 +29,8 @@ public: void emitPrologue(MachineFunction &, MachineBasicBlock &) const override {} void emitEpilogue(MachineFunction &, MachineBasicBlock &) const override {} - bool hasFP(const MachineFunction &) const override { return false; } +protected: + bool hasFPImpl(const MachineFunction &) const override { return false; } }; } // namespace llvm #endif // LLVM_DIRECTX_DIRECTXFRAMELOWERING_H diff --git a/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp b/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp index be714b5c87895abb9e3258dce5453308f6b40999..9844fd394aa4c5a7cb44e04a26e51555ffceb177 100644 --- a/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp +++ b/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp @@ -1,35 +1,38 @@ -//===- DirectXTargetTransformInfo.cpp - DirectX TTI ---------------*- C++ -//-*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -/// -//===----------------------------------------------------------------------===// - -#include "DirectXTargetTransformInfo.h" -#include "llvm/IR/Intrinsics.h" -#include "llvm/IR/IntrinsicsDirectX.h" - -using namespace llvm; - -bool DirectXTTIImpl::isTargetIntrinsicWithScalarOpAtArg(Intrinsic::ID ID, - unsigned ScalarOpdIdx) { - switch (ID) { - default: - return false; - } -} - -bool DirectXTTIImpl::isTargetIntrinsicTriviallyScalarizable( - Intrinsic::ID ID) const { - switch (ID) { - case Intrinsic::dx_frac: - case Intrinsic::dx_rsqrt: - return true; - default: - return false; - } -} +//===- DirectXTargetTransformInfo.cpp - DirectX TTI ---------------*- C++ +//-*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +//===----------------------------------------------------------------------===// + +#include "DirectXTargetTransformInfo.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/IntrinsicsDirectX.h" + +using namespace llvm; + +bool DirectXTTIImpl::isTargetIntrinsicWithScalarOpAtArg(Intrinsic::ID ID, + unsigned ScalarOpdIdx) { + switch (ID) { + case Intrinsic::dx_wave_readlane: + return ScalarOpdIdx == 1; + default: + return false; + } +} + +bool DirectXTTIImpl::isTargetIntrinsicTriviallyScalarizable( + Intrinsic::ID ID) const { + switch (ID) { + case Intrinsic::dx_frac: + case Intrinsic::dx_rsqrt: + case Intrinsic::dx_wave_readlane: + return true; + default: + return false; + } +} diff --git a/llvm/lib/Target/Hexagon/HexagonConstPropagation.cpp b/llvm/lib/Target/Hexagon/HexagonConstPropagation.cpp index dae316ccb5e903454a37f4ac70bec21240ae46de..f68444c0b8d462b4d49fd7595e152d224c25c138 100644 --- a/llvm/lib/Target/Hexagon/HexagonConstPropagation.cpp +++ b/llvm/lib/Target/Hexagon/HexagonConstPropagation.cpp @@ -2503,7 +2503,8 @@ APInt HexagonConstEvaluator::getCmpImm(unsigned Opc, unsigned OpX, } uint64_t Val = MO.getImm(); - return APInt(32, Val, Signed); + // TODO: Is implicitTrunc correct here? 
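For context on the line below, a small sketch of what the fourth constructor argument controls (assuming the APInt signature this patch is adapting to, where implicitTrunc=false is expected to verify that the value fits in the requested width):

#include "llvm/ADT/APInt.h"

// With implicitTrunc=true, bits above the requested width are dropped
// silently: 0x100000001 in 32 bits keeps only the low word, i.e. 1.
llvm::APInt truncExample() {
  return llvm::APInt(32, 0x100000001ULL, /*isSigned=*/false,
                     /*implicitTrunc=*/true);
}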
+ return APInt(32, Val, Signed, /*implicitTrunc=*/true); } void HexagonConstEvaluator::replaceWithNop(MachineInstr &MI) { diff --git a/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp b/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp index 7c82f5e9f9a60469f5e08eab13ee36f6c09a6357..48acd9da9587fec0eea57e2288bed0d27f213ea6 100644 --- a/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp +++ b/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp @@ -1144,10 +1144,7 @@ void HexagonFrameLowering::insertCFIInstructionsAt(MachineBasicBlock &MBB, } } -bool HexagonFrameLowering::hasFP(const MachineFunction &MF) const { - if (MF.getFunction().hasFnAttribute(Attribute::Naked)) - return false; - +bool HexagonFrameLowering::hasFPImpl(const MachineFunction &MF) const { auto &MFI = MF.getFrameInfo(); auto &HRI = *MF.getSubtarget().getRegisterInfo(); bool HasExtraAlign = HRI.hasStackRealignment(MF); diff --git a/llvm/lib/Target/Hexagon/HexagonFrameLowering.h b/llvm/lib/Target/Hexagon/HexagonFrameLowering.h index 98e69dcc4b391577504ce90e2871e370ca37cb1f..926aadb01f50e5dbc8c53a8f19cce961388758a3 100644 --- a/llvm/lib/Target/Hexagon/HexagonFrameLowering.h +++ b/llvm/lib/Target/Hexagon/HexagonFrameLowering.h @@ -89,7 +89,6 @@ public: StackOffset getFrameIndexReference(const MachineFunction &MF, int FI, Register &FrameReg) const override; - bool hasFP(const MachineFunction &MF) const override; const SpillSlot *getCalleeSavedSpillSlots(unsigned &NumEntries) const override { @@ -114,6 +113,9 @@ public: void insertCFIInstructions(MachineFunction &MF) const; +protected: + bool hasFPImpl(const MachineFunction &MF) const override; + private: using CSIVect = std::vector; diff --git a/llvm/lib/Target/Hexagon/HexagonGenExtract.cpp b/llvm/lib/Target/Hexagon/HexagonGenExtract.cpp index 65bbb1364488f757dfbd980ccf0b14bcbe906fa7..b44519a1286d070088520b0f19f30c667e9097a5 100644 --- a/llvm/lib/Target/Hexagon/HexagonGenExtract.cpp +++ b/llvm/lib/Target/Hexagon/HexagonGenExtract.cpp @@ -171,7 +171,7 @@ bool HexagonGenExtract::convert(Instruction *In) { // this value. if (!LogicalSR && (SR > SL)) return false; - APInt A = APInt(BW, ~0ULL).lshr(SR).shl(SL); + APInt A = APInt(BW, ~0ULL, true).lshr(SR).shl(SL); CM = ConstantInt::get(Ctx, A); } @@ -211,9 +211,8 @@ bool HexagonGenExtract::convert(Instruction *In) { IRBuilder<> IRB(In); Intrinsic::ID IntId = (BW == 32) ? 
Intrinsic::hexagon_S2_extractu : Intrinsic::hexagon_S2_extractup; - Module *Mod = BB->getParent()->getParent(); - Function *ExtF = Intrinsic::getOrInsertDeclaration(Mod, IntId); - Value *NewIn = IRB.CreateCall(ExtF, {BF, IRB.getInt32(W), IRB.getInt32(SR)}); + Value *NewIn = + IRB.CreateIntrinsic(IntId, {}, {BF, IRB.getInt32(W), IRB.getInt32(SR)}); if (SL != 0) NewIn = IRB.CreateShl(NewIn, SL, CSL->getName()); In->replaceAllUsesWith(NewIn); diff --git a/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp b/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp index 03c12f5ce44707d341fcbf42f35f85459004ce0b..ab9bc559367787aeef5a40406682832f80c62ef2 100644 --- a/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp +++ b/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp @@ -3859,15 +3859,13 @@ void HexagonTargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI, Value *HexagonTargetLowering::emitLoadLinked(IRBuilderBase &Builder, Type *ValueTy, Value *Addr, AtomicOrdering Ord) const { - BasicBlock *BB = Builder.GetInsertBlock(); - Module *M = BB->getParent()->getParent(); unsigned SZ = ValueTy->getPrimitiveSizeInBits(); assert((SZ == 32 || SZ == 64) && "Only 32/64-bit atomic loads supported"); Intrinsic::ID IntID = (SZ == 32) ? Intrinsic::hexagon_L2_loadw_locked : Intrinsic::hexagon_L4_loadd_locked; - Function *Fn = Intrinsic::getOrInsertDeclaration(M, IntID); - Value *Call = Builder.CreateCall(Fn, Addr, "larx"); + Value *Call = + Builder.CreateIntrinsic(IntID, {}, Addr, /*FMFSource=*/nullptr, "larx"); return Builder.CreateBitCast(Call, ValueTy); } @@ -3886,11 +3884,11 @@ Value *HexagonTargetLowering::emitStoreConditional(IRBuilderBase &Builder, assert((SZ == 32 || SZ == 64) && "Only 32/64-bit atomic stores supported"); Intrinsic::ID IntID = (SZ == 32) ? Intrinsic::hexagon_S2_storew_locked : Intrinsic::hexagon_S4_stored_locked; - Function *Fn = Intrinsic::getOrInsertDeclaration(M, IntID); Val = Builder.CreateBitCast(Val, CastTy); - Value *Call = Builder.CreateCall(Fn, {Addr, Val}, "stcx"); + Value *Call = Builder.CreateIntrinsic(IntID, {}, {Addr, Val}, + /*FMFSource=*/nullptr, "stcx"); Value *Cmp = Builder.CreateICmpEQ(Call, Builder.getInt32(0), ""); Value *Ext = Builder.CreateZExt(Cmp, Type::getInt32Ty(M->getContext())); return Ext; diff --git a/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp b/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp index d2cfd3851e711debb2f321bbd0689d83142b8d9c..ce933108b83b12a24648c7adfa56b12daf664bc5 100644 --- a/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp +++ b/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp @@ -2390,9 +2390,9 @@ auto HexagonVectorCombine::vralignb(IRBuilderBase &Builder, Value *Lo, Type *Int64Ty = Type::getInt64Ty(F.getContext()); Value *Lo64 = Builder.CreateBitCast(Lo, Int64Ty, "cst"); Value *Hi64 = Builder.CreateBitCast(Hi, Int64Ty, "cst"); - Function *FI = Intrinsic::getOrInsertDeclaration( - F.getParent(), Intrinsic::hexagon_S2_valignrb); - Value *Call = Builder.CreateCall(FI, {Hi64, Lo64, Amt}, "cup"); + Value *Call = Builder.CreateIntrinsic(Intrinsic::hexagon_S2_valignrb, {}, + {Hi64, Lo64, Amt}, + /*FMFSource=*/nullptr, "cup"); return Builder.CreateBitCast(Call, Lo->getType(), "cst"); } llvm_unreachable("Unexpected vector length"); @@ -2587,9 +2587,8 @@ auto HexagonVectorCombine::createHvxIntrinsic(IRBuilderBase &Builder, unsigned HwLen = HST.getVectorLength(); Intrinsic::ID TC = HwLen == 64 ? 
Intrinsic::hexagon_V6_pred_typecast : Intrinsic::hexagon_V6_pred_typecast_128B; - Function *FI = Intrinsic::getOrInsertDeclaration(F.getParent(), TC, - {DestTy, Val->getType()}); - return Builder.CreateCall(FI, {Val}, "cup"); + return Builder.CreateIntrinsic(TC, {DestTy, Val->getType()}, {Val}, + /*FMFSource=*/nullptr, "cup"); }; Function *IntrFn = diff --git a/llvm/lib/Target/Lanai/LanaiFrameLowering.h b/llvm/lib/Target/Lanai/LanaiFrameLowering.h index 380d63df7301ef53721402a3a3659fd58020db45..9bd78d008f77e3c87b04a2191e9bda2935b56649 100644 --- a/llvm/lib/Target/Lanai/LanaiFrameLowering.h +++ b/llvm/lib/Target/Lanai/LanaiFrameLowering.h @@ -44,10 +44,11 @@ public: eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const override; - bool hasFP(const MachineFunction & /*MF*/) const override { return true; } - void determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs, RegScavenger *RS = nullptr) const override; + +protected: + bool hasFPImpl(const MachineFunction & /*MF*/) const override { return true; } }; } // namespace llvm diff --git a/llvm/lib/Target/LoongArch/LoongArchAsmPrinter.cpp b/llvm/lib/Target/LoongArch/LoongArchAsmPrinter.cpp index d643017b90db1ee77e4c5b4a85576a4431563873..89f8978e6873842e880550114697b9a06cc96015 100644 --- a/llvm/lib/Target/LoongArch/LoongArchAsmPrinter.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchAsmPrinter.cpp @@ -13,18 +13,30 @@ #include "LoongArchAsmPrinter.h" #include "LoongArch.h" +#include "LoongArchInstrInfo.h" +#include "LoongArchMachineFunctionInfo.h" #include "LoongArchTargetMachine.h" #include "MCTargetDesc/LoongArchInstPrinter.h" +#include "MCTargetDesc/LoongArchMCTargetDesc.h" #include "TargetInfo/LoongArchTargetInfo.h" #include "llvm/CodeGen/AsmPrinter.h" +#include "llvm/CodeGen/MachineJumpTableInfo.h" +#include "llvm/CodeGen/MachineModuleInfoImpls.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCInstBuilder.h" +#include "llvm/MC/MCSectionELF.h" #include "llvm/MC/TargetRegistry.h" using namespace llvm; #define DEBUG_TYPE "loongarch-asm-printer" +cl::opt LArchAnnotateTableJump( + "loongarch-annotate-tablejump", cl::Hidden, + cl::desc( + "Annotate table jump instruction to correlate it with the jump table."), + cl::init(false)); + // Simple pseudo-instructions have their lowering (with expansion to real // instructions) auto-generated. #include "LoongArchGenMCPseudoLowering.inc" @@ -238,6 +250,39 @@ void LoongArchAsmPrinter::emitSled(const MachineInstr &MI, SledKind Kind) { recordSled(BeginOfSled, MI, Kind, 2); } +void LoongArchAsmPrinter::emitJumpTableInfo() { + AsmPrinter::emitJumpTableInfo(); + + if (!LArchAnnotateTableJump) + return; + + assert(TM.getTargetTriple().isOSBinFormatELF()); + + unsigned Size = getDataLayout().getPointerSize(); + auto *LAFI = MF->getInfo(); + unsigned EntrySize = LAFI->getJumpInfoSize(); + + if (0 == EntrySize) + return; + + // Emit an additional section to store the correlation info as pairs of + // addresses, each pair contains the address of a jump instruction (jr) and + // the address of the jump table. 
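A hedged sketch of what the loop below emits (symbol names are illustrative; on LA64 each entry would be pointer-sized, shown here as C++ comments):

// .section .discard.tablejump_annotate
//   .dword .Ljrtb_0   // address of the annotated jr instruction
//   .dword .LJTI0_0   // address of the jump table it indexes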
+ OutStreamer->switchSection(MMI->getContext().getELFSection( ".discard.tablejump_annotate", ELF::SHT_PROGBITS, 0)); + + for (unsigned Idx = 0; Idx < EntrySize; ++Idx) { + OutStreamer->emitValue( + MCSymbolRefExpr::create(LAFI->getJumpInfoJrMI(Idx)->getPreInstrSymbol(), + OutContext), + Size); + OutStreamer->emitValue( + MCSymbolRefExpr::create( + GetJTISymbol(LAFI->getJumpInfoJTIMO(Idx)->getIndex()), OutContext), + Size); + } +} + bool LoongArchAsmPrinter::runOnMachineFunction(MachineFunction &MF) { AsmPrinter::runOnMachineFunction(MF); // Emit the XRay table for this function. diff --git a/llvm/lib/Target/LoongArch/LoongArchAsmPrinter.h b/llvm/lib/Target/LoongArch/LoongArchAsmPrinter.h index fc12f1079490bd5c0d25c98a2211935ce5f01cd3..312631e4f0dc55955a423f66b24b4f8ce877ac58 100644 --- a/llvm/lib/Target/LoongArch/LoongArchAsmPrinter.h +++ b/llvm/lib/Target/LoongArch/LoongArchAsmPrinter.h @@ -55,6 +55,7 @@ public: bool lowerOperand(const MachineOperand &MO, MCOperand &MCOp) const { return lowerLoongArchMachineOperandToMCOperand(MO, MCOp, *this); } + void emitJumpTableInfo() override; }; } // end namespace llvm diff --git a/llvm/lib/Target/LoongArch/LoongArchExpandPseudoInsts.cpp b/llvm/lib/Target/LoongArch/LoongArchExpandPseudoInsts.cpp index 33b93e42bb5c47f11291a1c2a2d522b9dc4f6824..4a10f2e282d13327aec7253a74b1cf371bb0abd6 100644 --- a/llvm/lib/Target/LoongArch/LoongArchExpandPseudoInsts.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchExpandPseudoInsts.cpp @@ -13,6 +13,7 @@ #include "LoongArch.h" #include "LoongArchInstrInfo.h" +#include "LoongArchMachineFunctionInfo.h" #include "LoongArchTargetMachine.h" #include "MCTargetDesc/LoongArchBaseInfo.h" #include "MCTargetDesc/LoongArchMCTargetDesc.h" @@ -27,6 +28,8 @@ using namespace llvm; +extern cl::opt<bool> LArchAnnotateTableJump; + #define LOONGARCH_PRERA_EXPAND_PSEUDO_NAME \ "LoongArch Pre-RA pseudo instruction expansion pass" #define LOONGARCH_EXPAND_PSEUDO_NAME \ @@ -103,6 +106,8 @@ private: MachineBasicBlock::iterator MBBI, MachineBasicBlock::iterator &NextMBBI, bool IsTailCall); + void annotateTableJump(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI); }; char LoongArchPreRAExpandPseudo::ID = 0; @@ -167,6 +172,12 @@ bool LoongArchPreRAExpandPseudo::expandMI( case LoongArch::PseudoTAIL_MEDIUM: case LoongArch::PseudoTAIL_LARGE: return expandFunctionCALL(MBB, MBBI, NextMBBI, /*IsTailCall=*/true); + case LoongArch::PseudoBRIND: + // If the PseudoBRIND is used for a table jump, emit a label to annotate + // the `jr` instruction and record it.
+ if (LArchAnnotateTableJump) + annotateTableJump(MBB, MBBI); + break; } return false; } @@ -601,6 +612,44 @@ bool LoongArchPreRAExpandPseudo::expandFunctionCALL( return true; } +void LoongArchPreRAExpandPseudo::annotateTableJump( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) { + MachineFunction *MF = MBB.getParent(); + MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + + bool IsFound = false; + + std::function FindJTIMI = [&](MachineInstr *MInst, + int FindDepth) { + if (FindDepth < 0) + return; + for (auto &MO : MInst->all_uses()) { + if (IsFound) + return; + Register Reg = MO.getReg(); + if (!Reg.isVirtual()) + continue; + MachineInstr *DefMI = MRI.getVRegDef(Reg); + if (!DefMI) + continue; + for (unsigned Idx = 0; Idx < DefMI->getNumOperands(); ++Idx) { + MachineOperand &MO = DefMI->getOperand(Idx); + if (MO.isJTI()) { + MBBI->setPreInstrSymbol( + *MF, MF->getContext().createNamedTempSymbol("jrtb_")); + MF->getInfo()->setJumpInfo(&*MBBI, &MO); + IsFound = true; + return; + } + } + FindJTIMI(DefMI, --FindDepth); + } + }; + + // FindDepth = 3, probably sufficient. + FindJTIMI(&*MBBI, /*FindDepth=*/3); +} + class LoongArchExpandPseudo : public MachineFunctionPass { public: const LoongArchInstrInfo *TII; diff --git a/llvm/lib/Target/LoongArch/LoongArchFloat32InstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchFloat32InstrInfo.td index d6a83c0c8cd8fbd4304736ad0e027bb5c49e6d6c..65802d660432d9176ed8523d4b1958b7e076980a 100644 --- a/llvm/lib/Target/LoongArch/LoongArchFloat32InstrInfo.td +++ b/llvm/lib/Target/LoongArch/LoongArchFloat32InstrInfo.td @@ -19,12 +19,16 @@ def SDT_LoongArchMOVGR2FR_W_LA64 def SDT_LoongArchMOVFR2GR_S_LA64 : SDTypeProfile<1, 1, [SDTCisVT<0, i64>, SDTCisVT<1, f32>]>; def SDT_LoongArchFTINT : SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisFP<1>]>; +def SDT_LoongArchFRECIPE : SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisFP<1>]>; +def SDT_LoongArchFRSQRTE : SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisFP<1>]>; def loongarch_movgr2fr_w_la64 : SDNode<"LoongArchISD::MOVGR2FR_W_LA64", SDT_LoongArchMOVGR2FR_W_LA64>; def loongarch_movfr2gr_s_la64 : SDNode<"LoongArchISD::MOVFR2GR_S_LA64", SDT_LoongArchMOVFR2GR_S_LA64>; def loongarch_ftint : SDNode<"LoongArchISD::FTINT", SDT_LoongArchFTINT>; +def loongarch_frecipe : SDNode<"LoongArchISD::FRECIPE", SDT_LoongArchFRECIPE>; +def loongarch_frsqrte : SDNode<"LoongArchISD::FRSQRTE", SDT_LoongArchFRSQRTE>; //===----------------------------------------------------------------------===// // Instructions @@ -286,6 +290,8 @@ let Predicates = [HasFrecipe] in { // FP approximate reciprocal operation def : Pat<(int_loongarch_frecipe_s FPR32:$src), (FRECIPE_S FPR32:$src)>; def : Pat<(int_loongarch_frsqrte_s FPR32:$src), (FRSQRTE_S FPR32:$src)>; +def : Pat<(loongarch_frecipe FPR32:$src), (FRECIPE_S FPR32:$src)>; +def : Pat<(loongarch_frsqrte FPR32:$src), (FRSQRTE_S FPR32:$src)>; } // fmadd.s: fj * fk + fa diff --git a/llvm/lib/Target/LoongArch/LoongArchFloat64InstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchFloat64InstrInfo.td index 30cce8439640f180ba334291540d996b1aba92d2..b98025643903af970caa9b64260807f71a5308bb 100644 --- a/llvm/lib/Target/LoongArch/LoongArchFloat64InstrInfo.td +++ b/llvm/lib/Target/LoongArch/LoongArchFloat64InstrInfo.td @@ -253,6 +253,8 @@ let Predicates = [HasFrecipe] in { // FP approximate reciprocal operation def : Pat<(int_loongarch_frecipe_d FPR64:$src), (FRECIPE_D FPR64:$src)>; def : Pat<(int_loongarch_frsqrte_d FPR64:$src), (FRSQRTE_D FPR64:$src)>; +def : Pat<(loongarch_frecipe FPR64:$src), (FRECIPE_D FPR64:$src)>; 
+def : Pat<(loongarch_frsqrte FPR64:$src), (FRSQRTE_D FPR64:$src)>; } // fmadd.d: fj * fk + fa diff --git a/llvm/lib/Target/LoongArch/LoongArchFrameLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchFrameLowering.cpp index 4e504729b23e2d8f7f6510b335899fb3e6586835..1a787c63c6241b8ef1ff0cb043f0774aa1c7c990 100644 --- a/llvm/lib/Target/LoongArch/LoongArchFrameLowering.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchFrameLowering.cpp @@ -31,7 +31,7 @@ using namespace llvm; // pointer register. This is true if frame pointer elimination is // disabled, if it needs dynamic stack realignment, if the function has // variable sized allocas, or if the frame address is taken. -bool LoongArchFrameLowering::hasFP(const MachineFunction &MF) const { +bool LoongArchFrameLowering::hasFPImpl(const MachineFunction &MF) const { const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo(); const MachineFrameInfo &MFI = MF.getFrameInfo(); diff --git a/llvm/lib/Target/LoongArch/LoongArchFrameLowering.h b/llvm/lib/Target/LoongArch/LoongArchFrameLowering.h index bc2ac02c91f814700a978cbf442a1f933be32411..6cbfcf665f6a93844212383e0275b074cdf86822 100644 --- a/llvm/lib/Target/LoongArch/LoongArchFrameLowering.h +++ b/llvm/lib/Target/LoongArch/LoongArchFrameLowering.h @@ -49,13 +49,15 @@ public: StackOffset getFrameIndexReference(const MachineFunction &MF, int FI, Register &FrameReg) const override; - bool hasFP(const MachineFunction &MF) const override; bool hasBP(const MachineFunction &MF) const; uint64_t getFirstSPAdjustAmount(const MachineFunction &MF) const; bool enableShrinkWrapping(const MachineFunction &MF) const override; +protected: + bool hasFPImpl(const MachineFunction &MF) const override; + private: void determineFrameLayout(MachineFunction &MF) const; void adjustReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp index 8edca34624e9b2dac4b560a2cc028493e5192bc0..676d43ef22c47b806488fbce9eb1f21cfda6e812 100644 --- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp @@ -4697,6 +4697,8 @@ const char *LoongArchTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(VANY_ZERO) NODE_NAME_CASE(VALL_NONZERO) NODE_NAME_CASE(VANY_NONZERO) + NODE_NAME_CASE(FRECIPE) + NODE_NAME_CASE(FRSQRTE) } #undef NODE_NAME_CASE return nullptr; @@ -5807,10 +5809,8 @@ Value *LoongArchTargetLowering::emitMaskedAtomicCmpXchgIntrinsic( NewVal = Builder.CreateSExt(NewVal, Builder.getInt64Ty()); Mask = Builder.CreateSExt(Mask, Builder.getInt64Ty()); Type *Tys[] = {AlignedAddr->getType()}; - Function *MaskedCmpXchg = - Intrinsic::getOrInsertDeclaration(CI->getModule(), CmpXchgIntrID, Tys); - Value *Result = Builder.CreateCall( - MaskedCmpXchg, {AlignedAddr, CmpVal, NewVal, Mask, FailureOrdering}); + Value *Result = Builder.CreateIntrinsic( + CmpXchgIntrID, Tys, {AlignedAddr, CmpVal, NewVal, Mask, FailureOrdering}); Result = Builder.CreateTrunc(Result, Builder.getInt32Ty()); return Result; } @@ -5902,6 +5902,71 @@ Register LoongArchTargetLowering::getExceptionSelectorRegister( return LoongArch::R5; } +//===----------------------------------------------------------------------===// +// Target Optimization Hooks +//===----------------------------------------------------------------------===// + +static int getEstimateRefinementSteps(EVT VT, + const LoongArchSubtarget &Subtarget) { + // Each Newton-Raphson refinement step roughly doubles the number of + // accurate bits, so one step suffices for f32 and two for f64. + // The FRECIPE instructions' relative accuracy is 
2^-14. + // IEEE float has 23 digits and double has 52 digits. + int RefinementSteps = VT.getScalarType() == MVT::f64 ? 2 : 1; + return RefinementSteps; +} + +SDValue LoongArchTargetLowering::getSqrtEstimate(SDValue Operand, + SelectionDAG &DAG, int Enabled, + int &RefinementSteps, + bool &UseOneConstNR, + bool Reciprocal) const { + if (Subtarget.hasFrecipe()) { + SDLoc DL(Operand); + EVT VT = Operand.getValueType(); + + if (VT == MVT::f32 || (VT == MVT::f64 && Subtarget.hasBasicD()) || + (VT == MVT::v4f32 && Subtarget.hasExtLSX()) || + (VT == MVT::v2f64 && Subtarget.hasExtLSX()) || + (VT == MVT::v8f32 && Subtarget.hasExtLASX()) || + (VT == MVT::v4f64 && Subtarget.hasExtLASX())) { + + if (RefinementSteps == ReciprocalEstimate::Unspecified) + RefinementSteps = getEstimateRefinementSteps(VT, Subtarget); + + SDValue Estimate = DAG.getNode(LoongArchISD::FRSQRTE, DL, VT, Operand); + if (Reciprocal) + Estimate = DAG.getNode(ISD::FMUL, DL, VT, Operand, Estimate); + + return Estimate; + } + } + + return SDValue(); +} + +SDValue LoongArchTargetLowering::getRecipEstimate(SDValue Operand, + SelectionDAG &DAG, + int Enabled, + int &RefinementSteps) const { + if (Subtarget.hasFrecipe()) { + SDLoc DL(Operand); + EVT VT = Operand.getValueType(); + + if (VT == MVT::f32 || (VT == MVT::f64 && Subtarget.hasBasicD()) || + (VT == MVT::v4f32 && Subtarget.hasExtLSX()) || + (VT == MVT::v2f64 && Subtarget.hasExtLSX()) || + (VT == MVT::v8f32 && Subtarget.hasExtLASX()) || + (VT == MVT::v4f64 && Subtarget.hasExtLASX())) { + + if (RefinementSteps == ReciprocalEstimate::Unspecified) + RefinementSteps = getEstimateRefinementSteps(VT, Subtarget); + + return DAG.getNode(LoongArchISD::FRECIPE, DL, VT, Operand); + } + } + + return SDValue(); +} + //===----------------------------------------------------------------------===// // LoongArch Inline Assembly Support //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h index 6177884bd19501701e8d2dd3c76a59796aa9f96f..df6a55a2b83190599edf2f96e5a38f21400472ab 100644 --- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h +++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h @@ -141,6 +141,10 @@ enum NodeType : unsigned { VALL_NONZERO, VANY_NONZERO, + // Floating point approximate reciprocal operation + FRECIPE, + FRSQRTE + // Intrinsic operations end ============================================= }; } // end namespace LoongArchISD @@ -216,6 +220,17 @@ public: Register getExceptionSelectorRegister(const Constant *PersonalityFn) const override; + bool isFsqrtCheap(SDValue Operand, SelectionDAG &DAG) const override { + return true; + } + + SDValue getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled, + int &RefinementSteps, bool &UseOneConstNR, + bool Reciprocal) const override; + + SDValue getRecipEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled, + int &RefinementSteps) const override; + ISD::NodeType getExtendForAtomicOps() const override { return ISD::SIGN_EXTEND; } diff --git a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td index dd7e5713e45fe9dfc74dd806dd3d87c5d3bfa6b9..d13cc9af135b57d9839bab62c7683b0334fe4c8f 100644 --- a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td +++ b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td @@ -10,6 +10,7 @@ // //===----------------------------------------------------------------------===// +// Target nodes. 
def loongarch_xvpermi: SDNode<"LoongArchISD::XVPERMI", SDT_LoongArchV1RUimm>; def lasxsplati8 @@ -2094,6 +2095,15 @@ foreach Inst = ["XVFRECIPE_S", "XVFRSQRTE_S"] in foreach Inst = ["XVFRECIPE_D", "XVFRSQRTE_D"] in def : Pat<(deriveLASXIntrinsic.ret (v4f64 LASX256:$xj)), (!cast(Inst) LASX256:$xj)>; + +def : Pat<(loongarch_vfrecipe v8f32:$src), + (XVFRECIPE_S v8f32:$src)>; +def : Pat<(loongarch_vfrecipe v4f64:$src), + (XVFRECIPE_D v4f64:$src)>; +def : Pat<(loongarch_vfrsqrte v8f32:$src), + (XVFRSQRTE_S v8f32:$src)>; +def : Pat<(loongarch_vfrsqrte v4f64:$src), + (XVFRSQRTE_D v4f64:$src)>; } def : Pat<(int_loongarch_lasx_xvpickve_w_f v8f32:$xj, timm:$imm), diff --git a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td index e7ac9f3bd04cbfe3cc3f9ae8a7e4157861a048c7..86aa6dcfd8261f517fc22334183086f88af7c251 100644 --- a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td +++ b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td @@ -23,6 +23,8 @@ def SDT_LoongArchV2R : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>]>; def SDT_LoongArchV1RUimm: SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, SDTCisVT<2, i64>]>; +def SDT_LoongArchVFRECIPE : SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisVec<0>, SDTCisSameAs<0, 1>]>; +def SDT_LoongArchVFRSQRTE : SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisVec<0>, SDTCisSameAs<0, 1>]>; // Target nodes. def loongarch_vreplve : SDNode<"LoongArchISD::VREPLVE", SDT_LoongArchVreplve>; @@ -50,6 +52,8 @@ def loongarch_vilvh: SDNode<"LoongArchISD::VILVH", SDT_LoongArchV2R>; def loongarch_vshuf4i: SDNode<"LoongArchISD::VSHUF4I", SDT_LoongArchV1RUimm>; def loongarch_vreplvei: SDNode<"LoongArchISD::VREPLVEI", SDT_LoongArchV1RUimm>; +def loongarch_vfrecipe: SDNode<"LoongArchISD::FRECIPE", SDT_LoongArchVFRECIPE>; +def loongarch_vfrsqrte: SDNode<"LoongArchISD::FRSQRTE", SDT_LoongArchVFRSQRTE>; def immZExt1 : ImmLeaf(Imm);}]>; def immZExt2 : ImmLeaf(Imm);}]>; @@ -2238,6 +2242,15 @@ foreach Inst = ["VFRECIPE_S", "VFRSQRTE_S"] in foreach Inst = ["VFRECIPE_D", "VFRSQRTE_D"] in def : Pat<(deriveLSXIntrinsic.ret (v2f64 LSX128:$vj)), (!cast(Inst) LSX128:$vj)>; + +def : Pat<(loongarch_vfrecipe v4f32:$src), + (VFRECIPE_S v4f32:$src)>; +def : Pat<(loongarch_vfrecipe v2f64:$src), + (VFRECIPE_D v2f64:$src)>; +def : Pat<(loongarch_vfrsqrte v4f32:$src), + (VFRSQRTE_S v4f32:$src)>; +def : Pat<(loongarch_vfrsqrte v2f64:$src), + (VFRSQRTE_D v2f64:$src)>; } // load diff --git a/llvm/lib/Target/LoongArch/LoongArchMachineFunctionInfo.h b/llvm/lib/Target/LoongArch/LoongArchMachineFunctionInfo.h index a7366a5dba0412578bbe4bd4ba4b40d21e228326..daa47c4dc7e322fd1fdbb3d6813e9654812f8ae1 100644 --- a/llvm/lib/Target/LoongArch/LoongArchMachineFunctionInfo.h +++ b/llvm/lib/Target/LoongArch/LoongArchMachineFunctionInfo.h @@ -39,6 +39,10 @@ private: /// Registers that have been sign extended from i32. SmallVector SExt32Registers; + /// Pairs of `jr` instructions and corresponding JTI operands, used for the + /// `annotate-tablejump` option. 
+ SmallVector, 4> JumpInfos; + public: LoongArchMachineFunctionInfo(const Function &F, const TargetSubtargetInfo *STI) {} @@ -71,6 +75,15 @@ public: bool isSExt32Register(Register Reg) const { return is_contained(SExt32Registers, Reg); } + + void setJumpInfo(MachineInstr *JrMI, MachineOperand *JTIMO) { + JumpInfos.push_back(std::make_pair(JrMI, JTIMO)); + } + unsigned getJumpInfoSize() { return JumpInfos.size(); } + MachineInstr *getJumpInfoJrMI(unsigned Idx) { return JumpInfos[Idx].first; } + MachineOperand *getJumpInfoJTIMO(unsigned Idx) { + return JumpInfos[Idx].second; + } }; } // end namespace llvm diff --git a/llvm/lib/Target/M68k/M68kFrameLowering.cpp b/llvm/lib/Target/M68k/M68kFrameLowering.cpp index 1445bac0b92e855ee261b0fc32b477ea8d88cdac..4245061f0ae749de66f90b6692b4b9fd14a353d5 100644 --- a/llvm/lib/Target/M68k/M68kFrameLowering.cpp +++ b/llvm/lib/Target/M68k/M68kFrameLowering.cpp @@ -40,7 +40,7 @@ M68kFrameLowering::M68kFrameLowering(const M68kSubtarget &STI, Align Alignment) StackPtr = TRI->getStackRegister(); } -bool M68kFrameLowering::hasFP(const MachineFunction &MF) const { +bool M68kFrameLowering::hasFPImpl(const MachineFunction &MF) const { const MachineFrameInfo &MFI = MF.getFrameInfo(); const TargetRegisterInfo *TRI = STI.getRegisterInfo(); diff --git a/llvm/lib/Target/M68k/M68kFrameLowering.h b/llvm/lib/Target/M68k/M68kFrameLowering.h index a5349377232eb672e3a8c696c9b8ae22d6b99651..ed2bfb605ff13a225b7f98a61e0b86508b03e0bb 100644 --- a/llvm/lib/Target/M68k/M68kFrameLowering.h +++ b/llvm/lib/Target/M68k/M68kFrameLowering.h @@ -121,12 +121,6 @@ public: MutableArrayRef CSI, const TargetRegisterInfo *TRI) const override; - /// Return true if the specified function should have a dedicated frame - /// pointer register. This is true if the function has variable sized - /// allocas, if it needs dynamic stack realignment, if frame pointer - /// elimination is disabled, or if the frame address is taken. - bool hasFP(const MachineFunction &MF) const override; - /// Under normal circumstances, when a frame pointer is not required, we /// reserve argument space for call sites in the function immediately on /// entry to the current function. This eliminates the need for add/sub sp @@ -166,6 +160,13 @@ public: /// pointer by a constant value. void emitSPUpdate(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI, int64_t NumBytes, bool InEpilogue) const; + +protected: + /// Return true if the specified function should have a dedicated frame + /// pointer register. This is true if the function has variable sized + /// allocas, if it needs dynamic stack realignment, if frame pointer + /// elimination is disabled, or if the frame address is taken. 
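The hasFP-to-hasFPImpl moves across these target files all follow one interface change; a minimal sketch of the resulting shape, assuming the base class simply funnels the public query through a protected virtual hook (the in-tree TargetFrameLowering carries more logic than this, which is also where shared early-outs like the Naked-function check removed from the Hexagon override above can live):

namespace llvm {
class MachineFunction;

class FrameLoweringSketch {
public:
  // Public, non-virtual entry point: callers keep using hasFP().
  bool hasFP(const MachineFunction &MF) const { return hasFPImpl(MF); }
  virtual ~FrameLoweringSketch() = default;

protected:
  // Targets override only this hook.
  virtual bool hasFPImpl(const MachineFunction &MF) const = 0;
};
} // namespace llvm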
+ bool hasFPImpl(const MachineFunction &MF) const override; }; } // namespace llvm diff --git a/llvm/lib/Target/MSP430/MSP430FrameLowering.cpp b/llvm/lib/Target/MSP430/MSP430FrameLowering.cpp index d0dc6dd146efdb95414fecc2bc309fe55b572ca3..045dedfb38538528e329e72a0635f53ab00f8f2f 100644 --- a/llvm/lib/Target/MSP430/MSP430FrameLowering.cpp +++ b/llvm/lib/Target/MSP430/MSP430FrameLowering.cpp @@ -30,7 +30,7 @@ MSP430FrameLowering::MSP430FrameLowering(const MSP430Subtarget &STI) Align(2)), STI(STI), TII(*STI.getInstrInfo()), TRI(STI.getRegisterInfo()) {} -bool MSP430FrameLowering::hasFP(const MachineFunction &MF) const { +bool MSP430FrameLowering::hasFPImpl(const MachineFunction &MF) const { const MachineFrameInfo &MFI = MF.getFrameInfo(); return (MF.getTarget().Options.DisableFramePointerElim(MF) || diff --git a/llvm/lib/Target/MSP430/MSP430FrameLowering.h b/llvm/lib/Target/MSP430/MSP430FrameLowering.h index 5227d3e731edb3f9648044d9e81dc7834eee763d..daa4eec998ee87bfa5e98dc0bf0d46233730f48c 100644 --- a/llvm/lib/Target/MSP430/MSP430FrameLowering.h +++ b/llvm/lib/Target/MSP430/MSP430FrameLowering.h @@ -24,6 +24,7 @@ class MSP430RegisterInfo; class MSP430FrameLowering : public TargetFrameLowering { protected: + bool hasFPImpl(const MachineFunction &MF) const override; public: MSP430FrameLowering(const MSP430Subtarget &STI); @@ -51,7 +52,6 @@ public: MutableArrayRef CSI, const TargetRegisterInfo *TRI) const override; - bool hasFP(const MachineFunction &MF) const override; bool hasReservedCallFrame(const MachineFunction &MF) const override; void processFunctionBeforeFrameFinalized(MachineFunction &MF, RegScavenger *RS = nullptr) const override; diff --git a/llvm/lib/Target/Mips/MipsFrameLowering.cpp b/llvm/lib/Target/Mips/MipsFrameLowering.cpp index 99d225f9abfe892437c0088746bbeaa56bd42983..9b3edcd61ae1e2857db159126595b8a6f1c77d4d 100644 --- a/llvm/lib/Target/Mips/MipsFrameLowering.cpp +++ b/llvm/lib/Target/Mips/MipsFrameLowering.cpp @@ -86,11 +86,11 @@ const MipsFrameLowering *MipsFrameLowering::create(const MipsSubtarget &ST) { return llvm::createMipsSEFrameLowering(ST); } -// hasFP - Return true if the specified function should have a dedicated frame -// pointer register. This is true if the function has variable sized allocas, -// if it needs dynamic stack realignment, if frame pointer elimination is -// disabled, or if the frame address is taken. -bool MipsFrameLowering::hasFP(const MachineFunction &MF) const { +// hasFPImpl - Return true if the specified function should have a dedicated +// frame pointer register. This is true if the function has variable sized +// allocas, if it needs dynamic stack realignment, if frame pointer elimination +// is disabled, or if the frame address is taken. 
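Every target in this patch makes the same move: the public hasFP override becomes a protected hasFPImpl. The TargetFrameLowering base-class half of the refactor is not shown in this diff; a minimal sketch of the non-virtual-interface shape it implies, with stand-in types:

#include <cstdio>

struct MachineFunction {}; // stand-in for llvm::MachineFunction

class TargetFrameLoweringModel {
public:
  // The public query is now non-virtual, so the base class owns a single
  // entry point where caching or invariant checks can later be added for
  // every backend at once.
  bool hasFP(const MachineFunction &MF) const { return hasFPImpl(MF); }
  virtual ~TargetFrameLoweringModel() = default;

protected:
  virtual bool hasFPImpl(const MachineFunction &MF) const = 0;
};

class MipsModel : public TargetFrameLoweringModel {
protected:
  // Target policy (variable-sized allocas, dynamic realignment, disabled FP
  // elimination, frame-address use) lives behind the protected hook.
  bool hasFPImpl(const MachineFunction &MF) const override { return true; }
};

int main() {
  MachineFunction MF;
  MipsModel FL;
  std::printf("hasFP = %d\n", FL.hasFP(MF));
}

Callers keep using hasFP(); only implementers see the protected hook, which is why each target header in this diff moves the override into a protected section.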
+bool MipsFrameLowering::hasFPImpl(const MachineFunction &MF) const { const MachineFrameInfo &MFI = MF.getFrameInfo(); const TargetRegisterInfo *TRI = STI.getRegisterInfo(); diff --git a/llvm/lib/Target/Mips/MipsFrameLowering.h b/llvm/lib/Target/Mips/MipsFrameLowering.h index 710a3d40c38efcb4935f4d3a08fd5f047cbfa876..25adc33fbf5cab7f542daaa0eccf71f2b4fff5b1 100644 --- a/llvm/lib/Target/Mips/MipsFrameLowering.h +++ b/llvm/lib/Target/Mips/MipsFrameLowering.h @@ -23,6 +23,8 @@ class MipsFrameLowering : public TargetFrameLowering { protected: const MipsSubtarget &STI; + bool hasFPImpl(const MachineFunction &MF) const override; + public: explicit MipsFrameLowering(const MipsSubtarget &sti, Align Alignment) : TargetFrameLowering(StackGrowsDown, Alignment, 0, Alignment), STI(sti) { @@ -30,8 +32,6 @@ public: static const MipsFrameLowering *create(const MipsSubtarget &ST); - bool hasFP(const MachineFunction &MF) const override; - bool hasBP(const MachineFunction &MF) const; bool allocateScavengingFrameIndexesNearIncomingSP( diff --git a/llvm/lib/Target/NVPTX/NVPTXFrameLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXFrameLowering.cpp index 9abe0e3186f200d91bf999baab5d9f48f35d8e62..a5f6cab421fb7ec301eb3e4e0d203f3b8be4ab61 100644 --- a/llvm/lib/Target/NVPTX/NVPTXFrameLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXFrameLowering.cpp @@ -27,7 +27,9 @@ using namespace llvm; NVPTXFrameLowering::NVPTXFrameLowering() : TargetFrameLowering(TargetFrameLowering::StackGrowsUp, Align(8), 0) {} -bool NVPTXFrameLowering::hasFP(const MachineFunction &MF) const { return true; } +bool NVPTXFrameLowering::hasFPImpl(const MachineFunction &MF) const { + return true; +} void NVPTXFrameLowering::emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const { diff --git a/llvm/lib/Target/NVPTX/NVPTXFrameLowering.h b/llvm/lib/Target/NVPTX/NVPTXFrameLowering.h index a5d49ac3ab29308008ef8d27155b55f5956c60a8..f8d1f978327bc0535a33eb40ce3215fa1582c0c7 100644 --- a/llvm/lib/Target/NVPTX/NVPTXFrameLowering.h +++ b/llvm/lib/Target/NVPTX/NVPTXFrameLowering.h @@ -22,7 +22,6 @@ class NVPTXFrameLowering : public TargetFrameLowering { public: explicit NVPTXFrameLowering(); - bool hasFP(const MachineFunction &MF) const override; void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override; void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override; StackOffset getFrameIndexReference(const MachineFunction &MF, int FI, @@ -32,6 +31,9 @@ public: eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const override; DwarfFrameBase getDwarfFrameBase(const MachineFunction &MF) const override; + +protected: + bool hasFPImpl(const MachineFunction &MF) const override; }; } // End llvm namespace diff --git a/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp b/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp index 4a184037add4cec9a751b3ea340227b9a7b20e35..bb76cfd6fdb7bde1b0a81e21744ed7c49524e3ed 100644 --- a/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp @@ -435,6 +435,9 @@ static void adjustByValArgAlignment(Argument *Arg, Value *ArgInParamAS, continue; } + if (isa(CurUser)) + continue; + // supported for grid_constant if (IsGridConstant && (isa(CurUser) || isa(CurUser) || @@ -545,7 +548,8 @@ struct ArgUseChecker : PtrUseVisitor { void NVPTXLowerArgs::handleByValParam(const NVPTXTargetMachine &TM, Argument *Arg) { Function *Func = Arg->getParent(); - bool HasCvtaParam = TM.getSubtargetImpl(*Func)->hasCvtaParam(); + bool HasCvtaParam = + 
TM.getSubtargetImpl(*Func)->hasCvtaParam() && isKernelFunction(*Func); bool IsGridConstant = HasCvtaParam && isParamGridConstant(*Arg); const DataLayout &DL = Func->getDataLayout(); BasicBlock::iterator FirstInst = Func->getEntryBlock().begin(); diff --git a/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp b/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp index cc20ad7822df3b886cab63c6e52cf31875b7d052..bf512481cf646047919d1c00d0a048a60834751d 100644 --- a/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp +++ b/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp @@ -1291,6 +1291,9 @@ bool PPCAsmParser::matchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, llvm_unreachable("Implement any new match types added!"); } +#define GET_REGISTER_MATCHER +#include "PPCGenAsmMatcher.inc" + MCRegister PPCAsmParser::matchRegisterName(int64_t &IntVal) { if (getParser().getTok().is(AsmToken::Percent)) getParser().Lex(); // Eat the '%'. @@ -1298,55 +1301,25 @@ MCRegister PPCAsmParser::matchRegisterName(int64_t &IntVal) { if (!getParser().getTok().is(AsmToken::Identifier)) return MCRegister(); - MCRegister RegNo; StringRef Name = getParser().getTok().getString(); + MCRegister RegNo = MatchRegisterName(Name); + if (!RegNo) + return RegNo; + + Name.substr(Name.find_first_of("1234567890")).getAsInteger(10, IntVal); + + // MatchRegisterName doesn't seem to have special handling for 64bit vs 32bit + // register types. if (Name.equals_insensitive("lr")) { RegNo = isPPC64() ? PPC::LR8 : PPC::LR; IntVal = 8; } else if (Name.equals_insensitive("ctr")) { RegNo = isPPC64() ? PPC::CTR8 : PPC::CTR; IntVal = 9; - } else if (Name.equals_insensitive("vrsave")) { - RegNo = PPC::VRSAVE; + } else if (Name.equals_insensitive("vrsave")) IntVal = 256; - } else if (Name.starts_with_insensitive("r") && - !Name.substr(1).getAsInteger(10, IntVal) && IntVal < 32) { + else if (Name.starts_with_insensitive("r")) RegNo = isPPC64() ? 
XRegs[IntVal] : RRegs[IntVal]; - } else if (Name.starts_with_insensitive("f") && - !Name.substr(1).getAsInteger(10, IntVal) && IntVal < 32) { - RegNo = FRegs[IntVal]; - } else if (Name.starts_with_insensitive("vs") && - !Name.substr(2).getAsInteger(10, IntVal) && IntVal < 64) { - RegNo = VSRegs[IntVal]; - } else if (Name.starts_with_insensitive("v") && - !Name.substr(1).getAsInteger(10, IntVal) && IntVal < 32) { - RegNo = VRegs[IntVal]; - } else if (Name.starts_with_insensitive("cr") && - !Name.substr(2).getAsInteger(10, IntVal) && IntVal < 8) { - RegNo = CRRegs[IntVal]; - } else if (Name.starts_with_insensitive("acc") && - !Name.substr(3).getAsInteger(10, IntVal) && IntVal < 8) { - RegNo = ACCRegs[IntVal]; - } else if (Name.starts_with_insensitive("wacc_hi") && - !Name.substr(7).getAsInteger(10, IntVal) && IntVal < 8) { - RegNo = ACCRegs[IntVal]; - } else if (Name.starts_with_insensitive("wacc") && - !Name.substr(4).getAsInteger(10, IntVal) && IntVal < 8) { - RegNo = WACCRegs[IntVal]; - } else if (Name.starts_with_insensitive("dmrrowp") && - !Name.substr(7).getAsInteger(10, IntVal) && IntVal < 32) { - RegNo = DMRROWpRegs[IntVal]; - } else if (Name.starts_with_insensitive("dmrrow") && - !Name.substr(6).getAsInteger(10, IntVal) && IntVal < 64) { - RegNo = DMRROWRegs[IntVal]; - } else if (Name.starts_with_insensitive("dmrp") && - !Name.substr(4).getAsInteger(10, IntVal) && IntVal < 4) { - RegNo = DMRROWpRegs[IntVal]; - } else if (Name.starts_with_insensitive("dmr") && - !Name.substr(3).getAsInteger(10, IntVal) && IntVal < 8) { - RegNo = DMRRegs[IntVal]; - } else - return MCRegister(); getParser().Lex(); return RegNo; @@ -1874,7 +1847,6 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializePowerPCAsmParser() { RegisterMCAsmParser D(getThePPC64LETarget()); } -#define GET_REGISTER_MATCHER #define GET_MATCHER_IMPLEMENTATION #define GET_MNEMONIC_SPELL_CHECKER #include "PPCGenAsmMatcher.inc" diff --git a/llvm/lib/Target/PowerPC/PPC.td b/llvm/lib/Target/PowerPC/PPC.td index da31a993b9c69450908a67e71c32e15545bfb215..72c5909f10c3bec6148f0bac7375640912444708 100644 --- a/llvm/lib/Target/PowerPC/PPC.td +++ b/llvm/lib/Target/PowerPC/PPC.td @@ -719,7 +719,8 @@ def PPCAsmWriter : AsmWriter { } def PPCAsmParser : AsmParser { - let ShouldEmitMatchRegisterName = 0; + let ShouldEmitMatchRegisterName = 1; + let AllowDuplicateRegisterNames = 1; } def PPCAsmParserVariant : AsmParserVariant { diff --git a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp index 9528aea3497999f7fa8d28373a97a109d0c4aa4f..b5a6c1c6e01df8cdd4383801259f7374f8114b0e 100644 --- a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp +++ b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp @@ -290,6 +290,8 @@ public: void emitPGORefs(Module &M); + void emitGCOVRefs(); + void emitEndOfAsmFile(Module &) override; void emitLinkage(const GlobalValue *GV, MCSymbol *GVSym) const override; @@ -2962,6 +2964,31 @@ void PPCAIXAsmPrinter::emitPGORefs(Module &M) { } } +void PPCAIXAsmPrinter::emitGCOVRefs() { + if (!OutContext.hasXCOFFSection( + "__llvm_gcov_ctr_section", + XCOFF::CsectProperties(XCOFF::XMC_RW, XCOFF::XTY_SD))) + return; + + MCSection *CtrSection = OutContext.getXCOFFSection( + "__llvm_gcov_ctr_section", SectionKind::getData(), + XCOFF::CsectProperties(XCOFF::XMC_RW, XCOFF::XTY_SD), + /*MultiSymbolsAllowed*/ true); + + OutStreamer->switchSection(CtrSection); + const XCOFF::StorageMappingClass MappingClass = + TM.Options.XCOFFReadOnlyPointers ? 
XCOFF::XMC_RO : XCOFF::XMC_RW; + if (OutContext.hasXCOFFSection( + "__llvm_covinit", + XCOFF::CsectProperties(MappingClass, XCOFF::XTY_SD))) { + const char *SymbolStr = TM.Options.XCOFFReadOnlyPointers + ? "__llvm_covinit[RO]" + : "__llvm_covinit[RW]"; + MCSymbol *S = OutContext.getOrCreateSymbol(SymbolStr); + OutStreamer->emitXCOFFRefDirective(S); + } +} + void PPCAIXAsmPrinter::emitEndOfAsmFile(Module &M) { // If there are no functions and there are no toc-data definitions in this // module, we will never need to reference the TOC base. @@ -2969,6 +2996,7 @@ void PPCAIXAsmPrinter::emitEndOfAsmFile(Module &M) { return; emitPGORefs(M); + emitGCOVRefs(); // Switch to section to emit TOC base. OutStreamer->switchSection(getObjFileLowering().getTOCBaseSection()); diff --git a/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp b/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp index f7188b856461b72fd9546e096b6138e8b8545400..1083febc5f8520ddfb7a3591de1117c303cc60dc 100644 --- a/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp @@ -355,9 +355,9 @@ PPCFrameLowering::determineFrameLayout(const MachineFunction &MF, return FrameSize; } -// hasFP - Return true if the specified function actually has a dedicated frame -// pointer register. -bool PPCFrameLowering::hasFP(const MachineFunction &MF) const { +// hasFPImpl - Return true if the specified function actually has a dedicated +// frame pointer register. +bool PPCFrameLowering::hasFPImpl(const MachineFunction &MF) const { const MachineFrameInfo &MFI = MF.getFrameInfo(); // FIXME: This is pretty much broken by design: hasFP() might be called really // early, before the stack layout was calculated and thus hasFP() might return diff --git a/llvm/lib/Target/PowerPC/PPCFrameLowering.h b/llvm/lib/Target/PowerPC/PPCFrameLowering.h index d74c87428326ca9e67ebef3eaa1641f9063bacaf..47f249862946f3166d6796e1b5a498a4093da11e 100644 --- a/llvm/lib/Target/PowerPC/PPCFrameLowering.h +++ b/llvm/lib/Target/PowerPC/PPCFrameLowering.h @@ -107,7 +107,6 @@ public: void inlineStackProbe(MachineFunction &MF, MachineBasicBlock &PrologMBB) const override; - bool hasFP(const MachineFunction &MF) const override; bool needsFP(const MachineFunction &MF) const; void replaceFPWithRealFP(MachineFunction &MF) const; @@ -176,6 +175,9 @@ public: void updateCalleeSaves(const MachineFunction &MF, BitVector &SavedRegs) const; uint64_t getStackThreshold() const override; + +protected: + bool hasFPImpl(const MachineFunction &MF) const override; }; } // End llvm namespace diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp index 911d92f0c4846bbdcdfba489629be7febde4b253..5d6c7c729a7617fc5ecd7f9ed73eb15698139c33 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -12180,9 +12180,7 @@ void PPCTargetLowering::ReplaceNodeResults(SDNode *N, //===----------------------------------------------------------------------===// static Instruction *callIntrinsic(IRBuilderBase &Builder, Intrinsic::ID Id) { - Module *M = Builder.GetInsertBlock()->getParent()->getParent(); - Function *Func = Intrinsic::getOrInsertDeclaration(M, Id); - return Builder.CreateCall(Func, {}); + return Builder.CreateIntrinsic(Id, {}, {}); } // The mappings for emitLeading/TrailingFence is taken from @@ -12205,11 +12203,8 @@ Instruction *PPCTargetLowering::emitTrailingFence(IRBuilderBase &Builder, // http://www.rdrop.com/users/paulmck/scalability/paper/N2745r.2011.03.04a.html // and 
http://www.cl.cam.ac.uk/~pes20/cppppc/ for justification. if (isa(Inst)) - return Builder.CreateCall( - Intrinsic::getOrInsertDeclaration( - Builder.GetInsertBlock()->getParent()->getParent(), - Intrinsic::ppc_cfence, {Inst->getType()}), - {Inst}); + return Builder.CreateIntrinsic(Intrinsic::ppc_cfence, {Inst->getType()}, + {Inst}); // FIXME: Can use isync for rmw operation. return callIntrinsic(Builder, Intrinsic::ppc_lwsync); } @@ -17889,10 +17884,10 @@ SDValue PPCTargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG, } // Override to enable LOAD_STACK_GUARD lowering on Linux. -bool PPCTargetLowering::useLoadStackGuardNode() const { - if (!Subtarget.isTargetLinux()) - return TargetLowering::useLoadStackGuardNode(); - return true; +bool PPCTargetLowering::useLoadStackGuardNode(const Module &M) const { + if (M.getStackProtectorGuard() == "tls" || Subtarget.isTargetLinux()) + return true; + return TargetLowering::useLoadStackGuardNode(M); } // Override to disable global variable loading on Linux and insert AIX canary @@ -19005,13 +19000,13 @@ Value *PPCTargetLowering::emitMaskedAtomicRMWIntrinsic( Module *M = Builder.GetInsertBlock()->getParent()->getParent(); Type *ValTy = Incr->getType(); assert(ValTy->getPrimitiveSizeInBits() == 128); - Function *RMW = Intrinsic::getOrInsertDeclaration( - M, getIntrinsicForAtomicRMWBinOp128(AI->getOperation())); Type *Int64Ty = Type::getInt64Ty(M->getContext()); Value *IncrLo = Builder.CreateTrunc(Incr, Int64Ty, "incr_lo"); Value *IncrHi = Builder.CreateTrunc(Builder.CreateLShr(Incr, 64), Int64Ty, "incr_hi"); - Value *LoHi = Builder.CreateCall(RMW, {AlignedAddr, IncrLo, IncrHi}); + Value *LoHi = Builder.CreateIntrinsic( + getIntrinsicForAtomicRMWBinOp128(AI->getOperation()), {}, + {AlignedAddr, IncrLo, IncrHi}); Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo"); Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi"); Lo = Builder.CreateZExt(Lo, ValTy, "lo64"); diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h index 8907c3c5a81c3c47ef627128d3838ee16c49fac7..8c7961e641c3545920d89d6e6f98dd25946c9c7b 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.h +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h @@ -1137,7 +1137,7 @@ namespace llvm { getExceptionSelectorRegister(const Constant *PersonalityFn) const override; /// Override to support customized stack guard loading. - bool useLoadStackGuardNode() const override; + bool useLoadStackGuardNode(const Module &M) const override; void insertSSPDeclarations(Module &M) const override; Value *getSDagStackGuard(const Module &M) const override; diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp index 48833e8f88066cd9565828ddc1c0b767e83fe665..bc2a1b295b433329d218bcb0320b5721fce8b9d4 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp +++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp @@ -35,6 +35,7 @@ #include "llvm/CodeGen/ScheduleDAG.h" #include "llvm/CodeGen/SlotIndexes.h" #include "llvm/CodeGen/StackMaps.h" +#include "llvm/IR/Module.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/TargetRegistry.h" @@ -3107,9 +3108,16 @@ bool PPCInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { return true; } case TargetOpcode::LOAD_STACK_GUARD: { - assert(Subtarget.isTargetLinux() && - "Only Linux target is expected to contain LOAD_STACK_GUARD"); - const int64_t Offset = Subtarget.isPPC64() ? 
-0x7010 : -0x7008; + auto M = MBB.getParent()->getFunction().getParent(); + assert( + (Subtarget.isTargetLinux() || M->getStackProtectorGuard() == "tls") && + "Only Linux target or tls mode are expected to contain " + "LOAD_STACK_GUARD"); + int64_t Offset; + if (M->getStackProtectorGuard() == "tls") + Offset = M->getStackProtectorGuardOffset(); + else + Offset = Subtarget.isPPC64() ? -0x7010 : -0x7008; const unsigned Reg = Subtarget.isPPC64() ? PPC::X13 : PPC::R2; MI.setDesc(get(Subtarget.isPPC64() ? PPC::LD : PPC::LWZ)); MachineInstrBuilder(*MI.getParent()->getParent(), MI) diff --git a/llvm/lib/Target/PowerPC/PPCInstrVSX.td b/llvm/lib/Target/PowerPC/PPCInstrVSX.td index dd07892794d59998c5bae59e3b0f18b8b5430679..fe9ab22c5763497890f2eee0e210b3fb1c563d66 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrVSX.td +++ b/llvm/lib/Target/PowerPC/PPCInstrVSX.td @@ -3606,6 +3606,10 @@ def : Pat<(i64 (lround f64:$S)), (i64 (MFVSRD (FCTID (XSRDPI $S))))>; def : Pat<(i64 (lround f32:$S)), (i64 (MFVSRD (FCTID (XSRDPI (COPY_TO_REGCLASS $S, VSFRC)))))>; +def : Pat<(i32 (lround f64:$S)), + (i32 (MFVSRWZ (FCTIW (XSRDPI $S))))>; +def : Pat<(i32 (lround f32:$S)), + (i32 (MFVSRWZ (FCTIW (XSRDPI (COPY_TO_REGCLASS $S, VSFRC)))))>; def : Pat<(i64 (llround f64:$S)), (i64 (MFVSRD (FCTID (XSRDPI $S))))>; def : Pat<(i64 (llround f32:$S)), diff --git a/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp b/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp index 7d0455942923ddc924a6134bdb38bf840ce7d352..cd188304595e1891f169f03714de6aa9834ac48a 100644 --- a/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp +++ b/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp @@ -521,7 +521,7 @@ bool PPCPassConfig::addPreISel() { } bool PPCPassConfig::addILPOpts() { - addPass(&EarlyIfConverterID); + addPass(&EarlyIfConverterLegacyID); if (EnableMachineCombinerPass) addPass(&MachineCombinerID); diff --git a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp index 0b312c29061c9c1ddc5e9ea887159988be0936d7..05fc7c14b971ecb120691cf0ccbb006fcf77a0dd 100644 --- a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp +++ b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp @@ -3738,6 +3738,9 @@ bool RISCVAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc, switch (Inst.getOpcode()) { default: break; + case RISCV::PseudoC_ADDI_NOP: + emitToStreamer(Out, MCInstBuilder(RISCV::C_NOP)); + return false; case RISCV::PseudoLLAImm: case RISCV::PseudoLAImm: case RISCV::PseudoLI: { diff --git a/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp b/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp index d1449f751b40a025e643bd414c7fc862877e16c7..c06ab061ddc338b2a324150286cbc6a061b5af64 100644 --- a/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp +++ b/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp @@ -670,7 +670,7 @@ bool RISCVLegalizerInfo::legalizeVAStart(MachineInstr &MI, return true; } -bool RISCVLegalizerInfo::shouldBeInConstantPool(APInt APImm, +bool RISCVLegalizerInfo::shouldBeInConstantPool(const APInt &APImm, bool ShouldOptForSize) const { assert(APImm.getBitWidth() == 32 || APImm.getBitWidth() == 64); int64_t Imm = APImm.getSExtValue(); diff --git a/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.h b/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.h index d2afb175ae42bb1ddc93d955cb31380f2e41f2eb..ee8b08bc9e3162c4f82afa36573e305ca789e5c8 100644 --- a/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.h +++ b/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.h @@ -38,7 +38,7 @@ public: MachineInstr &MI) const override; 
private: - bool shouldBeInConstantPool(APInt APImm, bool ShouldOptForSize) const; + bool shouldBeInConstantPool(const APInt &APImm, bool ShouldOptForSize) const; bool legalizeShlAshrLshr(MachineInstr &MI, MachineIRBuilder &MIRBuilder, GISelChangeObserver &Observer) const; diff --git a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp index ce1cbdad771fab35fea68aee407f5fa1449490d8..3d3a29c3e03530d99a813162e504469b631e2239 100644 --- a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp @@ -309,7 +309,7 @@ static Register getMaxPushPopReg(const MachineFunction &MF, // pointer register. This is true if frame pointer elimination is // disabled, if it needs dynamic stack realignment, if the function has // variable sized allocas, or if the frame address is taken. -bool RISCVFrameLowering::hasFP(const MachineFunction &MF) const { +bool RISCVFrameLowering::hasFPImpl(const MachineFunction &MF) const { const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo(); const MachineFrameInfo &MFI = MF.getFrameInfo(); diff --git a/llvm/lib/Target/RISCV/RISCVFrameLowering.h b/llvm/lib/Target/RISCV/RISCVFrameLowering.h index f77dfee3c1722ca56a90fafb366af6dfbfc46307..b2034ac8788238ddd7bc36e259f36c9234f1866d 100644 --- a/llvm/lib/Target/RISCV/RISCVFrameLowering.h +++ b/llvm/lib/Target/RISCV/RISCVFrameLowering.h @@ -37,8 +37,6 @@ public: void processFunctionBeforeFrameFinalized(MachineFunction &MF, RegScavenger *RS) const override; - bool hasFP(const MachineFunction &MF) const override; - bool hasBP(const MachineFunction &MF) const; bool hasReservedCallFrame(const MachineFunction &MF) const override; @@ -83,6 +81,8 @@ public: protected: const RISCVSubtarget &STI; + bool hasFPImpl(const MachineFunction &MF) const override; + private: void determineFrameLayout(MachineFunction &MF) const; void adjustStackForRVV(MachineFunction &MF, MachineBasicBlock &MBB, diff --git a/llvm/lib/Target/RISCV/RISCVGISel.td b/llvm/lib/Target/RISCV/RISCVGISel.td index 0656928ca41f835e6f29d2a96f40607e2cf01022..67e93b812421b4962ede4c3b1ad5c656f798218d 100644 --- a/llvm/lib/Target/RISCV/RISCVGISel.td +++ b/llvm/lib/Target/RISCV/RISCVGISel.td @@ -274,7 +274,7 @@ def : Pat<(i32 (xor GPR:$rs1, (not GPR:$rs2))), (XNOR GPR:$rs1, GPR:$rs2)>; def : PatGprGpr; def : PatGprGpr; -def : Pat<(i32 (rotl GPR:$rs1, uimm5i32:$imm)), +def : Pat<(i32 (rotr GPR:$rs1, uimm5i32:$imm)), (RORIW GPR:$rs1, (i64 (as_i64imm $imm)))>; def : Pat<(i32 (rotl GPR:$rs1, uimm5i32:$rs2)), diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index e4adf310be49977b35fd9f62e09e2ce919301ced..419221fdfa9a33ab82024d7d3a41549581049b26 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -38,6 +38,8 @@ #include "llvm/IR/IntrinsicsRISCV.h" #include "llvm/IR/IntrinsicsEPI.h" #include "llvm/IR/PatternMatch.h" +#include "llvm/MC/MCCodeEmitter.h" +#include "llvm/MC/MCInstBuilder.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" @@ -646,8 +648,10 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, setOperationAction(ISD::READSTEADYCOUNTER, MVT::i64, Subtarget.is64Bit() ? 
Legal : Custom); - setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom); - setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom); + if (Subtarget.is64Bit()) { + setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom); + setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom); + } setOperationAction({ISD::TRAP, ISD::DEBUGTRAP}, MVT::Other, Legal); setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); @@ -762,6 +766,10 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, ISD::VECREDUCE_FADD, ISD::VECREDUCE_SEQ_FADD, ISD::VECREDUCE_FMIN, ISD::VECREDUCE_FMAX, ISD::VECREDUCE_FMINIMUM, ISD::VECREDUCE_FMAXIMUM}; + static const unsigned FloatingPointLibCallOps[] = { + ISD::FREM, ISD::FPOW, ISD::FCOS, ISD::FSIN, ISD::FSINCOS, ISD::FEXP, + ISD::FEXP2, ISD::FEXP10, ISD::FLOG, ISD::FLOG2, ISD::FLOG10}; + if (!Subtarget.is64Bit()) { // We must custom-lower certain vXi64 operations on RV32 due to the vector // element type being illegal. @@ -1069,17 +1077,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, setOperationAction(FloatingPointVecReduceOps, VT, Custom); // Expand FP operations that need libcalls. - setOperationAction(ISD::FREM, VT, Expand); - setOperationAction(ISD::FPOW, VT, Expand); - setOperationAction(ISD::FCOS, VT, Expand); - setOperationAction(ISD::FSIN, VT, Expand); - setOperationAction(ISD::FSINCOS, VT, Expand); - setOperationAction(ISD::FEXP, VT, Expand); - setOperationAction(ISD::FEXP2, VT, Expand); - setOperationAction(ISD::FEXP10, VT, Expand); - setOperationAction(ISD::FLOG, VT, Expand); - setOperationAction(ISD::FLOG2, VT, Expand); - setOperationAction(ISD::FLOG10, VT, Expand); + setOperationAction(FloatingPointLibCallOps, VT, Expand); setOperationAction(ISD::FCOPYSIGN, VT, Legal); @@ -1144,7 +1142,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, setOperationAction({ISD::INSERT_VECTOR_ELT, ISD::CONCAT_VECTORS, ISD::INSERT_SUBVECTOR, ISD::EXTRACT_SUBVECTOR, ISD::VECTOR_DEINTERLEAVE, ISD::VECTOR_INTERLEAVE, - ISD::VECTOR_REVERSE}, + ISD::VECTOR_REVERSE, ISD::VECTOR_SPLICE}, VT, Custom); MVT EltVT = VT.getVectorElementType(); if (isTypeLegal(EltVT)) @@ -1165,6 +1163,9 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, setOperationAction(ISD::FABS, VT, Expand); setOperationAction(ISD::FCOPYSIGN, VT, Expand); + // Expand FP operations that need libcalls. + setOperationAction(FloatingPointLibCallOps, VT, Expand); + // Custom split nxv32[b]f16 since nxv32[b]f32 is not legal. if (getLMUL(VT) == RISCVII::VLMUL::LMUL_8) { setOperationAction(ZvfhminZvfbfminPromoteOps, VT, Custom); @@ -1461,7 +1462,14 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, setOperationAction( {ISD::VP_MERGE, ISD::VP_SELECT, ISD::VSELECT, ISD::SELECT}, VT, Custom); - // TODO: Promote to fp32. + MVT F32VecVT = MVT::getVectorVT(MVT::f32, VT.getVectorElementCount()); + // Don't promote f16 vector operations to f32 if f32 vector type is + // not legal. + // TODO: could split the f16 vector into two vectors and do promotion. + if (!isTypeLegal(F32VecVT)) + continue; + setOperationPromotedToType(ZvfhminZvfbfminPromoteOps, VT, F32VecVT); + // TODO: Promote VP ops to fp32. 
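      // Aside (illustration, not part of the patch): the legality guard above
      // matters because setOperationPromotedToType rewrites an f16/bf16
      // vector op as "extend to f32, run the op in f32, round once back". If
      // F32VecVT is not itself legal, that rewrite merely trades one illegal
      // node for another, so promotion is skipped and such types are left
      // for the splitting approach the TODOs mention. Scalar analogy of
      // promote-then-round, with double standing in for f32 and float for
      // f16:
      //
      //   float mulPromoted(float A, float B) {
      //     return (float)((double)A * (double)B); // widen, multiply, round once
      //   }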
continue; } @@ -4046,7 +4054,9 @@ static std::optional isSimpleVIDSequence(SDValue Op, if (!Elt) continue; APInt ExpectedVal = - (APInt(EltSizeInBits, Idx) * *SeqStepNum).sdiv(*SeqStepDenom); + (APInt(EltSizeInBits, Idx, /*isSigned=*/false, /*implicitTrunc=*/true) * + *SeqStepNum) + .sdiv(*SeqStepDenom); APInt Addend = *Elt - ExpectedVal; if (!SeqAddend) @@ -8504,120 +8514,111 @@ SDValue RISCVTargetLowering::lowerINIT_TRAMPOLINE(SDValue Op, if (!Subtarget.is64Bit()) llvm::report_fatal_error("Trampolines only implemented for RV64"); + // Create an MCCodeEmitter to encode instructions. + TargetLoweringObjectFile *TLO = getTargetMachine().getObjFileLowering(); + assert(TLO); + MCContext &MCCtx = TLO->getContext(); + + std::unique_ptr CodeEmitter( + createRISCVMCCodeEmitter(*getTargetMachine().getMCInstrInfo(), MCCtx)); + SDValue Root = Op.getOperand(0); SDValue Trmp = Op.getOperand(1); // trampoline - SDValue FPtr = Op.getOperand(2); // nested function - SDValue Nest = Op.getOperand(3); // 'nest' parameter value SDLoc dl(Op); const Value *TrmpAddr = cast(Op.getOperand(4))->getValue(); - // GCC is using this sequence (the trampoline uses 32 bytes) - // 0: auipc t2, 0 - // 4: ld t0, 24(t2) - // 8: ld t2, 16(t2) - // 12: jr t0 - // 16: - // 24: - - // Constants shamelessly taken from GCC. - constexpr unsigned TRAMPOLINE_CODE_SIZE = 16; - constexpr unsigned OPCODE_AUIPC = 0x17; - constexpr unsigned OPCODE_LD = 0x3003; - constexpr unsigned OPCODE_JALR = 0x67; - constexpr unsigned SHIFT_RD = 7; - constexpr unsigned SHIFT_RS1 = 15; - constexpr unsigned SHIFT_IMM = 20; - constexpr unsigned RISCV_PROLOGUE_TEMP_REGNUM = 0x5; // t0 - constexpr unsigned STATIC_CHAIN_REGNUM = 0x7; // t2 - - constexpr unsigned static_chain_offset = TRAMPOLINE_CODE_SIZE; - constexpr unsigned target_function_offset = 24; - - // Heavily inspired by what x86 does. - SDValue OutChains[6]; - SDValue Addr = Trmp; - - constexpr uint32_t AUIPC_T2_0 = - OPCODE_AUIPC | (STATIC_CHAIN_REGNUM << SHIFT_RD); - OutChains[0] = - DAG.getTruncStore(Root, dl, DAG.getConstant(AUIPC_T2_0, dl, MVT::i64), - Addr, MachinePointerInfo(TrmpAddr), MVT::i32); - - const uint32_t LD_T0_TARGET_FUNCTION_OFFSET__T2_ = - OPCODE_LD | (RISCV_PROLOGUE_TEMP_REGNUM << SHIFT_RD) | - (STATIC_CHAIN_REGNUM << SHIFT_RS1) | - (target_function_offset << SHIFT_IMM); - Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, - DAG.getConstant(4, dl, MVT::i64)); - OutChains[1] = DAG.getTruncStore( - Root, dl, - DAG.getConstant(LD_T0_TARGET_FUNCTION_OFFSET__T2_, dl, MVT::i64), Addr, - MachinePointerInfo(TrmpAddr, 4), MVT::i32); - - const uint32_t LD_T2_STATIC_CHAIN_OFFSET__T2_ = - OPCODE_LD | (STATIC_CHAIN_REGNUM << SHIFT_RD) | - (STATIC_CHAIN_REGNUM << SHIFT_RS1) | (static_chain_offset << SHIFT_IMM); - Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, - DAG.getConstant(8, dl, MVT::i64)); - OutChains[2] = DAG.getTruncStore( - Root, dl, DAG.getConstant(LD_T2_STATIC_CHAIN_OFFSET__T2_, dl, MVT::i64), - Addr, MachinePointerInfo(TrmpAddr, 8), MVT::i32); - - const uint32_t JR_T0 = - OPCODE_JALR | (RISCV_PROLOGUE_TEMP_REGNUM << SHIFT_RS1); - Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, - DAG.getConstant(12, dl, MVT::i64)); - OutChains[3] = - DAG.getTruncStore(Root, dl, DAG.getConstant(JR_T0, dl, MVT::i64), Addr, - MachinePointerInfo(TrmpAddr, 12), MVT::i32); - - // Store static chain. 
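The replacement code above asks a real MCCodeEmitter for the four instruction words instead of hand-assembling them from the GCC-derived constants being deleted here. For reference, a standalone model of the resulting 32-byte image that re-derives the words with the same field arithmetic the deleted code used (StaticChainOffset = 16 and FunctionAddressOffset = 24, as in the patch):

#include <cstdint>
#include <cstdio>
#include <cstring>

// I-type field packing, matching the deleted SHIFT_RD/SHIFT_RS1/SHIFT_IMM
// arithmetic. For LD, the opcode constant bundles funct3 (0x3003).
static uint32_t iType(uint32_t Opc, uint32_t Rd, uint32_t Rs1, uint32_t Imm) {
  return Opc | (Rd << 7) | (Rs1 << 15) | (Imm << 20);
}

int main() {
  const uint32_t T0 = 5, T2 = 7;            // x5, x7
  const uint32_t AUIPC = 0x17, LD = 0x3003, JALR = 0x67;
  uint32_t Code[4] = {
      AUIPC | (T2 << 7),     // auipc t2, 0       : t2 = &trampoline
      iType(LD, T0, T2, 24), // ld    t0, 24(t2)  : load function address
      iType(LD, T2, T2, 16), // ld    t2, 16(t2)  : load static chain
      iType(JALR, 0, T0, 0), // jalr  zero, 0(t0) : tail-jump to the callee
  };
  uint64_t StaticChain = 0x1111111111111111ULL; // placeholder values
  uint64_t FnAddr = 0x2222222222222222ULL;
  uint8_t Trampoline[32];
  std::memcpy(Trampoline, Code, 16);
  std::memcpy(Trampoline + 16, &StaticChain, 8);
  std::memcpy(Trampoline + 24, &FnAddr, 8);
  for (int I = 0; I < 4; ++I)
    std::printf("word %d: 0x%08x\n", I, Code[I]);
}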
- Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, - DAG.getConstant(static_chain_offset, dl, MVT::i64)); - OutChains[4] = DAG.getStore( - Root, dl, Nest, Addr, MachinePointerInfo(TrmpAddr, static_chain_offset)); - - // Store function address. - Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, - DAG.getConstant(target_function_offset, dl, MVT::i64)); - OutChains[5] = - DAG.getStore(Root, dl, FPtr, Addr, - MachinePointerInfo(TrmpAddr, target_function_offset)); - - SDValue StoreToken = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains); - - // Now call __riscv_flush_icache - // void __riscv_flush_icache (void *start, void *end, unsigned long int flags) - TargetLowering::ArgListTy Args; - ArgListEntry Entry; + // We store in the trampoline buffer the following instructions and data. + // Offset: + // 0: auipc t2, 0 + // 4: ld t0, 24(t2) + // 8: ld t2, 16(t2) + // 12: jalr t0 + // 16: + // 24: + // 32: + + constexpr unsigned StaticChainOffset = 16; + constexpr unsigned FunctionAddressOffset = 24; + + const MCSubtargetInfo *STI = getTargetMachine().getMCSubtargetInfo(); + assert(STI); + auto GetEncoding = [&](const MCInst &MC) { + SmallVector CB; + SmallVector Fixups; + CodeEmitter->encodeInstruction(MC, CB, Fixups, *STI); + uint32_t Encoding = support::endian::read32le(CB.data()); + return Encoding; + }; - // start - Entry.Node = Trmp; - Entry.Ty = PointerType::getUnqual(*DAG.getContext()); - Args.push_back(Entry); + SDValue OutChains[6]; - // end - SDValue EndOfTrmp = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, - DAG.getConstant(32, dl, MVT::i64)); - Entry.Node = EndOfTrmp; - Entry.Ty = PointerType::getUnqual(*DAG.getContext()); - Args.push_back(Entry); + uint32_t Encodings[] = { + // auipc t2, 0 + // Loads the current PC into t2. + GetEncoding(MCInstBuilder(RISCV::AUIPC).addReg(RISCV::X7).addImm(0)), + // ld t0, 24(t2) + // Loads the function address into t0. Note that we are using offsets + // pc-relative to the first instruction of the trampoline. + GetEncoding( + MCInstBuilder(RISCV::LD).addReg(RISCV::X5).addReg(RISCV::X7).addImm( + FunctionAddressOffset)), + // ld t2, 16(t2) + // Load the value of the static chain. + GetEncoding( + MCInstBuilder(RISCV::LD).addReg(RISCV::X7).addReg(RISCV::X7).addImm( + StaticChainOffset)), + // jalr t0 + // Jump to the function. + GetEncoding(MCInstBuilder(RISCV::JALR) + .addReg(RISCV::X0) + .addReg(RISCV::X5) + .addImm(0))}; + + // Store encoded instructions. + for (auto [Idx, Encoding] : llvm::enumerate(Encodings)) { + SDValue Addr = Idx > 0 ? DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, + DAG.getConstant(Idx * 4, dl, MVT::i64)) + : Trmp; + OutChains[Idx] = DAG.getTruncStore( + Root, dl, DAG.getConstant(Encoding, dl, MVT::i64), Addr, + MachinePointerInfo(TrmpAddr, Idx * 4), MVT::i32); + } + + // Now store the variable part of the trampoline. + SDValue FunctionAddress = Op.getOperand(2); + SDValue StaticChain = Op.getOperand(3); + + // Store the given static chain and function pointer in the trampoline buffer. + struct OffsetValuePair { + const unsigned Offset; + const SDValue Value; + SDValue Addr = SDValue(); // Used to cache the address. 
+ } OffsetValues[] = { + {StaticChainOffset, StaticChain}, + {FunctionAddressOffset, FunctionAddress}, + }; + for (auto [Idx, OffsetValue] : llvm::enumerate(OffsetValues)) { + SDValue Addr = + DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, + DAG.getConstant(OffsetValue.Offset, dl, MVT::i64)); + OffsetValue.Addr = Addr; + OutChains[Idx + 4] = + DAG.getStore(Root, dl, OffsetValue.Value, Addr, + MachinePointerInfo(TrmpAddr, OffsetValue.Offset)); + } - // flags - Entry.Node = DAG.getConstant(0, dl, MVT::i64); - Entry.Ty = Type::getInt64Ty(*DAG.getContext()); - Args.push_back(Entry); + SDValue StoreToken = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains); - TargetLowering::CallLoweringInfo CLI(DAG); - EVT Ty = getPointerTy(DAG.getDataLayout()); - CLI.setDebugLoc(dl).setChain(StoreToken).setLibCallee( - CallingConv::C, Type::getVoidTy(*DAG.getContext()), - DAG.getExternalSymbol("__riscv_flush_icache", Ty), std::move(Args)); + // The end of instructions of trampoline is the same as the static chain + // address that we computed earlier. + SDValue EndOfTrmp = OffsetValues[0].Addr; - std::pair CallResult = LowerCallTo(CLI); + // Call clear cache on the trampoline instructions. + SDValue Chain = DAG.getNode(ISD::CLEAR_CACHE, dl, MVT::Other, StoreToken, + Trmp, EndOfTrmp); - return CallResult.second; + return Chain; } SDValue RISCVTargetLowering::lowerADJUST_TRAMPOLINE(SDValue Op, @@ -23125,10 +23126,8 @@ Value *RISCVTargetLowering::emitMaskedAtomicCmpXchgIntrinsic( CmpXchgIntrID = Intrinsic::riscv_masked_cmpxchg_i64; } Type *Tys[] = {AlignedAddr->getType()}; - Function *MaskedCmpXchg = - Intrinsic::getOrInsertDeclaration(CI->getModule(), CmpXchgIntrID, Tys); - Value *Result = Builder.CreateCall( - MaskedCmpXchg, {AlignedAddr, CmpVal, NewVal, Mask, Ordering}); + Value *Result = Builder.CreateIntrinsic( + CmpXchgIntrID, Tys, {AlignedAddr, CmpVal, NewVal, Mask, Ordering}); if (XLen == 64) Result = Builder.CreateTrunc(Result, Builder.getInt32Ty()); return Result; @@ -23741,14 +23740,11 @@ bool RISCVTargetLowering::lowerInterleavedLoad( auto *XLenTy = Type::getIntNTy(LI->getContext(), Subtarget.getXLen()); - Function *VlsegNFunc = Intrinsic::getOrInsertDeclaration( - LI->getModule(), FixedVlsegIntrIds[Factor - 2], - {VTy, LI->getPointerOperandType(), XLenTy}); - Value *VL = ConstantInt::get(XLenTy, VTy->getNumElements()); - CallInst *VlsegN = - Builder.CreateCall(VlsegNFunc, {LI->getPointerOperand(), VL}); + CallInst *VlsegN = Builder.CreateIntrinsic( + FixedVlsegIntrIds[Factor - 2], {VTy, LI->getPointerOperandType(), XLenTy}, + {LI->getPointerOperand(), VL}); for (unsigned i = 0; i < Shuffles.size(); i++) { Value *SubVec = Builder.CreateExtractValue(VlsegN, Indices[i]); @@ -23842,11 +23838,11 @@ bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad( Type *XLenTy = Type::getIntNTy(LI->getContext(), Subtarget.getXLen()); if (auto *FVTy = dyn_cast(ResVTy)) { - Function *VlsegNFunc = Intrinsic::getOrInsertDeclaration( - LI->getModule(), FixedVlsegIntrIds[Factor - 2], - {ResVTy, LI->getPointerOperandType(), XLenTy}); Value *VL = ConstantInt::get(XLenTy, FVTy->getNumElements()); - Return = Builder.CreateCall(VlsegNFunc, {LI->getPointerOperand(), VL}); + Return = + Builder.CreateIntrinsic(FixedVlsegIntrIds[Factor - 2], + {ResVTy, LI->getPointerOperandType(), XLenTy}, + {LI->getPointerOperand(), VL}); } else { static const Intrinsic::ID IntrIds[] = { Intrinsic::riscv_vlseg2, Intrinsic::riscv_vlseg3, @@ -23862,21 +23858,19 @@ bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad( NumElts * SEW / 8), 
Factor); - Function *VlsegNFunc = Intrinsic::getOrInsertDeclaration( - LI->getModule(), IntrIds[Factor - 2], {VecTupTy, XLenTy}); Value *VL = Constant::getAllOnesValue(XLenTy); - Value *Vlseg = Builder.CreateCall( - VlsegNFunc, {PoisonValue::get(VecTupTy), LI->getPointerOperand(), VL, - ConstantInt::get(XLenTy, Log2_64(SEW))}); + Value *Vlseg = Builder.CreateIntrinsic( + IntrIds[Factor - 2], {VecTupTy, XLenTy}, + {PoisonValue::get(VecTupTy), LI->getPointerOperand(), VL, + ConstantInt::get(XLenTy, Log2_64(SEW))}); SmallVector AggrTypes{Factor, ResVTy}; Return = PoisonValue::get(StructType::get(LI->getContext(), AggrTypes)); - Function *VecExtractFunc = Intrinsic::getOrInsertDeclaration( - LI->getModule(), Intrinsic::riscv_tuple_extract, {ResVTy, VecTupTy}); for (unsigned i = 0; i < Factor; ++i) { - Value *VecExtract = - Builder.CreateCall(VecExtractFunc, {Vlseg, Builder.getInt32(i)}); + Value *VecExtract = Builder.CreateIntrinsic( + Intrinsic::riscv_tuple_extract, {ResVTy, VecTupTy}, + {Vlseg, Builder.getInt32(i)}); Return = Builder.CreateInsertValue(Return, VecExtract, i); } } @@ -23908,12 +23902,11 @@ bool RISCVTargetLowering::lowerInterleaveIntrinsicToStore( Type *XLenTy = Type::getIntNTy(SI->getContext(), Subtarget.getXLen()); if (auto *FVTy = dyn_cast(InVTy)) { - Function *VssegNFunc = Intrinsic::getOrInsertDeclaration( - SI->getModule(), FixedVssegIntrIds[Factor - 2], - {InVTy, SI->getPointerOperandType(), XLenTy}); Value *VL = ConstantInt::get(XLenTy, FVTy->getNumElements()); - Builder.CreateCall(VssegNFunc, {II->getArgOperand(0), II->getArgOperand(1), - SI->getPointerOperand(), VL}); + Builder.CreateIntrinsic(FixedVssegIntrIds[Factor - 2], + {InVTy, SI->getPointerOperandType(), XLenTy}, + {II->getArgOperand(0), II->getArgOperand(1), + SI->getPointerOperand(), VL}); } else { static const Intrinsic::ID IntrIds[] = { Intrinsic::riscv_vsseg2, Intrinsic::riscv_vsseg3, @@ -23934,13 +23927,11 @@ bool RISCVTargetLowering::lowerInterleaveIntrinsicToStore( Value *VL = Constant::getAllOnesValue(XLenTy); - Function *VecInsertFunc = Intrinsic::getOrInsertDeclaration( - SI->getModule(), Intrinsic::riscv_tuple_insert, {VecTupTy, InVTy}); Value *StoredVal = PoisonValue::get(VecTupTy); for (unsigned i = 0; i < Factor; ++i) - StoredVal = - Builder.CreateCall(VecInsertFunc, {StoredVal, II->getArgOperand(i), - Builder.getInt32(i)}); + StoredVal = Builder.CreateIntrinsic( + Intrinsic::riscv_tuple_insert, {VecTupTy, InVTy}, + {StoredVal, II->getArgOperand(i), Builder.getInt32(i)}); Builder.CreateCall(VssegNFunc, {StoredVal, SI->getPointerOperand(), VL, ConstantInt::get(XLenTy, Log2_64(SEW))}); diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp index 8b79b62a3fc5bc3d07485e572459f9ef0b531e09..b9f785f7dcdc9908504c6a50047e13d6f959ca72 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp @@ -4190,3 +4190,17 @@ unsigned RISCV::getDestLog2EEW(const MCInstrDesc &Desc, unsigned Log2SEW) { assert(Scaled >= 3 && Scaled <= 6); return Scaled; } + +/// Given two VL operands, do we know that LHS <= RHS? 
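That question defines a conservative partial order on VL operands: VLMAX is the top element, identical virtual registers are trivially comparable, constants compare numerically, and distinct registers are incomparable. A self-contained model mirroring the case order of the helper that follows:

#include <cstdio>

// VLMaxSentinel (-1) means "use VLMAX", as in RISCVInstrInfo.h.
static const long long VLMaxSentinel = -1;

struct VLOperand {
  bool IsImm;
  long long Imm; // valid when IsImm
  unsigned Reg;  // valid when !IsImm; 0 models "not a virtual register"
};

static bool isVLKnownLE(const VLOperand &LHS, const VLOperand &RHS) {
  if (!LHS.IsImm && !RHS.IsImm && LHS.Reg != 0 && LHS.Reg == RHS.Reg)
    return true;                 // same virtual register: x <= x
  if (RHS.IsImm && RHS.Imm == VLMaxSentinel)
    return true;                 // anything <= VLMAX
  if (LHS.IsImm && LHS.Imm == VLMaxSentinel)
    return false;                // VLMAX <= x is not provable otherwise
  if (!LHS.IsImm || !RHS.IsImm)
    return false;                // unrelated registers: unknown
  return LHS.Imm <= RHS.Imm;     // both constants: plain comparison
}

int main() {
  VLOperand Four{true, 4, 0}, Eight{true, 8, 0}, VLMax{true, VLMaxSentinel, 0};
  VLOperand V1{false, 0, 101};
  std::printf("%d %d %d %d\n",
              isVLKnownLE(Four, Eight),   // 1: 4 <= 8
              isVLKnownLE(Eight, Four),   // 0
              isVLKnownLE(V1, VLMax),     // 1: any VL <= VLMAX
              isVLKnownLE(VLMax, Eight)); // 0: VLMAX may exceed 8
}

Every "unknown" answer is a safe refusal: callers such as tryToReduceVL simply keep the original VL when the order cannot be proven.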
+bool RISCV::isVLKnownLE(const MachineOperand &LHS, const MachineOperand &RHS) { + if (LHS.isReg() && RHS.isReg() && LHS.getReg().isVirtual() && + LHS.getReg() == RHS.getReg()) + return true; + if (RHS.isImm() && RHS.getImm() == RISCV::VLMaxSentinel) + return true; + if (LHS.isImm() && LHS.getImm() == RISCV::VLMaxSentinel) + return false; + if (!LHS.isImm() || !RHS.isImm()) + return false; + return LHS.getImm() <= RHS.getImm(); +} diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.h b/llvm/lib/Target/RISCV/RISCVInstrInfo.h index 542e9b0b7d844d1e1dee29b7008d36d15a071b67..0f3f66b27657b73965e02fdbd8f2a3e5c313d0ed 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.h +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.h @@ -346,6 +346,9 @@ unsigned getDestLog2EEW(const MCInstrDesc &Desc, unsigned Log2SEW); // Special immediate for AVL operand of V pseudo instructions to indicate VLMax. static constexpr int64_t VLMaxSentinel = -1LL; +/// Given two VL operands, do we know that LHS <= RHS? +bool isVLKnownLE(const MachineOperand &LHS, const MachineOperand &RHS); + // Mask assignments for floating-point static constexpr unsigned FPMASK_Negative_Infinity = 0x001; static constexpr unsigned FPMASK_Negative_Normal = 0x002; diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoC.td b/llvm/lib/Target/RISCV/RISCVInstrInfoC.td index e8c4860fd3e55ee707a6e6584d7bf4534c9a7ae3..8a76dba23d42a7612f44477917773af2968c1e5b 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoC.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoC.td @@ -418,15 +418,11 @@ def C_ADDI : RVInst16CI<0b000, 0b01, (outs GPRNoX0:$rd_wb), let Inst{6-2} = imm{4-0}; } -let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in -def C_ADDI_NOP : RVInst16CI<0b000, 0b01, (outs GPRX0:$rd_wb), - (ins GPRX0:$rd, immzero:$imm), - "c.addi", "$rd, $imm">, - Sched<[WriteIALU, ReadIALU]> { - let Constraints = "$rd = $rd_wb"; - let Inst{6-2} = 0; - let isAsmParserOnly = 1; -} +// Alternate syntax for c.nop. Converted to C_NOP by the assembler. 
+let hasSideEffects = 0, mayLoad = 0, mayStore = 0, isCodeGenOnly = 0, + isAsmParserOnly = 1 in +def PseudoC_ADDI_NOP : Pseudo<(outs GPRX0:$rd), (ins GPRX0:$rs1, immzero:$imm), + [], "c.addi", "$rd, $imm">; let hasSideEffects = 0, mayLoad = 0, mayStore = 0, isCall = 1, DecoderNamespace = "RISCV32Only_", Defs = [X1], diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp index 3d68f6014f9ceba62f2993204ba38c0c6855ce5b..864c589027009594fe484cc7ac12387a9f56944c 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp @@ -1292,6 +1292,10 @@ RISCVTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, return getCmpSelInstrCost(*FOp, ICA.getReturnType(), ICA.getArgTypes()[0], CmpInst::BAD_ICMP_PREDICATE, CostKind); } + case Intrinsic::vp_merge: + return getCmpSelInstrCost(Instruction::Select, ICA.getReturnType(), + ICA.getArgTypes()[0], CmpInst::BAD_ICMP_PREDICATE, + CostKind); } if (ST->hasVInstructions() && RetTy->isVectorTy()) { diff --git a/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp b/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp index 088f6d62dcbe78d826458fa0b0c887a15948063a..2141a6ed1ed758b5e58bf2e91a6a0b8b16b79493 100644 --- a/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp +++ b/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp @@ -51,7 +51,7 @@ public: StringRef getPassName() const override { return PASS_NAME; } private: - bool checkUsers(std::optional &CommonVL, MachineInstr &MI); + bool checkUsers(const MachineOperand *&CommonVL, MachineInstr &MI); bool tryReduceVL(MachineInstr &MI); bool isCandidate(const MachineInstr &MI) const; }; @@ -114,10 +114,10 @@ struct OperandInfo { return; } assert(EMUL && "Expected EMUL to have value"); - OS << "EMUL: "; + OS << "EMUL: m"; if (EMUL->second) - OS << "m"; - OS << "f" << EMUL->first; + OS << "f"; + OS << EMUL->first; OS << ", EEW: " << (1 << Log2EEW); } }; @@ -563,7 +563,12 @@ static bool isSupportedInstr(const MachineInstr &MI) { case RISCV::VREM_VV: case RISCV::VREM_VX: // Vector Widening Integer Multiply Instructions - // FIXME: Add support + case RISCV::VWMUL_VV: + case RISCV::VWMUL_VX: + case RISCV::VWMULSU_VV: + case RISCV::VWMULSU_VX: + case RISCV::VWMULU_VV: + case RISCV::VWMULU_VX: // Vector Single-Width Integer Multiply-Add Instructions // FIXME: Add support // Vector Widening Integer Multiply-Add Instructions @@ -653,10 +658,34 @@ bool RISCVVLOptimizer::isCandidate(const MachineInstr &MI) const { if (MI.getNumDefs() != 1) return false; + // If we're not using VLMAX, then we need to be careful whether we are using + // TA/TU when there is a non-undef Passthru. But when we are using VLMAX, it + // does not matter whether we are using TA/TU with a non-undef Passthru, since + // there are no tail elements to be perserved. unsigned VLOpNum = RISCVII::getVLOpNum(Desc); const MachineOperand &VLOp = MI.getOperand(VLOpNum); - if (!VLOp.isImm() || VLOp.getImm() != RISCV::VLMaxSentinel) + if (VLOp.isReg() || VLOp.getImm() != RISCV::VLMaxSentinel) { + // If MI has a non-undef passthru, we will not try to optimize it since + // that requires us to preserve tail elements according to TA/TU. + // Otherwise, The MI has an undef Passthru, so it doesn't matter whether we + // are using TA/TU. 
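The TA/TU reasoning above is the crux of the candidate check. A standalone simulation of tail-undisturbed semantics shows what shrinking VL would do under a defined passthru:

#include <cstdio>

// Model of why a non-undef passthru blocks VL reduction when VL != VLMAX:
// with tail-undisturbed (TU) semantics, lanes >= VL come from the passthru,
// so shrinking VL silently grows the tail region and changes results.
enum { VLMAX = 8 };

static void vaddTU(int *Dst, const int *Passthru, const int *A, const int *B,
                   int VL) {
  for (int I = 0; I < VLMAX; ++I)
    Dst[I] = I < VL ? A[I] + B[I] : Passthru[I]; // tail taken from passthru
}

int main() {
  int A[VLMAX] = {1, 1, 1, 1, 1, 1, 1, 1};
  int B[VLMAX] = {2, 2, 2, 2, 2, 2, 2, 2};
  int Passthru[VLMAX] = {9, 9, 9, 9, 9, 9, 9, 9};
  int D6[VLMAX], D4[VLMAX];
  vaddTU(D6, Passthru, A, B, /*VL=*/6);
  vaddTU(D4, Passthru, A, B, /*VL=*/4); // reduced VL: lanes 4 and 5 differ
  for (int I = 0; I < VLMAX; ++I)
    std::printf("lane %d: VL6=%d VL4=%d\n", I, D6[I], D4[I]);
  // Only an undef passthru (no defined tail to preserve) or VL == VLMAX
  // (no tail at all) makes the reduction unobservable.
}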
+ bool HasPassthru = RISCVII::isFirstDefTiedToFirstUse(Desc); + unsigned PassthruOpIdx = MI.getNumExplicitDefs(); + if (HasPassthru && + MI.getOperand(PassthruOpIdx).getReg() != RISCV::NoRegister) { + LLVM_DEBUG( + dbgs() << " Not a candidate because it uses non-undef passthru" + " with non-VLMAX VL\n"); + return false; + } + } + + // If the VL is 1, then there is no need to reduce it. This is an + // optimization, not needed to preserve correctness. + if (VLOp.isImm() && VLOp.getImm() == 1) { + LLVM_DEBUG(dbgs() << " Not a candidate because VL is already 1\n"); return false; + } // Some instructions that produce vectors have semantics that make it more // difficult to determine whether the VL can be reduced. For example, some @@ -679,7 +708,7 @@ bool RISCVVLOptimizer::isCandidate(const MachineInstr &MI) const { return true; } -bool RISCVVLOptimizer::checkUsers(std::optional &CommonVL, +bool RISCVVLOptimizer::checkUsers(const MachineOperand *&CommonVL, MachineInstr &MI) { // FIXME: Avoid visiting each user for each time we visit something on the // worklist, combined with an extra visit from the outer loop. Restructure @@ -725,16 +754,17 @@ bool RISCVVLOptimizer::checkUsers(std::optional &CommonVL, unsigned VLOpNum = RISCVII::getVLOpNum(Desc); const MachineOperand &VLOp = UserMI.getOperand(VLOpNum); - // Looking for a register VL that isn't X0. - if (!VLOp.isReg() || VLOp.getReg() == RISCV::X0) { - LLVM_DEBUG(dbgs() << " Abort due to user uses X0 as VL.\n"); - CanReduceVL = false; - break; - } + + // Looking for an immediate or a register VL that isn't X0. + assert((!VLOp.isReg() || VLOp.getReg() != RISCV::X0) && + "Did not expect X0 VL"); if (!CommonVL) { - CommonVL = VLOp.getReg(); - } else if (*CommonVL != VLOp.getReg()) { + CommonVL = &VLOp; + LLVM_DEBUG(dbgs() << " User VL is: " << VLOp << "\n"); + } else if (!CommonVL->isIdenticalTo(VLOp)) { + // FIXME: This check requires all users to have the same VL. We can relax + // this and get the largest VL amongst all users. 
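The FIXME is not implemented by this patch; reusing the VLOperand model and isVLKnownLE from the earlier sketch, the relaxation it suggests could look roughly like this (hypothetical, untested against the pass's invariants):

// Instead of requiring every user to agree on one VL, keep the largest VL
// seen so far and reduce the producer to that join point.
static bool joinUserVL(VLOperand &CommonVL, const VLOperand &UserVL,
                       bool First) {
  if (First) { CommonVL = UserVL; return true; }
  if (isVLKnownLE(UserVL, CommonVL)) return true;   // already covered
  if (isVLKnownLE(CommonVL, UserVL)) { CommonVL = UserVL; return true; }
  return false; // incomparable (e.g. two unrelated registers): give up
}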
LLVM_DEBUG(dbgs() << " Abort because users have different VL\n"); CanReduceVL = false; break; @@ -771,7 +801,7 @@ bool RISCVVLOptimizer::tryReduceVL(MachineInstr &OrigMI) { MachineInstr &MI = *Worklist.pop_back_val(); LLVM_DEBUG(dbgs() << "Trying to reduce VL for " << MI << "\n"); - std::optional CommonVL; + const MachineOperand *CommonVL = nullptr; bool CanReduceVL = true; if (isVectorRegClass(MI.getOperand(0).getReg(), MRI)) CanReduceVL = checkUsers(CommonVL, MI); @@ -779,21 +809,34 @@ bool RISCVVLOptimizer::tryReduceVL(MachineInstr &OrigMI) { if (!CanReduceVL || !CommonVL) continue; - if (!CommonVL->isVirtual()) { - LLVM_DEBUG( - dbgs() << " Abort due to new VL is not virtual register.\n"); + assert((CommonVL->isImm() || CommonVL->getReg().isVirtual()) && + "Expected VL to be an Imm or virtual Reg"); + + unsigned VLOpNum = RISCVII::getVLOpNum(MI.getDesc()); + MachineOperand &VLOp = MI.getOperand(VLOpNum); + + if (!RISCV::isVLKnownLE(*CommonVL, VLOp)) { + LLVM_DEBUG(dbgs() << " Abort due to CommonVL not <= VLOp.\n"); continue; } - const MachineInstr *VLMI = MRI->getVRegDef(*CommonVL); - if (!MDT->dominates(VLMI, &MI)) - continue; + if (CommonVL->isImm()) { + LLVM_DEBUG(dbgs() << " Reduce VL from " << VLOp << " to " + << CommonVL->getImm() << " for " << MI << "\n"); + VLOp.ChangeToImmediate(CommonVL->getImm()); + } else { + const MachineInstr *VLMI = MRI->getVRegDef(CommonVL->getReg()); + if (!MDT->dominates(VLMI, &MI)) + continue; + LLVM_DEBUG( + dbgs() << " Reduce VL from " << VLOp << " to " + << printReg(CommonVL->getReg(), MRI->getTargetRegisterInfo()) + << " for " << MI << "\n"); + + // All our checks passed. We can reduce VL. + VLOp.ChangeToRegister(CommonVL->getReg(), false); + } - // All our checks passed. We can reduce VL. - LLVM_DEBUG(dbgs() << " Reducing VL for: " << MI << "\n"); - unsigned VLOpNum = RISCVII::getVLOpNum(MI.getDesc()); - MachineOperand &VLOp = MI.getOperand(VLOpNum); - VLOp.ChangeToRegister(*CommonVL, false); MadeChange = true; // Now add all inputs to this instruction to the worklist. diff --git a/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp b/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp index b883c50beadc095af721370131a1f97ff3b01cdd..a57bc5a3007d03b407bb11a5277a3d0effc46b84 100644 --- a/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp +++ b/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp @@ -86,20 +86,6 @@ char RISCVVectorPeephole::ID = 0; INITIALIZE_PASS(RISCVVectorPeephole, DEBUG_TYPE, "RISC-V Fold Masks", false, false) -/// Given two VL operands, do we know that LHS <= RHS? -static bool isVLKnownLE(const MachineOperand &LHS, const MachineOperand &RHS) { - if (LHS.isReg() && RHS.isReg() && LHS.getReg().isVirtual() && - LHS.getReg() == RHS.getReg()) - return true; - if (RHS.isImm() && RHS.getImm() == RISCV::VLMaxSentinel) - return true; - if (LHS.isImm() && LHS.getImm() == RISCV::VLMaxSentinel) - return false; - if (!LHS.isImm() || !RHS.isImm()) - return false; - return LHS.getImm() <= RHS.getImm(); -} - /// Given \p User that has an input operand with EEW=SEW, which uses the dest /// operand of \p Src with an unknown EEW, return true if their EEWs match. 
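EEW interacts with register grouping through the standard RVV ratio rule, which is also why the OperandInfo printer above distinguishes m<N> from mf<N>, and why the newly supported vwmul* instructions double the destination's EMUL. A self-contained model, assuming power-of-two widths and groupings:

#include <cstdio>

// RVV rule: an operand with element width EEW occupies EMUL registers, where
// EEW / SEW == EMUL / LMUL, so the element count stays invariant. Ratios are
// (value, isFractional) pairs matching the pass's EMUL encoding:
// "m2" = 2 registers, "mf2" = half a register.
struct Ratio {
  unsigned N;
  bool Fractional; // true means 1/N
};

static void printEMUL(Ratio EMUL) {
  std::printf("EMUL: m%s%u\n", EMUL.Fractional ? "f" : "", EMUL.N);
}

static Ratio emulFor(unsigned EEW, unsigned SEW, Ratio LMUL) {
  // Scale LMUL by EEW/SEW, keeping the m/mf normal form.
  unsigned Num = LMUL.Fractional ? EEW : EEW * LMUL.N;
  unsigned Den = LMUL.Fractional ? SEW * LMUL.N : SEW;
  if (Num >= Den)
    return {Num / Den, false};
  return {Den / Num, true};
}

int main() {
  // vwmul.vv at SEW=16, LMUL=1 writes EEW=32 with EMUL=2 ("m2"):
  printEMUL(emulFor(32, 16, {1, false}));
  // An EEW=8 operand under SEW=32, LMUL=1 uses a quarter register ("mf4"):
  printEMUL(emulFor(8, 32, {1, false}));
}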
bool RISCVVectorPeephole::hasSameEEW(const MachineInstr &User, @@ -191,7 +177,7 @@ bool RISCVVectorPeephole::tryToReduceVL(MachineInstr &MI) const { return false; MachineOperand &SrcVL = Src->getOperand(RISCVII::getVLOpNum(Src->getDesc())); - if (VL.isIdenticalTo(SrcVL) || !isVLKnownLE(VL, SrcVL)) + if (VL.isIdenticalTo(SrcVL) || !RISCV::isVLKnownLE(VL, SrcVL)) return false; if (!ensureDominates(VL, *Src)) @@ -580,7 +566,7 @@ bool RISCVVectorPeephole::foldUndefPassthruVMV_V_V(MachineInstr &MI) { MachineOperand &SrcPolicy = Src->getOperand(RISCVII::getVecPolicyOpNum(Src->getDesc())); - if (isVLKnownLE(MIVL, SrcVL)) + if (RISCV::isVLKnownLE(MIVL, SrcVL)) SrcPolicy.setImm(SrcPolicy.getImm() | RISCVII::TAIL_AGNOSTIC); } @@ -631,7 +617,7 @@ bool RISCVVectorPeephole::foldVMV_V_V(MachineInstr &MI) { // so we don't need to handle a smaller source VL here. However, the // user's VL may be larger MachineOperand &SrcVL = Src->getOperand(RISCVII::getVLOpNum(Src->getDesc())); - if (!isVLKnownLE(SrcVL, MI.getOperand(3))) + if (!RISCV::isVLKnownLE(SrcVL, MI.getOperand(3))) return false; // If the new passthru doesn't dominate Src, try to move Src so it does. @@ -650,7 +636,7 @@ bool RISCVVectorPeephole::foldVMV_V_V(MachineInstr &MI) { // If MI was tail agnostic and the VL didn't increase, preserve it. int64_t Policy = RISCVII::TAIL_UNDISTURBED_MASK_UNDISTURBED; if ((MI.getOperand(5).getImm() & RISCVII::TAIL_AGNOSTIC) && - isVLKnownLE(MI.getOperand(3), SrcVL)) + RISCV::isVLKnownLE(MI.getOperand(3), SrcVL)) Policy |= RISCVII::TAIL_AGNOSTIC; Src->getOperand(RISCVII::getVecPolicyOpNum(Src->getDesc())).setImm(Policy); diff --git a/llvm/lib/Target/SPIRV/SPIRVFrameLowering.h b/llvm/lib/Target/SPIRV/SPIRVFrameLowering.h index b98f8d0928e5b73e59b62ad1485e0ec26d311603..c7522554166a7eeea0630738e72c797f28f7cb84 100644 --- a/llvm/lib/Target/SPIRV/SPIRVFrameLowering.h +++ b/llvm/lib/Target/SPIRV/SPIRVFrameLowering.h @@ -33,7 +33,8 @@ public: void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override {} - bool hasFP(const MachineFunction &MF) const override { return false; } +protected: + bool hasFPImpl(const MachineFunction &MF) const override { return false; } }; } // namespace llvm #endif // LLVM_LIB_TARGET_SPIRV_SPIRVFRAMELOWERING_H diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp index fd92346717c415381f397584184251e1c28b0ba3..d9377fe4b91a1adeffab941ac114fa7e98153c48 100644 --- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp @@ -230,6 +230,9 @@ private: bool selectSpvThreadId(Register ResVReg, const SPIRVType *ResType, MachineInstr &I) const; + bool selectWaveReadLaneAt(Register ResVReg, const SPIRVType *ResType, + MachineInstr &I) const; + bool selectUnmergeValues(MachineInstr &I) const; void selectHandleFromBinding(Register &ResVReg, const SPIRVType *ResType, @@ -417,6 +420,7 @@ bool SPIRVInstructionSelector::spvSelect(Register ResVReg, case TargetOpcode::G_INTRINSIC: case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS: + case TargetOpcode::G_INTRINSIC_CONVERGENT: case TargetOpcode::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS: return selectIntrinsic(ResVReg, ResType, I); case TargetOpcode::G_BITREVERSE: @@ -1758,6 +1762,26 @@ bool SPIRVInstructionSelector::selectSign(Register ResVReg, return Result; } +bool SPIRVInstructionSelector::selectWaveReadLaneAt(Register ResVReg, + const SPIRVType *ResType, + MachineInstr &I) const { + assert(I.getNumOperands() == 4); + 
assert(I.getOperand(2).isReg()); + assert(I.getOperand(3).isReg()); + MachineBasicBlock &BB = *I.getParent(); + + // IntTy is used to define the execution scope, set to 3 to denote a + // cross-lane interaction equivalent to a SPIR-V subgroup. + SPIRVType *IntTy = GR.getOrCreateSPIRVIntegerType(32, I, TII); + return BuildMI(BB, I, I.getDebugLoc(), + TII.get(SPIRV::OpGroupNonUniformShuffle)) + .addDef(ResVReg) + .addUse(GR.getSPIRVTypeID(ResType)) + .addUse(GR.getOrCreateConstInt(3, I, IntTy, TII)) + .addUse(I.getOperand(2).getReg()) + .addUse(I.getOperand(3).getReg()); +} + bool SPIRVInstructionSelector::selectBitreverse(Register ResVReg, const SPIRVType *ResType, MachineInstr &I) const { @@ -2543,6 +2567,8 @@ bool SPIRVInstructionSelector::selectIntrinsic(Register ResVReg, .addUse(GR.getSPIRVTypeID(ResType)) .addUse(GR.getOrCreateConstInt(3, I, IntTy, TII)); } + case Intrinsic::spv_wave_readlane: + return selectWaveReadLaneAt(ResVReg, ResType, I); case Intrinsic::spv_step: return selectExtInst(ResVReg, ResType, I, CL::step, GL::Step); case Intrinsic::spv_radians: diff --git a/llvm/lib/Target/Sparc/SparcFrameLowering.cpp b/llvm/lib/Target/Sparc/SparcFrameLowering.cpp index 000418be9a9e3313566d2b7071b92af683c0648c..fa38c6cbb6ebbf759af7fa5124dc692af1bfa5fe 100644 --- a/llvm/lib/Target/Sparc/SparcFrameLowering.cpp +++ b/llvm/lib/Target/Sparc/SparcFrameLowering.cpp @@ -249,10 +249,10 @@ bool SparcFrameLowering::hasReservedCallFrame(const MachineFunction &MF) const { return !MF.getFrameInfo().hasVarSizedObjects(); } -// hasFP - Return true if the specified function should have a dedicated frame -// pointer register. This is true if the function has variable sized allocas or -// if frame pointer elimination is disabled. -bool SparcFrameLowering::hasFP(const MachineFunction &MF) const { +// hasFPImpl - Return true if the specified function should have a dedicated +// frame pointer register. This is true if the function has variable sized +// allocas or if frame pointer elimination is disabled. +bool SparcFrameLowering::hasFPImpl(const MachineFunction &MF) const { const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo(); const MachineFrameInfo &MFI = MF.getFrameInfo(); diff --git a/llvm/lib/Target/Sparc/SparcFrameLowering.h b/llvm/lib/Target/Sparc/SparcFrameLowering.h index ab0ceb6591c63cb9f329416553d890e8036b06aa..803856811969b262d13c561ec67f54a8bd0e8acf 100644 --- a/llvm/lib/Target/Sparc/SparcFrameLowering.h +++ b/llvm/lib/Target/Sparc/SparcFrameLowering.h @@ -35,7 +35,6 @@ public: MachineBasicBlock::iterator I) const override; bool hasReservedCallFrame(const MachineFunction &MF) const override; - bool hasFP(const MachineFunction &MF) const override; void determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs, RegScavenger *RS = nullptr) const override; @@ -47,6 +46,9 @@ public: /// time). bool targetHandlesStackFrameRounding() const override { return true; } +protected: + bool hasFPImpl(const MachineFunction &MF) const override; + private: // Remap input registers to output registers for leaf procedure. 
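The remapping named in the comment above comes from SPARC's register windows: a leaf procedure that omits save/restore never shifts its window, so incoming arguments stay in the caller's %o registers instead of appearing as %i. A standalone model of the fixed renumbering, using the conventional r-number encoding (%o0-%o7 = r8-r15, %i0-%i7 = r24-r31):

#include <cstdio>

// After `save`, the caller's outs (r8-r15) become the callee's ins
// (r24-r31). A leaf routine without `save` must keep using the %o numbers.
static unsigned remapForLeafProc(unsigned Reg) {
  const unsigned I0 = 24, O0 = 8;
  if (Reg >= I0 && Reg < I0 + 8)
    return Reg - I0 + O0; // %iN -> %oN
  return Reg;             // everything else is unaffected
}

int main() {
  std::printf("%%i0 (r24) -> r%u\n", remapForLeafProc(24)); // r8  == %o0
  std::printf("%%i5 (r29) -> r%u\n", remapForLeafProc(29)); // r13 == %o5
  std::printf("%%g1 (r1)  -> r%u\n", remapForLeafProc(1));  // unchanged
}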
void remapRegsForLeafProc(MachineFunction &MF) const; diff --git a/llvm/lib/Target/Sparc/SparcISelLowering.cpp b/llvm/lib/Target/Sparc/SparcISelLowering.cpp index 42b8248006d1fdcdfeff58344ad7ac9dbfc68233..de4986ef1e89e463c1da629b0d1f4404a2592a49 100644 --- a/llvm/lib/Target/Sparc/SparcISelLowering.cpp +++ b/llvm/lib/Target/Sparc/SparcISelLowering.cpp @@ -3548,9 +3548,9 @@ void SparcTargetLowering::ReplaceNodeResults(SDNode *N, } // Override to enable LOAD_STACK_GUARD lowering on Linux. -bool SparcTargetLowering::useLoadStackGuardNode() const { +bool SparcTargetLowering::useLoadStackGuardNode(const Module &M) const { if (!Subtarget->isTargetLinux()) - return TargetLowering::useLoadStackGuardNode(); + return TargetLowering::useLoadStackGuardNode(M); return true; } diff --git a/llvm/lib/Target/Sparc/SparcISelLowering.h b/llvm/lib/Target/Sparc/SparcISelLowering.h index 15d09bc9309754bf4a2d83a8bcad7dc0e68b6951..cc672074a4be804bf06f44c059cefbc0412f5916 100644 --- a/llvm/lib/Target/Sparc/SparcISelLowering.h +++ b/llvm/lib/Target/Sparc/SparcISelLowering.h @@ -119,7 +119,7 @@ namespace llvm { } /// Override to support customized stack guard loading. - bool useLoadStackGuardNode() const override; + bool useLoadStackGuardNode(const Module &M) const override; void insertSSPDeclarations(Module &M) const override; /// getSetCCResultType - Return the ISD::SETCC ValueType diff --git a/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp b/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp index 8c53b8dffc2fa60c27ccd66503efd31ab02bc6ec..8fbd05eab5f6ee2abe654b44aebbb2567f2a4db3 100644 --- a/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp +++ b/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp @@ -832,7 +832,7 @@ void SystemZELFFrameLowering::inlineStackProbe( } } -bool SystemZELFFrameLowering::hasFP(const MachineFunction &MF) const { +bool SystemZELFFrameLowering::hasFPImpl(const MachineFunction &MF) const { return (MF.getTarget().Options.DisableFramePointerElim(MF) || MF.getFrameInfo().hasVarSizedObjects()); } @@ -1449,7 +1449,12 @@ void SystemZXPLINKFrameLowering::inlineStackProbe( fullyRecomputeLiveIns({StackExtMBB, NextMBB}); } -bool SystemZXPLINKFrameLowering::hasFP(const MachineFunction &MF) const { +bool SystemZXPLINKFrameLowering::hasFPImpl(const MachineFunction &MF) const { + // Naked functions have no stack frame pushed, so we don't have a frame + // pointer. + if (MF.getFunction().hasFnAttribute(Attribute::Naked)) + return false; + return (MF.getFrameInfo().hasVarSizedObjects()); } diff --git a/llvm/lib/Target/SystemZ/SystemZFrameLowering.h b/llvm/lib/Target/SystemZ/SystemZFrameLowering.h index c4367b491f99efc5dd510d9c13b5c847c0183935..57fc73b78bbf7c5b89afce28dd5928258fceef57 100644 --- a/llvm/lib/Target/SystemZ/SystemZFrameLowering.h +++ b/llvm/lib/Target/SystemZ/SystemZFrameLowering.h @@ -86,7 +86,6 @@ public: void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override; void inlineStackProbe(MachineFunction &MF, MachineBasicBlock &PrologMBB) const override; - bool hasFP(const MachineFunction &MF) const override; StackOffset getFrameIndexReference(const MachineFunction &MF, int FI, Register &FrameReg) const override; void @@ -113,6 +112,9 @@ public: // Get or create the frame index of where the old frame pointer is stored. 
int getOrCreateFramePointerSaveIndex(MachineFunction &MF) const override; + +protected: + bool hasFPImpl(const MachineFunction &MF) const override; }; class SystemZXPLINKFrameLowering : public SystemZFrameLowering { @@ -147,8 +149,6 @@ public: void inlineStackProbe(MachineFunction &MF, MachineBasicBlock &PrologMBB) const override; - bool hasFP(const MachineFunction &MF) const override; - void processFunctionBeforeFrameFinalized(MachineFunction &MF, RegScavenger *RS) const override; @@ -167,6 +167,9 @@ public: // Get or create the frame index of where the old frame pointer is stored. int getOrCreateFramePointerSaveIndex(MachineFunction &MF) const override; + +protected: + bool hasFPImpl(const MachineFunction &MF) const override; }; } // end namespace llvm diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp index c7626434efac674e2c0aa5051be40899bd869489..83417e570dabf735793c0a4bcb55f8b08e3d3b56 100644 --- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp +++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp @@ -9863,7 +9863,7 @@ verifyNarrowIntegerArgs_Call(const SmallVectorImpl<ISD::OutputArg> &Outs, if (CalleeFn != nullptr) printFunctionArgExts(CalleeFn, errs()); else - errs() << "-"; + errs() << "-\n"; errs() << "Caller: "; printFunctionArgExts(F, errs()); llvm_unreachable(""); diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.h b/llvm/lib/Target/SystemZ/SystemZISelLowering.h index 2b065245c16f205829fadc89a7792ba6a9af7e3d..3c06c1fdf2b1bcac6eaddda9ce7694cab9197b50 100644 --- a/llvm/lib/Target/SystemZ/SystemZISelLowering.h +++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.h @@ -570,9 +570,7 @@ public: getExceptionSelectorRegister(const Constant *PersonalityFn) const override; /// Override to support customized stack guard loading. - bool useLoadStackGuardNode() const override { - return true; - } + bool useLoadStackGuardNode(const Module &M) const override { return true; } void insertSSPDeclarations(Module &M) const override { } diff --git a/llvm/lib/Target/SystemZ/SystemZTDC.cpp b/llvm/lib/Target/SystemZ/SystemZTDC.cpp index 345327e880ecd5d34e542bd7793bd82a08b7a2a5..c351c31b0a796f5f711c1e2e80bd91b749161e99 100644 --- a/llvm/lib/Target/SystemZ/SystemZTDC.cpp +++ b/llvm/lib/Target/SystemZ/SystemZTDC.cpp @@ -366,11 +366,10 @@ bool SystemZTDCPass::runOnFunction(Function &F) { if (!Worthy) continue; // Call the intrinsic, compare result with 0.
- Function *TDCFunc = Intrinsic::getOrInsertDeclaration( - &M, Intrinsic::s390_tdc, V->getType()); IRBuilder<> IRB(I); Value *MaskVal = ConstantInt::get(Type::getInt64Ty(Ctx), Mask); - Instruction *TDC = IRB.CreateCall(TDCFunc, {V, MaskVal}); + Instruction *TDC = + IRB.CreateIntrinsic(Intrinsic::s390_tdc, V->getType(), {V, MaskVal}); Value *ICmp = IRB.CreateICmp(CmpInst::ICMP_NE, TDC, Zero32); I->replaceAllUsesWith(ICmp); } diff --git a/llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp b/llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp index 53ed46f14f14dcc93501b942ed82642bd157aac1..f76f41768e886dcc05ac417b08b88bdf81299c7a 100644 --- a/llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp +++ b/llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp @@ -257,7 +257,7 @@ bool SystemZPassConfig::addInstSelector() { } bool SystemZPassConfig::addILPOpts() { - addPass(&EarlyIfConverterID); + addPass(&EarlyIfConverterLegacyID); if (EnableMachineCombinerPass) addPass(&MachineCombinerID); diff --git a/llvm/lib/Target/VE/VEFrameLowering.cpp b/llvm/lib/Target/VE/VEFrameLowering.cpp index 195bd4e6c3aee7ebd46497f55b2d5320d1957ae2..10e94c28072fdaa563134d068797cff7ae4cc2e2 100644 --- a/llvm/lib/Target/VE/VEFrameLowering.cpp +++ b/llvm/lib/Target/VE/VEFrameLowering.cpp @@ -415,10 +415,10 @@ void VEFrameLowering::emitEpilogue(MachineFunction &MF, emitEpilogueInsns(MF, MBB, MBBI, NumBytes, true); } -// hasFP - Return true if the specified function should have a dedicated frame -// pointer register. This is true if the function has variable sized allocas -// or if frame pointer elimination is disabled. -bool VEFrameLowering::hasFP(const MachineFunction &MF) const { +// hasFPImpl - Return true if the specified function should have a dedicated +// frame pointer register. This is true if the function has variable sized +// allocas or if frame pointer elimination is disabled. +bool VEFrameLowering::hasFPImpl(const MachineFunction &MF) const { const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo(); const MachineFrameInfo &MFI = MF.getFrameInfo(); diff --git a/llvm/lib/Target/VE/VEFrameLowering.h b/llvm/lib/Target/VE/VEFrameLowering.h index 36fc8b201b648cbb660ea5e3ac0c0b2e5a190d7c..be9cdc01d6f446b79660917b9d2d72b7343bebcc 100644 --- a/llvm/lib/Target/VE/VEFrameLowering.h +++ b/llvm/lib/Target/VE/VEFrameLowering.h @@ -39,7 +39,6 @@ public: eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const override; - bool hasFP(const MachineFunction &MF) const override; bool hasBP(const MachineFunction &MF) const; bool hasGOT(const MachineFunction &MF) const; @@ -69,6 +68,8 @@ public: protected: const VESubtarget &STI; + bool hasFPImpl(const MachineFunction &MF) const override; + private: // Returns true if MF is a leaf procedure. bool isLeafProc(MachineFunction &MF) const; diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyFrameLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyFrameLowering.cpp index 8f3ad167ae41fcfbee883e84e1dda5bb8293ef78..f0334ccb3afcb598ad8a7556e7a38d6dc4178cd6 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyFrameLowering.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyFrameLowering.cpp @@ -98,7 +98,7 @@ bool WebAssemblyFrameLowering::hasBP(const MachineFunction &MF) const { /// Return true if the specified function should have a dedicated frame pointer /// register. 
-bool WebAssemblyFrameLowering::hasFP(const MachineFunction &MF) const { +bool WebAssemblyFrameLowering::hasFPImpl(const MachineFunction &MF) const { const MachineFrameInfo &MFI = MF.getFrameInfo(); // When we have var-sized objects, we move the stack pointer by an unknown diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyFrameLowering.h b/llvm/lib/Target/WebAssembly/WebAssemblyFrameLowering.h index 528b33e34beeef5d3d87ddd5dec145751a6a9fd5..710d5173d64dba11618982e1534601aa80c948dc 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyFrameLowering.h +++ b/llvm/lib/Target/WebAssembly/WebAssemblyFrameLowering.h @@ -41,7 +41,6 @@ public: void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override; void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override; - bool hasFP(const MachineFunction &MF) const override; bool hasReservedCallFrame(const MachineFunction &MF) const override; bool isSupportedStackID(TargetStackID::Value ID) const override; DwarfFrameBase getDwarfFrameBase(const MachineFunction &MF) const override; @@ -68,6 +67,9 @@ public: static unsigned getOpcGlobGet(const MachineFunction &MF); static unsigned getOpcGlobSet(const MachineFunction &MF); +protected: + bool hasFPImpl(const MachineFunction &MF) const override; + private: bool hasBP(const MachineFunction &MF) const; bool needsSPForLocalFrame(const MachineFunction &MF) const; diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp index ba3ab5164af267b816ea69fef8edb1a59545635e..aaa52256707210bd711befb284f52d29741fbe80 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp @@ -213,6 +213,9 @@ struct RuntimeLibcallSignatureTable { Table[RTLIB::ATAN_F32] = f32_func_f32; Table[RTLIB::ATAN_F64] = f64_func_f64; Table[RTLIB::ATAN_F128] = i64_i64_func_i64_i64; + Table[RTLIB::ATAN2_F32] = f32_func_f32_f32; + Table[RTLIB::ATAN2_F64] = f64_func_f64_f64; + Table[RTLIB::ATAN2_F128] = i64_i64_func_i64_i64_i64_i64; Table[RTLIB::SINH_F32] = f32_func_f32; Table[RTLIB::SINH_F64] = f64_func_f64; Table[RTLIB::SINH_F128] = i64_i64_func_i64_i64; diff --git a/llvm/lib/Target/X86/X86FrameLowering.cpp b/llvm/lib/Target/X86/X86FrameLowering.cpp index 4f83267c999e4ac59950eae909929e9600625826..a35b04606e595d1f22cf83c5543e6a0b75962b0f 100644 --- a/llvm/lib/Target/X86/X86FrameLowering.cpp +++ b/llvm/lib/Target/X86/X86FrameLowering.cpp @@ -91,10 +91,10 @@ bool X86FrameLowering::needsFrameIndexResolution( MF.getInfo<X86MachineFunctionInfo>()->getHasPushSequences(); } -/// hasFP - Return true if the specified function should have a dedicated frame -/// pointer register. This is true if the function has variable sized allocas -/// or if frame pointer elimination is disabled. -bool X86FrameLowering::hasFP(const MachineFunction &MF) const { +/// hasFPImpl - Return true if the specified function should have a dedicated +/// frame pointer register. This is true if the function has variable sized +/// allocas or if frame pointer elimination is disabled.
+bool X86FrameLowering::hasFPImpl(const MachineFunction &MF) const { const MachineFrameInfo &MFI = MF.getFrameInfo(); return (MF.getTarget().Options.DisableFramePointerElim(MF) || TRI->hasStackRealignment(MF) || MFI.hasVarSizedObjects() || diff --git a/llvm/lib/Target/X86/X86FrameLowering.h b/llvm/lib/Target/X86/X86FrameLowering.h index 78217911dacadf9a7c2ae7b2d14e12f1e905e431..02fe8ee02a7e45e965d9e556c07e67f828c6de93 100644 --- a/llvm/lib/Target/X86/X86FrameLowering.h +++ b/llvm/lib/Target/X86/X86FrameLowering.h @@ -105,7 +105,6 @@ public: void spillFPBP(MachineFunction &MF) const override; - bool hasFP(const MachineFunction &MF) const override; bool hasReservedCallFrame(const MachineFunction &MF) const override; bool canSimplifyCallFramePseudos(const MachineFunction &MF) const override; bool needsFrameIndexResolution(const MachineFunction &MF) const override; @@ -201,6 +200,9 @@ public: /// frame of the top of stack function) as part of it's ABI. bool has128ByteRedZone(const MachineFunction& MF) const; +protected: + bool hasFPImpl(const MachineFunction &MF) const override; + private: bool isWin64Prologue(const MachineFunction &MF) const; diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 5b4b27c888952c1bfec6f2f1e2c8a52f130ed837..102789a3e9521f45fe731f9ba5b30e7863dcbcf0 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -858,6 +858,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FASIN , MVT::f80, Expand); setOperationAction(ISD::FACOS , MVT::f80, Expand); setOperationAction(ISD::FATAN , MVT::f80, Expand); + setOperationAction(ISD::FATAN2 , MVT::f80, Expand); setOperationAction(ISD::FSINH , MVT::f80, Expand); setOperationAction(ISD::FCOSH , MVT::f80, Expand); setOperationAction(ISD::FTANH , MVT::f80, Expand); @@ -2562,6 +2563,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, {ISD::FACOS, ISD::STRICT_FACOS, ISD::FASIN, ISD::STRICT_FASIN, ISD::FATAN, ISD::STRICT_FATAN, + ISD::FATAN2, ISD::STRICT_FATAN2, ISD::FCEIL, ISD::STRICT_FCEIL, ISD::FCOS, ISD::STRICT_FCOS, ISD::FCOSH, ISD::STRICT_FCOSH, @@ -2680,7 +2682,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, } // This has so far only been implemented for 64-bit MachO. -bool X86TargetLowering::useLoadStackGuardNode() const { +bool X86TargetLowering::useLoadStackGuardNode(const Module &M) const { return Subtarget.isTargetMachO() && Subtarget.is64Bit(); } @@ -31188,7 +31190,6 @@ void X86TargetLowering::emitBitTestAtomicRMWIntrinsic(AtomicRMWInst *AI) const { LLVMContext &Ctx = AI->getContext(); Value *Addr = Builder.CreatePointerCast(AI->getPointerOperand(), PointerType::getUnqual(Ctx)); - Function *BitTest = nullptr; Value *Result = nullptr; auto BitTested = FindSingleBitChange(AI->getValOperand()); assert(BitTested.first != nullptr); @@ -31196,15 +31197,10 @@ void X86TargetLowering::emitBitTestAtomicRMWIntrinsic(AtomicRMWInst *AI) const { if (BitTested.second == ConstantBit || BitTested.second == NotConstantBit) { auto *C = cast<ConstantInt>(I->getOperand(I->getOperand(0) == AI ?
1 : 0)); - BitTest = Intrinsic::getOrInsertDeclaration(AI->getModule(), IID_C, - AI->getType()); - unsigned Imm = llvm::countr_zero(C->getZExtValue()); - Result = Builder.CreateCall(BitTest, {Addr, Builder.getInt8(Imm)}); + Result = Builder.CreateIntrinsic(IID_C, AI->getType(), + {Addr, Builder.getInt8(Imm)}); } else { - BitTest = Intrinsic::getOrInsertDeclaration(AI->getModule(), IID_I, - AI->getType()); - assert(BitTested.second == ShiftBit || BitTested.second == NotShiftBit); Value *SI = BitTested.first; @@ -31221,7 +31217,7 @@ void X86TargetLowering::emitBitTestAtomicRMWIntrinsic(AtomicRMWInst *AI) const { // << (X % sizeof_bits(X)) we can drop the shift mask and AGEN in // favor of just a raw BT{S|R|C}. - Result = Builder.CreateCall(BitTest, {Addr, BitPos}); + Result = Builder.CreateIntrinsic(IID_I, AI->getType(), {Addr, BitPos}); Result = Builder.CreateZExtOrTrunc(Result, AI->getType()); // If the result is only used for zero/non-zero status then we don't need to @@ -31362,12 +31358,11 @@ void X86TargetLowering::emitCmpArithAtomicRMWIntrinsic( IID = Intrinsic::x86_atomic_xor_cc; break; } - Function *CmpArith = - Intrinsic::getOrInsertDeclaration(AI->getModule(), IID, AI->getType()); Value *Addr = Builder.CreatePointerCast(AI->getPointerOperand(), PointerType::getUnqual(Ctx)); - Value *Call = Builder.CreateCall( - CmpArith, {Addr, AI->getValOperand(), Builder.getInt32((unsigned)CC)}); + Value *Call = Builder.CreateIntrinsic( + IID, AI->getType(), + {Addr, AI->getValOperand(), Builder.getInt32((unsigned)CC)}); Value *Result = Builder.CreateTrunc(Call, Type::getInt1Ty(Ctx)); ICI->replaceAllUsesWith(Result); ICI->eraseFromParent(); @@ -52743,8 +52738,8 @@ static SDValue combineFMulcFCMulc(SDNode *N, SelectionDAG &DAG, if (XOR->getOpcode() == ISD::XOR && XOR.hasOneUse()) { KnownBits XORRHS = DAG.computeKnownBits(XOR.getOperand(1)); if (XORRHS.isConstant()) { - APInt ConjugationInt32 = APInt(32, 0x80000000, true); - APInt ConjugationInt64 = APInt(64, 0x8000000080000000ULL, true); + APInt ConjugationInt32 = APInt(32, 0x80000000); + APInt ConjugationInt64 = APInt(64, 0x8000000080000000ULL); if ((XORRHS.getBitWidth() == 32 && XORRHS.getConstant() == ConjugationInt32) || (XORRHS.getBitWidth() == 64 && @@ -52783,7 +52778,7 @@ static SDValue combineFaddCFmul(SDNode *N, SelectionDAG &DAG, Flags.hasNoSignedZeros(); }; auto IsVectorAllNegativeZero = [&DAG](SDValue Op) { - APInt AI = APInt(32, 0x80008000, true); + APInt AI = APInt(32, 0x80008000); KnownBits Bits = DAG.computeKnownBits(Op); return Bits.getBitWidth() == 32 && Bits.isConstant() && Bits.getConstant() == AI; diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h index 3b1bd0ad9a267ed83f3915de95c99821add58b72..14ada1721fd40e2b06f52eff19fdba9f13b04d96 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.h +++ b/llvm/lib/Target/X86/X86ISelLowering.h @@ -1568,7 +1568,7 @@ namespace llvm { /// returns the address of that location. Otherwise, returns nullptr. 
Value *getIRStackGuard(IRBuilderBase &IRB) const override; - bool useLoadStackGuardNode() const override; + bool useLoadStackGuardNode(const Module &M) const override; bool useStackGuardXorFP() const override; void insertSSPDeclarations(Module &M) const override; Value *getSDagStackGuard(const Module &M) const override; diff --git a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp index 8561658379f7e48066e7ad51c1848a7e9e9e6c43..12cd92e2d0d773610c43e432f22fee05c3d01df3 100644 --- a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp +++ b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp @@ -2856,6 +2856,13 @@ bool X86TargetLowering::IsEligibleForTailCallOptimization( return false; } + // The stack frame of the caller cannot be replaced by the tail-callee one's + // if the function is required to preserve all the registers. Conservatively + // prevent tail optimization even if hypothetically all the registers are used + // for passing formal parameters or returning values. + if (CallerF.hasFnAttribute("no_caller_saved_registers")) + return false; + unsigned StackArgsSize = CCInfo.getStackSize(); // If the callee takes no arguments then go on to check the results of the diff --git a/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp b/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp index c4374984da4b9eaa514b0dfa936491be89d389ae..7c9738bf0821647619c662a287fe0cf085464124 100644 --- a/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp +++ b/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp @@ -1875,10 +1875,7 @@ static Value *simplifyX86extrq(IntrinsicInst &II, Value *Op0, // If we were an EXTRQ call, we'll save registers if we convert to EXTRQI. if (II.getIntrinsicID() == Intrinsic::x86_sse4a_extrq) { Value *Args[] = {Op0, CILength, CIIndex}; - Module *M = II.getModule(); - Function *F = - Intrinsic::getOrInsertDeclaration(M, Intrinsic::x86_sse4a_extrqi); - return Builder.CreateCall(F, Args); + return Builder.CreateIntrinsic(Intrinsic::x86_sse4a_extrqi, {}, Args); } } @@ -1975,10 +1972,7 @@ static Value *simplifyX86insertq(IntrinsicInst &II, Value *Op0, Value *Op1, Constant *CIIndex = ConstantInt::get(IntTy8, Index, false); Value *Args[] = {Op0, Op1, CILength, CIIndex}; - Module *M = II.getModule(); - Function *F = - Intrinsic::getOrInsertDeclaration(M, Intrinsic::x86_sse4a_insertqi); - return Builder.CreateCall(F, Args); + return Builder.CreateIntrinsic(Intrinsic::x86_sse4a_insertqi, {}, Args); } return nullptr; diff --git a/llvm/lib/Target/X86/X86TargetMachine.cpp b/llvm/lib/Target/X86/X86TargetMachine.cpp index ceb87a62dba01140c9ff95cfec88ab18cc78363f..4ba0ac11d209532b03ff4fe15793040eb98446c3 100644 --- a/llvm/lib/Target/X86/X86TargetMachine.cpp +++ b/llvm/lib/Target/X86/X86TargetMachine.cpp @@ -536,7 +536,7 @@ bool X86PassConfig::addGlobalInstructionSelect() { } bool X86PassConfig::addILPOpts() { - addPass(&EarlyIfConverterID); + addPass(&EarlyIfConverterLegacyID); if (EnableMachineCombinerPass) addPass(&MachineCombinerID); addPass(createX86CmovConverterPass()); diff --git a/llvm/lib/Target/X86/X86WinEHState.cpp b/llvm/lib/Target/X86/X86WinEHState.cpp index 05fc6f13129f24e7a225c1c60419475e04cb3f7a..bc9fd801f94b2245902b5c2ac37216684bfb91cc 100644 --- a/llvm/lib/Target/X86/X86WinEHState.cpp +++ b/llvm/lib/Target/X86/X86WinEHState.cpp @@ -333,12 +333,10 @@ void WinEHStatePass::emitExceptionRegistrationRecord(Function *F) { // If using _except_handler4, the EHGuard contains: FramePtr xor Cookie. 
if (UseStackGuard) { Value *Val = Builder.CreateLoad(Int32Ty, Cookie); - Value *FrameAddr = Builder.CreateCall( - Intrinsic::getOrInsertDeclaration( - TheModule, Intrinsic::frameaddress, - Builder.getPtrTy( - TheModule->getDataLayout().getAllocaAddrSpace())), - Builder.getInt32(0), "frameaddr"); + Value *FrameAddr = Builder.CreateIntrinsic( + Intrinsic::frameaddress, + Builder.getPtrTy(TheModule->getDataLayout().getAllocaAddrSpace()), + Builder.getInt32(0), /*FMFSource=*/nullptr, "frameaddr"); Value *FrameAddrI32 = Builder.CreatePtrToInt(FrameAddr, Int32Ty); FrameAddrI32 = Builder.CreateXor(FrameAddrI32, Val); Builder.CreateStore(FrameAddrI32, EHGuardNode); @@ -369,8 +367,7 @@ void WinEHStatePass::emitExceptionRegistrationRecord(Function *F) { } Value *WinEHStatePass::emitEHLSDA(IRBuilder<> &Builder, Function *F) { - return Builder.CreateCall( - Intrinsic::getOrInsertDeclaration(TheModule, Intrinsic::x86_seh_lsda), F); + return Builder.CreateIntrinsic(Intrinsic::x86_seh_lsda, {}, F); } /// Generate a thunk that puts the LSDA of ParentFunc in EAX and then calls @@ -624,17 +621,13 @@ void WinEHStatePass::addStateStores(Function &F, WinEHFuncInfo &FuncInfo) { // that it can recover the original frame pointer. IRBuilder<> Builder(RegNode->getNextNode()); Value *RegNodeI8 = Builder.CreateBitCast(RegNode, Builder.getPtrTy()); - Builder.CreateCall(Intrinsic::getOrInsertDeclaration( - TheModule, Intrinsic::x86_seh_ehregnode), - {RegNodeI8}); + Builder.CreateIntrinsic(Intrinsic::x86_seh_ehregnode, {}, {RegNodeI8}); if (EHGuardNode) { IRBuilder<> Builder(EHGuardNode->getNextNode()); Value *EHGuardNodeI8 = Builder.CreateBitCast(EHGuardNode, Builder.getPtrTy()); - Builder.CreateCall(Intrinsic::getOrInsertDeclaration( - TheModule, Intrinsic::x86_seh_ehguard), - {EHGuardNodeI8}); + Builder.CreateIntrinsic(Intrinsic::x86_seh_ehguard, {}, {EHGuardNodeI8}); } // Calculate state numbers. 
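A recurring cleanup in the hunks above replaces the two-step Intrinsic::getOrInsertDeclaration + IRBuilder::CreateCall sequence with the single IRBuilder::CreateIntrinsic helper. A minimal sketch of the before/after shape, assuming an in-scope Module *M, an IRBuilder<> named Builder, and an i32 Value *X (the names here are illustrative, not taken from any hunk):

  // Before: explicitly materialize the overloaded declaration, then call it.
  Function *CtpopDecl = Intrinsic::getOrInsertDeclaration(
      M, Intrinsic::ctpop, {Builder.getInt32Ty()}); // i32 @llvm.ctpop.i32(i32)
  Value *OldStyle = Builder.CreateCall(CtpopDecl, {X});

  // After: one call; the builder looks up or inserts the declaration itself.
  // The overload types and arguments are passed directly; FMFSource and Name
  // are optional trailing parameters.
  Value *NewStyle = Builder.CreateIntrinsic(Intrinsic::ctpop,
                                            {Builder.getInt32Ty()}, {X},
                                            /*FMFSource=*/nullptr, "ctpop");

Besides being shorter, this keeps the declaration lookup next to the call site, which is why the hunks above can drop their local Function * variables.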
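The frame-lowering hunks in this diff all follow one mechanical recipe: each target's public hasFP override is deleted and its body moves to a protected hasFPImpl override. This suggests a non-virtual-interface split in the TargetFrameLowering base class, sketched below under the assumption (the base-class change itself is not shown in this diff) that hasFP becomes a non-virtual entry point delegating to the new hook:

  // Hedged sketch of the implied base-class shape; X86 is shown as a
  // representative target. Because only hasFPImpl is virtual, common checks
  // can later be layered into hasFP without touching every target.
  class TargetFrameLowering {
  public:
    bool hasFP(const MachineFunction &MF) const { return hasFPImpl(MF); }

  protected:
    virtual bool hasFPImpl(const MachineFunction &MF) const = 0;
  };

  class X86FrameLowering : public TargetFrameLowering {
  protected:
    bool hasFPImpl(const MachineFunction &MF) const override;
  };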
diff --git a/llvm/lib/Target/XCore/XCoreFrameLowering.cpp b/llvm/lib/Target/XCore/XCoreFrameLowering.cpp index b3753692ac2a05422551b9ef574a850dc9bb0207..ec18eca82b52d174e362c73f9732ba700807f58e 100644 --- a/llvm/lib/Target/XCore/XCoreFrameLowering.cpp +++ b/llvm/lib/Target/XCore/XCoreFrameLowering.cpp @@ -215,7 +215,7 @@ XCoreFrameLowering::XCoreFrameLowering(const XCoreSubtarget &sti) // Do nothing } -bool XCoreFrameLowering::hasFP(const MachineFunction &MF) const { +bool XCoreFrameLowering::hasFPImpl(const MachineFunction &MF) const { return MF.getTarget().Options.DisableFramePointerElim(MF) || MF.getFrameInfo().hasVarSizedObjects(); } diff --git a/llvm/lib/Target/XCore/XCoreFrameLowering.h b/llvm/lib/Target/XCore/XCoreFrameLowering.h index a914d82e198947cdfc1b817d9317f7abf5ee1ada..b06a6f922cdde03a46e96a07a0258cc539fdfa41 100644 --- a/llvm/lib/Target/XCore/XCoreFrameLowering.h +++ b/llvm/lib/Target/XCore/XCoreFrameLowering.h @@ -46,8 +46,6 @@ namespace llvm { eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const override; - bool hasFP(const MachineFunction &MF) const override; - void determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs, RegScavenger *RS = nullptr) const override; @@ -58,6 +56,9 @@ namespace llvm { static int stackSlotSize() { return 4; } + + protected: + bool hasFPImpl(const MachineFunction &MF) const override; }; } diff --git a/llvm/lib/Target/XCore/XCoreLowerThreadLocal.cpp b/llvm/lib/Target/XCore/XCoreLowerThreadLocal.cpp index 3604774ddf35bfadd402f862cc1d228c0b8e52c2..62461d68ca1581169d66c33c0546748d9032a296 100644 --- a/llvm/lib/Target/XCore/XCoreLowerThreadLocal.cpp +++ b/llvm/lib/Target/XCore/XCoreLowerThreadLocal.cpp @@ -157,9 +157,7 @@ bool XCoreLowerThreadLocal::lowerGlobal(GlobalVariable *GV) { for (User *U : Users) { Instruction *Inst = cast<Instruction>(U); IRBuilder<> Builder(Inst); - Function *GetID = Intrinsic::getOrInsertDeclaration(GV->getParent(), - Intrinsic::xcore_getid); - Value *ThreadID = Builder.CreateCall(GetID, {}); + Value *ThreadID = Builder.CreateIntrinsic(Intrinsic::xcore_getid, {}, {}); Value *Addr = Builder.CreateInBoundsGEP(NewGV->getValueType(), NewGV, {Builder.getInt64(0), ThreadID}); U->replaceUsesOfWith(GV, Addr); diff --git a/llvm/lib/Target/Xtensa/XtensaFrameLowering.cpp b/llvm/lib/Target/Xtensa/XtensaFrameLowering.cpp index e24cb7714d364641cde0b584717bbf9a0dc70e41..f46d386c9186aac75a64ce99978e280aefd31e92 100644 --- a/llvm/lib/Target/Xtensa/XtensaFrameLowering.cpp +++ b/llvm/lib/Target/Xtensa/XtensaFrameLowering.cpp @@ -27,7 +27,7 @@ XtensaFrameLowering::XtensaFrameLowering(const XtensaSubtarget &STI) Align(4)), TII(*STI.getInstrInfo()), TRI(STI.getRegisterInfo()) {} -bool XtensaFrameLowering::hasFP(const MachineFunction &MF) const { +bool XtensaFrameLowering::hasFPImpl(const MachineFunction &MF) const { const MachineFrameInfo &MFI = MF.getFrameInfo(); return MF.getTarget().Options.DisableFramePointerElim(MF) || MFI.hasVarSizedObjects(); } diff --git a/llvm/lib/Target/Xtensa/XtensaFrameLowering.h b/llvm/lib/Target/Xtensa/XtensaFrameLowering.h index 9120215af08b528c0720e9bd15efc5c4c230f30b..3f946e1ea730f98cb8faaea333e74216d23c0053 100644 --- a/llvm/lib/Target/Xtensa/XtensaFrameLowering.h +++ b/llvm/lib/Target/Xtensa/XtensaFrameLowering.h @@ -24,8 +24,6 @@ class XtensaFrameLowering : public TargetFrameLowering { public: XtensaFrameLowering(const XtensaSubtarget &STI); - bool hasFP(const MachineFunction &MF) const override; - /// emitProlog/emitEpilog - These methods insert prolog
and epilog code into /// the function. void emitPrologue(MachineFunction &, MachineBasicBlock &) const override; @@ -50,6 +48,9 @@ public: void processFunctionBeforeFrameFinalized(MachineFunction &MF, RegScavenger *RS) const override; + +protected: + bool hasFPImpl(const MachineFunction &MF) const override; }; } // namespace llvm diff --git a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp index 898d55fab2b00d5ec1c9107bda2c3ab1132f55a9..b5b561797f75ab9a33e72c5ac9648baca720b8e1 100644 --- a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp +++ b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp @@ -172,9 +172,8 @@ static bool foldGuardedFunnelShift(Instruction &I, const DominatorTree &DT) { // %cond = phi i32 [ %fsh, %FunnelBB ], [ %ShVal0, %GuardBB ] // --> // llvm.fshl.i32(i32 %ShVal0, i32 %ShVal1, i32 %ShAmt) - Function *F = - Intrinsic::getOrInsertDeclaration(Phi.getModule(), IID, Phi.getType()); - Phi.replaceAllUsesWith(Builder.CreateCall(F, {ShVal0, ShVal1, ShAmt})); + Phi.replaceAllUsesWith( + Builder.CreateIntrinsic(IID, Phi.getType(), {ShVal0, ShVal1, ShAmt})); return true; } @@ -332,9 +331,8 @@ static bool tryToRecognizePopCount(Instruction &I) { m_SpecificInt(Mask55)))) { LLVM_DEBUG(dbgs() << "Recognized popcount intrinsic\n"); IRBuilder<> Builder(&I); - Function *Func = Intrinsic::getOrInsertDeclaration( - I.getModule(), Intrinsic::ctpop, I.getType()); - I.replaceAllUsesWith(Builder.CreateCall(Func, {Root})); + I.replaceAllUsesWith( + Builder.CreateIntrinsic(Intrinsic::ctpop, I.getType(), {Root})); ++NumPopCountRecognized; return true; } @@ -399,9 +397,8 @@ static bool tryToFPToSat(Instruction &I, TargetTransformInfo &TTI) { return false; IRBuilder<> Builder(&I); - Function *Fn = Intrinsic::getOrInsertDeclaration( - I.getModule(), Intrinsic::fptosi_sat, {SatTy, FpTy}); - Value *Sat = Builder.CreateCall(Fn, In); + Value *Sat = + Builder.CreateIntrinsic(Intrinsic::fptosi_sat, {SatTy, FpTy}, In); I.replaceAllUsesWith(Builder.CreateSExt(Sat, IntTy)); return true; } @@ -412,9 +409,6 @@ static bool tryToFPToSat(Instruction &I, TargetTransformInfo &TTI) { static bool foldSqrt(CallInst *Call, LibFunc Func, TargetTransformInfo &TTI, TargetLibraryInfo &TLI, AssumptionCache &AC, DominatorTree &DT) { - - Module *M = Call->getModule(); - // If (1) this is a sqrt libcall, (2) we can assume that NAN is not created // (because NNAN or the operand arg must not be less than -0.0) and (2) we // would not end up lowering to a libcall anyway (which could change the value @@ -432,8 +426,8 @@ static bool foldSqrt(CallInst *Call, LibFunc Func, TargetTransformInfo &TTI, IRBuilderBase::FastMathFlagGuard Guard(Builder); Builder.setFastMathFlags(Call->getFastMathFlags()); - Function *Sqrt = Intrinsic::getOrInsertDeclaration(M, Intrinsic::sqrt, Ty); - Value *NewSqrt = Builder.CreateCall(Sqrt, Arg, "sqrt"); + Value *NewSqrt = Builder.CreateIntrinsic(Intrinsic::sqrt, Ty, Arg, + /*FMFSource=*/nullptr, "sqrt"); Call->replaceAllUsesWith(NewSqrt); // Explicitly erase the old call because a call with side effects is not diff --git a/llvm/lib/Transforms/Coroutines/MaterializationUtils.cpp b/llvm/lib/Transforms/Coroutines/MaterializationUtils.cpp index c3ea0977d42117fe6b93017dc7d9c2e4a0dbebe4..6327cea64c0de6285b15a5ec9b14f7d1e64b362d 100644 --- a/llvm/lib/Transforms/Coroutines/MaterializationUtils.cpp +++ b/llvm/lib/Transforms/Coroutines/MaterializationUtils.cpp @@ -15,6 +15,7 @@ #include 
"llvm/IR/Dominators.h" #include "llvm/IR/InstIterator.h" #include "llvm/IR/Instruction.h" +#include "llvm/IR/ModuleSlotTracker.h" #include "llvm/Transforms/Coroutines/SpillUtils.h" #include @@ -104,19 +105,25 @@ struct RematGraph { } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) - static std::string getBasicBlockLabel(const BasicBlock *BB) { - if (BB->hasName()) - return BB->getName().str(); - - std::string S; - raw_string_ostream OS(S); - BB->printAsOperand(OS, false); - return OS.str().substr(1); + static void dumpBasicBlockLabel(const BasicBlock *BB, + ModuleSlotTracker &MST) { + if (BB->hasName()) { + dbgs() << BB->getName(); + return; + } + + dbgs() << MST.getLocalSlot(BB); } void dump() const { + BasicBlock *BB = EntryNode->Node->getParent(); + Function *F = BB->getParent(); + + ModuleSlotTracker MST(F->getParent()); + MST.incorporateFunction(*F); + dbgs() << "Entry ("; - dbgs() << getBasicBlockLabel(EntryNode->Node->getParent()); + dumpBasicBlockLabel(BB, MST); dbgs() << ") : " << *EntryNode->Node << "\n"; for (auto &E : Remats) { dbgs() << *(E.first) << "\n"; diff --git a/llvm/lib/Transforms/Coroutines/SuspendCrossingInfo.cpp b/llvm/lib/Transforms/Coroutines/SuspendCrossingInfo.cpp index f18f23306befb4e0bcf1dfa167b0afe7a4521ce2..c9bb3395a9949184f24b14558599409e14114edd 100644 --- a/llvm/lib/Transforms/Coroutines/SuspendCrossingInfo.cpp +++ b/llvm/lib/Transforms/Coroutines/SuspendCrossingInfo.cpp @@ -13,6 +13,7 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Coroutines/SuspendCrossingInfo.h" +#include "llvm/IR/ModuleSlotTracker.h" // The "coro-suspend-crossing" flag is very noisy. There is another debug type, // "coro-frame", which results in leaner debug spew. @@ -20,24 +21,26 @@ namespace llvm { #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -static std::string getBasicBlockLabel(const BasicBlock *BB) { - if (BB->hasName()) - return BB->getName().str(); - - std::string S; - raw_string_ostream OS(S); - BB->printAsOperand(OS, false); - return OS.str().substr(1); +static void dumpBasicBlockLabel(const BasicBlock *BB, ModuleSlotTracker &MST) { + if (BB->hasName()) { + dbgs() << BB->getName(); + return; + } + + dbgs() << MST.getLocalSlot(BB); } -LLVM_DUMP_METHOD void SuspendCrossingInfo::dump( - StringRef Label, BitVector const &BV, - const ReversePostOrderTraversal &RPOT) const { +LLVM_DUMP_METHOD void +SuspendCrossingInfo::dump(StringRef Label, BitVector const &BV, + const ReversePostOrderTraversal &RPOT, + ModuleSlotTracker &MST) const { dbgs() << Label << ":"; for (const BasicBlock *BB : RPOT) { auto BBNo = Mapping.blockToIndex(BB); - if (BV[BBNo]) - dbgs() << " " << getBasicBlockLabel(BB); + if (BV[BBNo]) { + dbgs() << " "; + dumpBasicBlockLabel(BB, MST); + } } dbgs() << "\n"; } @@ -49,12 +52,16 @@ LLVM_DUMP_METHOD void SuspendCrossingInfo::dump() const { BasicBlock *const B = Mapping.indexToBlock(0); Function *F = B->getParent(); + ModuleSlotTracker MST(F->getParent()); + MST.incorporateFunction(*F); + ReversePostOrderTraversal RPOT(F); for (const BasicBlock *BB : RPOT) { auto BBNo = Mapping.blockToIndex(BB); - dbgs() << getBasicBlockLabel(BB) << ":\n"; - dump(" Consumes", Block[BBNo].Consumes, RPOT); - dump(" Kills", Block[BBNo].Kills, RPOT); + dumpBasicBlockLabel(BB, MST); + dbgs() << ":\n"; + dump(" Consumes", Block[BBNo].Consumes, RPOT, MST); + dump(" Kills", Block[BBNo].Kills, RPOT, MST); } dbgs() << "\n"; } diff --git a/llvm/lib/Transforms/HipStdPar/HipStdPar.cpp 
b/llvm/lib/Transforms/HipStdPar/HipStdPar.cpp index d740500ef1f8f63fdf4f8e8da7e580332de73568..b909bf5b2d7b611389953fb7a5495f9d58470efc 100644 --- a/llvm/lib/Transforms/HipStdPar/HipStdPar.cpp +++ b/llvm/lib/Transforms/HipStdPar/HipStdPar.cpp @@ -86,15 +86,13 @@ static inline bool checkIfSupported(GlobalVariable &G) { auto U = std::move(Tmp.back()); Tmp.pop_back(); - if (Visited.contains(U)) + if (!Visited.insert(U).second) continue; if (isa<Instruction>(U)) I = cast<Instruction>(U); else Tmp.insert(Tmp.end(), U->user_begin(), U->user_end()); - - Visited.insert(U); } while (!I && !Tmp.empty()); assert(I && "thread_local global should have at least one non-constant use."); diff --git a/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp b/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp index afc13232ff195bb50b3ea23f16d65fca6870ec27..ac3f2bab5b096c2704d50b8e2437db616da7fcd9 100644 --- a/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp +++ b/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp @@ -101,7 +101,8 @@ using OffsetAndArgPart = std::pair<int64_t, ArgPart>; static Value *createByteGEP(IRBuilderBase &IRB, const DataLayout &DL, Value *Ptr, Type *ResElemTy, int64_t Offset) { if (Offset != 0) { - APInt APOffset(DL.getIndexTypeSizeInBits(Ptr->getType()), Offset); + APInt APOffset(DL.getIndexTypeSizeInBits(Ptr->getType()), Offset, + /*isSigned=*/true); Ptr = IRB.CreatePtrAdd(Ptr, IRB.getInt(APOffset)); } return Ptr; } diff --git a/llvm/lib/Transforms/IPO/CrossDSOCFI.cpp b/llvm/lib/Transforms/IPO/CrossDSOCFI.cpp index 9e5d9ea31af6c4cf2cb6899ce1ba00fae85f2155..e8c18435bfc6bc8aeb8b034cddb5c93bfbbca035 100644 --- a/llvm/lib/Transforms/IPO/CrossDSOCFI.cpp +++ b/llvm/lib/Transforms/IPO/CrossDSOCFI.cpp @@ -125,12 +125,11 @@ void CrossDSOCFI::buildCFICheck(Module &M) { ConstantInt *CaseTypeId = ConstantInt::get(Type::getInt64Ty(Ctx), TypeId); BasicBlock *TestBB = BasicBlock::Create(Ctx, "test", F); IRBuilder<> IRBTest(TestBB); - Function *BitsetTestFn = - Intrinsic::getOrInsertDeclaration(&M, Intrinsic::type_test); - Value *Test = IRBTest.CreateCall( - BitsetTestFn, {&Addr, MetadataAsValue::get( - Ctx, ConstantAsMetadata::get(CaseTypeId))}); + Value *Test = IRBTest.CreateIntrinsic( + Intrinsic::type_test, {}, + {&Addr, + MetadataAsValue::get(Ctx, ConstantAsMetadata::get(CaseTypeId))}); BranchInst *BI = IRBTest.CreateCondBr(Test, ExitBB, TrapBB); BI->setMetadata(LLVMContext::MD_prof, VeryLikelyWeights); diff --git a/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp b/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp index d1548592b1ce2643d9862e7f7779782014a80274..ed93b4491c50e45689abcd5b80c80ca85ab1cf09 100644 --- a/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp +++ b/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp @@ -857,9 +857,10 @@ bool DeadArgumentEliminationPass::removeDeadStuffFromFunction(Function *F) { // here. Currently, this should not be possible, but special handling might be // required when new return value attributes are added.
if (NRetTy->isVoidTy()) - RAttrs.remove(AttributeFuncs::typeIncompatible(NRetTy)); + RAttrs.remove(AttributeFuncs::typeIncompatible(NRetTy, PAL.getRetAttrs())); else - assert(!RAttrs.overlaps(AttributeFuncs::typeIncompatible(NRetTy)) && + assert(!RAttrs.overlaps( + AttributeFuncs::typeIncompatible(NRetTy, PAL.getRetAttrs())) && "Return attributes no longer compatible?"); AttributeSet RetAttrs = AttributeSet::get(F->getContext(), RAttrs); @@ -903,7 +904,8 @@ bool DeadArgumentEliminationPass::removeDeadStuffFromFunction(Function *F) { // Adjust the call return attributes in case the function was changed to // return void. AttrBuilder RAttrs(F->getContext(), CallPAL.getRetAttrs()); - RAttrs.remove(AttributeFuncs::typeIncompatible(NRetTy)); + RAttrs.remove( + AttributeFuncs::typeIncompatible(NRetTy, CallPAL.getRetAttrs())); AttributeSet RetAttrs = AttributeSet::get(F->getContext(), RAttrs); // Declare these outside of the loops, so we can reuse them for the second diff --git a/llvm/lib/Transforms/IPO/ExpandVariadics.cpp b/llvm/lib/Transforms/IPO/ExpandVariadics.cpp index a7a01ca1055dd377a1ae007df6dd079ece31b838..3121659edadd81b188ca15f5e0255f09d5a00414 100644 --- a/llvm/lib/Transforms/IPO/ExpandVariadics.cpp +++ b/llvm/lib/Transforms/IPO/ExpandVariadics.cpp @@ -145,9 +145,10 @@ public: // function here in the meantime to decouple from that discussion. Function *getPreexistingDeclaration(Module *M, Intrinsic::ID Id, ArrayRef<Type *> Tys = {}) { + if (Tys.empty()) + return Intrinsic::getDeclarationIfExists(M, Id); auto *FT = Intrinsic::getType(M->getContext(), Id, Tys); - return M->getFunction(Tys.empty() ? Intrinsic::getName(Id) - : Intrinsic::getName(Id, Tys, M, FT)); + return Intrinsic::getDeclarationIfExists(M, Id, Tys, FT); } class ExpandVariadics : public ModulePass { diff --git a/llvm/lib/Transforms/IPO/GlobalDCE.cpp b/llvm/lib/Transforms/IPO/GlobalDCE.cpp index e36d524d7667ab3e2667576f7e0a69a8cc346ad8..eca36fb31cea0e28d43c10bd4b7885b0406faa20 100644 --- a/llvm/lib/Transforms/IPO/GlobalDCE.cpp +++ b/llvm/lib/Transforms/IPO/GlobalDCE.cpp @@ -186,9 +186,9 @@ void GlobalDCEPass::ScanVTableLoad(Function *Caller, Metadata *TypeId, void GlobalDCEPass::ScanTypeCheckedLoadIntrinsics(Module &M) { LLVM_DEBUG(dbgs() << "Scanning type.checked.load intrinsics\n"); Function *TypeCheckedLoadFunc = - M.getFunction(Intrinsic::getName(Intrinsic::type_checked_load)); - Function *TypeCheckedLoadRelativeFunc = - M.getFunction(Intrinsic::getName(Intrinsic::type_checked_load_relative)); + Intrinsic::getDeclarationIfExists(&M, Intrinsic::type_checked_load); + Function *TypeCheckedLoadRelativeFunc = Intrinsic::getDeclarationIfExists( + &M, Intrinsic::type_checked_load_relative); auto scan = [&](Function *CheckedLoadFunc) { if (!CheckedLoadFunc) diff --git a/llvm/lib/Transforms/IPO/GlobalSplit.cpp b/llvm/lib/Transforms/IPO/GlobalSplit.cpp index fd49b745fd750ea77a14a766090c943d595b972b..320fd893935f848ac3eb7acaabc1470dfaea3935 100644 --- a/llvm/lib/Transforms/IPO/GlobalSplit.cpp +++ b/llvm/lib/Transforms/IPO/GlobalSplit.cpp @@ -174,11 +174,11 @@ static bool splitGlobals(Module &M) { // llvm.type.checked.load intrinsics, which indicates that splitting globals // may be beneficial.
Function *TypeTestFunc = - M.getFunction(Intrinsic::getName(Intrinsic::type_test)); + Intrinsic::getDeclarationIfExists(&M, Intrinsic::type_test); Function *TypeCheckedLoadFunc = - M.getFunction(Intrinsic::getName(Intrinsic::type_checked_load)); - Function *TypeCheckedLoadRelativeFunc = - M.getFunction(Intrinsic::getName(Intrinsic::type_checked_load_relative)); + Intrinsic::getDeclarationIfExists(&M, Intrinsic::type_checked_load); + Function *TypeCheckedLoadRelativeFunc = Intrinsic::getDeclarationIfExists( + &M, Intrinsic::type_checked_load_relative); if ((!TypeTestFunc || TypeTestFunc->use_empty()) && (!TypeCheckedLoadFunc || TypeCheckedLoadFunc->use_empty()) && (!TypeCheckedLoadRelativeFunc || diff --git a/llvm/lib/Transforms/IPO/LowerTypeTests.cpp b/llvm/lib/Transforms/IPO/LowerTypeTests.cpp index 519a4e9314a26bb6856981533da88f1fd55dc32d..3fcfc6a876776d9c09b96aad2ece782f18910b85 100644 --- a/llvm/lib/Transforms/IPO/LowerTypeTests.cpp +++ b/llvm/lib/Transforms/IPO/LowerTypeTests.cpp @@ -1970,7 +1970,7 @@ static void dropTypeTests(Module &M, Function &TypeTestFunc) { bool LowerTypeTestsModule::lower() { Function *TypeTestFunc = - M.getFunction(Intrinsic::getName(Intrinsic::type_test)); + Intrinsic::getDeclarationIfExists(&M, Intrinsic::type_test); if (DropTypeTests) { if (TypeTestFunc) @@ -1979,7 +1979,7 @@ bool LowerTypeTestsModule::lower() { // except for in the case where we originally were performing ThinLTO but // decided not to in the backend. Function *PublicTypeTestFunc = - M.getFunction(Intrinsic::getName(Intrinsic::public_type_test)); + Intrinsic::getDeclarationIfExists(&M, Intrinsic::public_type_test); if (PublicTypeTestFunc) dropTypeTests(M, *PublicTypeTestFunc); if (TypeTestFunc || PublicTypeTestFunc) { @@ -2002,7 +2002,7 @@ bool LowerTypeTestsModule::lower() { return false; Function *ICallBranchFunnelFunc = - M.getFunction(Intrinsic::getName(Intrinsic::icall_branch_funnel)); + Intrinsic::getDeclarationIfExists(&M, Intrinsic::icall_branch_funnel); if ((!TypeTestFunc || TypeTestFunc->use_empty()) && (!ICallBranchFunnelFunc || ICallBranchFunnelFunc->use_empty()) && !ExportSummary && !ImportSummary) diff --git a/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp b/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp index 9bf29c46938eba8020efe32e60e92dd78ccfba02..cd0e412bdf353b2e81cf8777972b5a8f7d6602ed 100644 --- a/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp +++ b/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp @@ -123,7 +123,7 @@ void promoteTypeIds(Module &M, StringRef ModuleId) { }; if (Function *TypeTestFunc = - M.getFunction(Intrinsic::getName(Intrinsic::type_test))) { + Intrinsic::getDeclarationIfExists(&M, Intrinsic::type_test)) { for (const Use &U : TypeTestFunc->uses()) { auto CI = cast<CallInst>(U.getUser()); ExternalizeTypeId(CI, 1); @@ -131,7 +131,7 @@ } if (Function *PublicTypeTestFunc = - M.getFunction(Intrinsic::getName(Intrinsic::public_type_test))) { + Intrinsic::getDeclarationIfExists(&M, Intrinsic::public_type_test)) { for (const Use &U : PublicTypeTestFunc->uses()) { auto CI = cast<CallInst>(U.getUser()); ExternalizeTypeId(CI, 1); @@ -139,15 +139,15 @@ } if (Function *TypeCheckedLoadFunc = - M.getFunction(Intrinsic::getName(Intrinsic::type_checked_load))) { + Intrinsic::getDeclarationIfExists(&M, Intrinsic::type_checked_load)) { for (const Use &U : TypeCheckedLoadFunc->uses()) { auto CI = cast<CallInst>(U.getUser()); ExternalizeTypeId(CI, 2); } } - if (Function
*TypeCheckedLoadRelativeFunc = M.getFunction( - Intrinsic::getName(Intrinsic::type_checked_load_relative))) { + if (Function *TypeCheckedLoadRelativeFunc = Intrinsic::getDeclarationIfExists( + &M, Intrinsic::type_checked_load_relative)) { for (const Use &U : TypeCheckedLoadRelativeFunc->uses()) { auto CI = cast<CallInst>(U.getUser()); ExternalizeTypeId(CI, 2); diff --git a/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp b/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp index 59f986b4ca2664c427289343fbb0f1e7c26f49cc..45d32218f362c568219ca03735d08720f0acdf38 100644 --- a/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp +++ b/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp @@ -851,7 +851,7 @@ void llvm::updateVCallVisibilityInModule( void llvm::updatePublicTypeTestCalls(Module &M, bool WholeProgramVisibilityEnabledInLTO) { Function *PublicTypeTestFunc = - M.getFunction(Intrinsic::getName(Intrinsic::public_type_test)); + Intrinsic::getDeclarationIfExists(&M, Intrinsic::public_type_test); if (!PublicTypeTestFunc) return; if (hasWholeProgramVisibility(WholeProgramVisibilityEnabledInLTO)) { @@ -2247,12 +2247,13 @@ bool DevirtModule::run() { return false; Function *TypeTestFunc = - M.getFunction(Intrinsic::getName(Intrinsic::type_test)); + Intrinsic::getDeclarationIfExists(&M, Intrinsic::type_test); Function *TypeCheckedLoadFunc = - M.getFunction(Intrinsic::getName(Intrinsic::type_checked_load)); - Function *TypeCheckedLoadRelativeFunc = - M.getFunction(Intrinsic::getName(Intrinsic::type_checked_load_relative)); - Function *AssumeFunc = M.getFunction(Intrinsic::getName(Intrinsic::assume)); + Intrinsic::getDeclarationIfExists(&M, Intrinsic::type_checked_load); + Function *TypeCheckedLoadRelativeFunc = Intrinsic::getDeclarationIfExists( + &M, Intrinsic::type_checked_load_relative); + Function *AssumeFunc = + Intrinsic::getDeclarationIfExists(&M, Intrinsic::assume); // Normally if there are no users of the devirtualization intrinsics in the // module, this pass has nothing to do. But if we are exporting, we also need diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp index 64bee4ab974ede5f02c424f30fc6b7afc2bef204..d72013ba223d09c517874808104521cb9f1b1637 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp @@ -2722,13 +2722,15 @@ Instruction *InstCombinerImpl::visitAnd(BinaryOperator &I) { return BinaryOperator::CreateAnd(Builder.CreateNot(A), B); } + if (Value *Res = + foldBooleanAndOr(Op0, Op1, I, /*IsAnd=*/true, /*IsLogical=*/false)) + return replaceInstUsesWith(I, Res); + { ICmpInst *LHS = dyn_cast<ICmpInst>(Op0); ICmpInst *RHS = dyn_cast<ICmpInst>(Op1); - if (LHS && RHS) - if (Value *Res = foldAndOrOfICmps(LHS, RHS, I, /* IsAnd */ true)) - return replaceInstUsesWith(I, Res); + // TODO: Base this on foldBooleanAndOr instead? // TODO: Make this recursive; it's a little tricky because an arbitrary // number of 'and' instructions might have to be created.
if (LHS && match(Op1, m_OneUse(m_LogicalAnd(m_Value(X), m_Value(Y))))) { @@ -2767,11 +2769,6 @@ Instruction *InstCombinerImpl::visitAnd(BinaryOperator &I) { } } - if (FCmpInst *LHS = dyn_cast<FCmpInst>(I.getOperand(0))) - if (FCmpInst *RHS = dyn_cast<FCmpInst>(I.getOperand(1))) - if (Value *Res = foldLogicOfFCmps(LHS, RHS, /*IsAnd*/ true)) - return replaceInstUsesWith(I, Res); - if (Instruction *FoldedFCmps = reassociateFCmps(I, Builder)) return FoldedFCmps; @@ -3108,8 +3105,7 @@ static Instruction *matchOrConcat(Instruction &Or, Value *NewUpper = Builder.CreateZExt(Hi, Ty); NewUpper = Builder.CreateShl(NewUpper, HalfWidth); Value *BinOp = Builder.CreateOr(NewLower, NewUpper); - Function *F = Intrinsic::getOrInsertDeclaration(Or.getModule(), id, Ty); - return Builder.CreateCall(F, BinOp); + return Builder.CreateIntrinsic(id, Ty, BinOp); }; // BSWAP: Push the concat down, swapping the lower/upper sources. @@ -3369,8 +3365,14 @@ Value *InstCombinerImpl::foldAndOrOfICmps(ICmpInst *LHS, ICmpInst *RHS, // We can convert this case to bitwise and, because both operands are used // on the LHS, and as such poison from both will propagate. if (Value *V = foldAndOrOfICmpsWithConstEq(RHS, LHS, IsAnd, - /*IsLogical*/ false, Builder, Q)) + /*IsLogical=*/false, Builder, Q)) { + // If RHS is still used, we should drop samesign flag. + if (IsLogical && RHS->hasSameSign() && !RHS->use_empty()) { + RHS->setSameSign(false); + addToWorklist(RHS); + } return V; + } if (Value *V = foldIsPowerOf2OrZero(LHS, RHS, IsAnd, Builder, *this)) return V; @@ -3517,6 +3519,27 @@ Value *InstCombinerImpl::foldAndOrOfICmps(ICmpInst *LHS, ICmpInst *RHS, return foldAndOrOfICmpsUsingRanges(LHS, RHS, IsAnd); } +/// If IsLogical is true, then the and/or is in select form and the transform +/// must be poison-safe. +Value *InstCombinerImpl::foldBooleanAndOr(Value *LHS, Value *RHS, + Instruction &I, bool IsAnd, + bool IsLogical) { + if (!LHS->getType()->isIntOrIntVectorTy(1)) + return nullptr; + + if (auto *LHSCmp = dyn_cast<ICmpInst>(LHS)) + if (auto *RHSCmp = dyn_cast<ICmpInst>(RHS)) + if (Value *Res = foldAndOrOfICmps(LHSCmp, RHSCmp, I, IsAnd, IsLogical)) + return Res; + + if (auto *LHSCmp = dyn_cast<FCmpInst>(LHS)) + if (auto *RHSCmp = dyn_cast<FCmpInst>(RHS)) + if (Value *Res = foldLogicOfFCmps(LHSCmp, RHSCmp, IsAnd, IsLogical)) + return Res; + + return nullptr; +} + static Value *foldOrOfInversions(BinaryOperator &I, InstCombiner::BuilderTy &Builder) { assert(I.getOpcode() == Instruction::Or && @@ -3798,13 +3821,15 @@ Instruction *InstCombinerImpl::visitOr(BinaryOperator &I) { if (SwappedForXor) std::swap(Op0, Op1); + if (Value *Res = + foldBooleanAndOr(Op0, Op1, I, /*IsAnd=*/false, /*IsLogical=*/false)) + return replaceInstUsesWith(I, Res); + { ICmpInst *LHS = dyn_cast<ICmpInst>(Op0); ICmpInst *RHS = dyn_cast<ICmpInst>(Op1); - if (LHS && RHS) - if (Value *Res = foldAndOrOfICmps(LHS, RHS, I, /* IsAnd */ false)) - return replaceInstUsesWith(I, Res); + // TODO: Base this on foldBooleanAndOr instead? // TODO: Make this recursive; it's a little tricky because an arbitrary // number of 'or' instructions might have to be created.
Value *X, *Y; @@ -3844,11 +3869,6 @@ Instruction *InstCombinerImpl::visitOr(BinaryOperator &I) { } } - if (FCmpInst *LHS = dyn_cast<FCmpInst>(I.getOperand(0))) - if (FCmpInst *RHS = dyn_cast<FCmpInst>(I.getOperand(1))) - if (Value *Res = foldLogicOfFCmps(LHS, RHS, /*IsAnd*/ false)) - return replaceInstUsesWith(I, Res); - if (Instruction *FoldedFCmps = reassociateFCmps(I, Builder)) return FoldedFCmps; diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp index 11196268cdd09fc0a23c9cd4a553302c8428a08e..b2ad70aa72262fe074a86dd71733d3e4aa80bc76 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -647,9 +647,8 @@ static Instruction *foldCtpop(IntrinsicInst &II, InstCombinerImpl &IC) { // ctpop(x | -x) -> bitwidth - cttz(x, false) if (Op0->hasOneUse() && match(Op0, m_c_Or(m_Value(X), m_Neg(m_Deferred(X))))) { - Function *F = - Intrinsic::getOrInsertDeclaration(II.getModule(), Intrinsic::cttz, Ty); - auto *Cttz = IC.Builder.CreateCall(F, {X, IC.Builder.getFalse()}); + auto *Cttz = IC.Builder.CreateIntrinsic(Intrinsic::cttz, Ty, + {X, IC.Builder.getFalse()}); auto *Bw = ConstantInt::get(Ty, APInt(BitWidth, BitWidth)); return IC.replaceInstUsesWith(II, IC.Builder.CreateSub(Bw, Cttz)); } @@ -1182,11 +1181,9 @@ Instruction *InstCombinerImpl::matchSAddSubSat(IntrinsicInst &MinMax1) { return nullptr; // Finally create and return the sat intrinsic, truncated to the new type - Function *F = Intrinsic::getOrInsertDeclaration(MinMax1.getModule(), - IntrinsicID, NewTy); Value *AT = Builder.CreateTrunc(AddSub->getOperand(0), NewTy); Value *BT = Builder.CreateTrunc(AddSub->getOperand(1), NewTy); - Value *Sat = Builder.CreateCall(F, {AT, BT}); + Value *Sat = Builder.CreateIntrinsic(IntrinsicID, NewTy, {AT, BT}); return CastInst::Create(Instruction::SExt, Sat, Ty); } @@ -4201,7 +4198,8 @@ bool InstCombinerImpl::transformConstExprCastCall(CallBase &Call) { if (!CallerPAL.isEmpty() && !Caller->use_empty()) { AttrBuilder RAttrs(FT->getContext(), CallerPAL.getRetAttrs()); - if (RAttrs.overlaps(AttributeFuncs::typeIncompatible(NewRetTy))) + if (RAttrs.overlaps(AttributeFuncs::typeIncompatible( + NewRetTy, CallerPAL.getRetAttrs()))) return false; // Attribute not compatible with transformed value. } @@ -4247,7 +4245,8 @@ bool InstCombinerImpl::transformConstExprCastCall(CallBase &Call) { // Check if there are any incompatible attributes we cannot drop safely. if (AttrBuilder(FT->getContext(), CallerPAL.getParamAttrs(i)) .overlaps(AttributeFuncs::typeIncompatible( - ParamTy, AttributeFuncs::ASK_UNSAFE_TO_DROP))) + ParamTy, CallerPAL.getParamAttrs(i), + AttributeFuncs::ASK_UNSAFE_TO_DROP))) return false; // Attribute not compatible with transformed value. if (Call.isInAllocaArgument(i) || @@ -4285,7 +4284,8 @@ bool InstCombinerImpl::transformConstExprCastCall(CallBase &Call) { // If the return value is not being used, the type may not be compatible // with the existing attributes. Wipe out any problematic attributes. - RAttrs.remove(AttributeFuncs::typeIncompatible(NewRetTy)); + RAttrs.remove( + AttributeFuncs::typeIncompatible(NewRetTy, CallerPAL.getRetAttrs())); LLVMContext &Ctx = Call.getContext(); AI = Call.arg_begin(); @@ -4300,7 +4300,7 @@ bool InstCombinerImpl::transformConstExprCastCall(CallBase &Call) { // Add any parameter attributes except the ones incompatible with the new // type. Note that we made sure all incompatible ones are safe to drop.
AttributeMask IncompatibleAttrs = AttributeFuncs::typeIncompatible( - ParamTy, AttributeFuncs::ASK_SAFE_TO_DROP); + ParamTy, CallerPAL.getParamAttrs(i), AttributeFuncs::ASK_SAFE_TO_DROP); ArgAttrs.push_back( CallerPAL.getParamAttrs(i).removeAttributes(Ctx, IncompatibleAttrs)); } diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp index 7129499e0f8f9db5625f857cfacdfbdbdafd8e34..72ebd9fbb6d9e538e3bc71e131d46d8b486b7f3e 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp @@ -1738,7 +1738,7 @@ Instruction *InstCombinerImpl::foldICmpAndShift(ICmpInst &Cmp, // Compute X & (C2 << Y). Value *NewAnd = Builder.CreateAnd(Shift->getOperand(0), NewShift); - return replaceOperand(Cmp, 0, NewAnd); + return new ICmpInst(Cmp.getPredicate(), NewAnd, Cmp.getOperand(1)); } return nullptr; @@ -1844,7 +1844,7 @@ Instruction *InstCombinerImpl::foldICmpAndConstConst(ICmpInst &Cmp, /*HasNUW=*/true), One, Or->getName()); Value *NewAnd = Builder.CreateAnd(A, NewOr, And->getName()); - return replaceOperand(Cmp, 0, NewAnd); + return new ICmpInst(Cmp.getPredicate(), NewAnd, Cmp.getOperand(1)); } } } @@ -4790,12 +4790,10 @@ Value *InstCombinerImpl::foldMultiplicationOverflowCheck(ICmpInst &I) { if (MulHadOtherUses) Builder.SetInsertPoint(Mul); - Function *F = Intrinsic::getOrInsertDeclaration( - I.getModule(), + CallInst *Call = Builder.CreateIntrinsic( Div->getOpcode() == Instruction::UDiv ? Intrinsic::umul_with_overflow : Intrinsic::smul_with_overflow, - X->getType()); - CallInst *Call = Builder.CreateCall(F, {X, Y}, "mul"); + X->getType(), {X, Y}, /*FMFSource=*/nullptr, "mul"); // If the multiplication was used elsewhere, to ensure that we don't leave // "duplicate" instructions, replace uses of that original multiplication @@ -6334,9 +6332,9 @@ static Instruction *processUMulZExtIdiom(ICmpInst &I, Value *MulVal, MulA = Builder.CreateZExt(A, MulType); if (WidthB < MulWidth) MulB = Builder.CreateZExt(B, MulType); - Function *F = Intrinsic::getOrInsertDeclaration( - I.getModule(), Intrinsic::umul_with_overflow, MulType); - CallInst *Call = Builder.CreateCall(F, {MulA, MulB}, "umul"); + CallInst *Call = + Builder.CreateIntrinsic(Intrinsic::umul_with_overflow, MulType, + {MulA, MulB}, /*FMFSource=*/nullptr, "umul"); IC.addToWorklist(MulInstr); // If there are uses of mul result other than the comparison, we know that @@ -6771,11 +6769,15 @@ Instruction *InstCombinerImpl::foldICmpUsingKnownBits(ICmpInst &I) { } // Turn a signed comparison into an unsigned one if both operands are known to - // have the same sign. - if (I.isSigned() && + // have the same sign. Set samesign if possible (except for equality + // predicates). 
+ if ((I.isSigned() || (I.isUnsigned() && !I.hasSameSign())) && ((Op0Known.Zero.isNegative() && Op1Known.Zero.isNegative()) || - (Op0Known.One.isNegative() && Op1Known.One.isNegative()))) - return new ICmpInst(I.getUnsignedPredicate(), Op0, Op1); + (Op0Known.One.isNegative() && Op1Known.One.isNegative()))) { + I.setPredicate(I.getUnsignedPredicate()); + I.setSameSign(); + return &I; + } return nullptr; } diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h index d54e337ceb6c568915629e396e959368002bc49d..3c53aa4e1a3a6f5906b60df5fdef48fa48a15c1f 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h +++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h @@ -426,6 +426,9 @@ private: Instruction *foldLogicOfIsFPClass(BinaryOperator &Operator, Value *LHS, Value *RHS); + Value *foldBooleanAndOr(Value *LHS, Value *RHS, Instruction &I, bool IsAnd, + bool IsLogical); + Instruction * canonicalizeConditionalNegationViaMathToSelect(BinaryOperator &i); diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp index 820d3608c8dc49c580d4022bd7a2ef86fa1c6472..c5f39a4c381ed1d418e83a2d0ba5d835a1f90a2e 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp @@ -1448,6 +1448,7 @@ Instruction *InstCombinerImpl::foldSelectEqualityTest(SelectInst &Sel) { m_c_SpecificICmp(ICmpInst::ICMP_EQ, m_Specific(X), m_Specific(Y)))) return nullptr; + cast<ICmpInst>(XeqY)->setSameSign(false); return replaceInstUsesWith(Sel, XeqY); } @@ -1953,56 +1954,6 @@ Instruction *InstCombinerImpl::foldSelectInstWithICmp(SelectInst &SI, return &SI; } - // FIXME: This code is nearly duplicated in InstSimplify. Using/refactoring - // decomposeBitTestICmp() might help. - if (TrueVal->getType()->isIntOrIntVectorTy()) { - unsigned BitWidth = - DL.getTypeSizeInBits(TrueVal->getType()->getScalarType()); - APInt MinSignedValue = APInt::getSignedMinValue(BitWidth); - Value *X; - const APInt *Y, *C; - bool TrueWhenUnset; - bool IsBitTest = false; - if (ICmpInst::isEquality(Pred) && - match(CmpLHS, m_And(m_Value(X), m_Power2(Y))) && - match(CmpRHS, m_Zero())) { - IsBitTest = true; - TrueWhenUnset = Pred == ICmpInst::ICMP_EQ; - } else if (Pred == ICmpInst::ICMP_SLT && match(CmpRHS, m_Zero())) { - X = CmpLHS; - Y = &MinSignedValue; - IsBitTest = true; - TrueWhenUnset = false; - } else if (Pred == ICmpInst::ICMP_SGT && match(CmpRHS, m_AllOnes())) { - X = CmpLHS; - Y = &MinSignedValue; - IsBitTest = true; - TrueWhenUnset = true; - } - if (IsBitTest) { - Value *V = nullptr; - // (X & Y) == 0 ? X : X ^ Y --> X & ~Y - if (TrueWhenUnset && TrueVal == X && - match(FalseVal, m_Xor(m_Specific(X), m_APInt(C))) && *Y == *C) - V = Builder.CreateAnd(X, ~(*Y)); - // (X & Y) != 0 ? X ^ Y : X --> X & ~Y - else if (!TrueWhenUnset && FalseVal == X && - match(TrueVal, m_Xor(m_Specific(X), m_APInt(C))) && *Y == *C) - V = Builder.CreateAnd(X, ~(*Y)); - // (X & Y) == 0 ? X ^ Y : X --> X | Y - else if (TrueWhenUnset && FalseVal == X && - match(TrueVal, m_Xor(m_Specific(X), m_APInt(C))) && *Y == *C) - V = Builder.CreateOr(X, *Y); - // (X & Y) != 0 ?
X : X ^ Y --> X | Y - else if (!TrueWhenUnset && TrueVal == X && - match(FalseVal, m_Xor(m_Specific(X), m_APInt(C))) && *Y == *C) - V = Builder.CreateOr(X, *Y); - - if (V) - return replaceInstUsesWith(SI, V); - } - } - if (Instruction *V = foldSelectICmpAndAnd(SI.getType(), ICI, TrueVal, FalseVal, Builder)) return V; @@ -3192,12 +3143,6 @@ Instruction *InstCombinerImpl::foldSelectOfBools(SelectInst &SI) { SI, Builder.CreateLogicalOr(A, Builder.CreateOr(B, FalseVal))); } - if (auto *LHS = dyn_cast(CondVal)) - if (auto *RHS = dyn_cast(FalseVal)) - if (Value *V = foldLogicOfFCmps(LHS, RHS, /*IsAnd*/ false, - /*IsSelectLogical*/ true)) - return replaceInstUsesWith(SI, V); - // (A && B) || (C && B) --> (A || C) && B if (match(CondVal, m_LogicalAnd(m_Value(A), m_Value(B))) && match(FalseVal, m_LogicalAnd(m_Value(C), m_Value(D))) && @@ -3240,12 +3185,6 @@ Instruction *InstCombinerImpl::foldSelectOfBools(SelectInst &SI) { SI, Builder.CreateLogicalAnd(A, Builder.CreateAnd(B, TrueVal))); } - if (auto *LHS = dyn_cast(CondVal)) - if (auto *RHS = dyn_cast(TrueVal)) - if (Value *V = foldLogicOfFCmps(LHS, RHS, /*IsAnd*/ true, - /*IsSelectLogical*/ true)) - return replaceInstUsesWith(SI, V); - // (A || B) && (C || B) --> (A && C) || B if (match(CondVal, m_LogicalOr(m_Value(A), m_Value(B))) && match(TrueVal, m_LogicalOr(m_Value(C), m_Value(D))) && @@ -3354,11 +3293,9 @@ Instruction *InstCombinerImpl::foldSelectOfBools(SelectInst &SI) { return replaceInstUsesWith(SI, Op1); } - if (auto *ICmp0 = dyn_cast(CondVal)) - if (auto *ICmp1 = dyn_cast(Op1)) - if (auto *V = foldAndOrOfICmps(ICmp0, ICmp1, SI, IsAnd, - /* IsLogical */ true)) - return replaceInstUsesWith(SI, V); + if (auto *V = foldBooleanAndOr(CondVal, Op1, SI, IsAnd, + /*IsLogical=*/true)) + return replaceInstUsesWith(SI, V); } // select (a || b), c, false -> select a, c, false @@ -3837,11 +3774,11 @@ Instruction *InstCombinerImpl::visitSelectInst(SelectInst &SI) { // minnum/maxnum intrinsics. if (SIFPOp->hasNoNaNs() && SIFPOp->hasNoSignedZeros()) { Value *X, *Y; - if (match(&SI, m_OrdFMax(m_Value(X), m_Value(Y)))) + if (match(&SI, m_OrdOrUnordFMax(m_Value(X), m_Value(Y)))) return replaceInstUsesWith( SI, Builder.CreateBinaryIntrinsic(Intrinsic::maxnum, X, Y, &SI)); - if (match(&SI, m_OrdFMin(m_Value(X), m_Value(Y)))) + if (match(&SI, m_OrdOrUnordFMin(m_Value(X), m_Value(Y)))) return replaceInstUsesWith( SI, Builder.CreateBinaryIntrinsic(Intrinsic::minnum, X, Y, &SI)); } diff --git a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp index 02d9fab309d83be8231775fbe832e444ca240e8f..cb84588318496ce2bbbc24bd8d5f406737246fad 100644 --- a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp @@ -1109,11 +1109,8 @@ struct FunctionStackPoisoner : public InstVisitor { // alloca. We have a special @llvm.get.dynamic.area.offset intrinsic for // this purpose. 
if (!isa(InstBefore)) { - Function *DynamicAreaOffsetFunc = Intrinsic::getOrInsertDeclaration( - InstBefore->getModule(), Intrinsic::get_dynamic_area_offset, - {IntptrTy}); - - Value *DynamicAreaOffset = IRB.CreateCall(DynamicAreaOffsetFunc, {}); + Value *DynamicAreaOffset = IRB.CreateIntrinsic( + Intrinsic::get_dynamic_area_offset, {IntptrTy}, {}); DynamicAreaPtr = IRB.CreateAdd(IRB.CreatePtrToInt(SavedStack, IntptrTy), DynamicAreaOffset); @@ -1865,11 +1862,9 @@ void AddressSanitizer::instrumentAddress(Instruction *OrigIns, if (UseCalls && ClOptimizeCallbacks) { const ASanAccessInfo AccessInfo(IsWrite, CompileKernel, AccessSizeIndex); - Module *M = IRB.GetInsertBlock()->getParent()->getParent(); - IRB.CreateCall( - Intrinsic::getOrInsertDeclaration(M, Intrinsic::asan_check_memaccess), - {IRB.CreatePointerCast(Addr, PtrTy), - ConstantInt::get(Int32Ty, AccessInfo.Packed)}); + IRB.CreateIntrinsic(Intrinsic::asan_check_memaccess, {}, + {IRB.CreatePointerCast(Addr, PtrTy), + ConstantInt::get(Int32Ty, AccessInfo.Packed)}); return; } diff --git a/llvm/lib/Transforms/Instrumentation/BoundsChecking.cpp b/llvm/lib/Transforms/Instrumentation/BoundsChecking.cpp index 63d580d2b9d5124715497b1a74969f001a026a03..8b857d421f29f31fcd13e6a3b3699f4b3a7d921a 100644 --- a/llvm/lib/Transforms/Instrumentation/BoundsChecking.cpp +++ b/llvm/lib/Transforms/Instrumentation/BoundsChecking.cpp @@ -194,14 +194,13 @@ static bool addBoundsChecking(Function &F, TargetLibraryInfo &TLI, IRB.SetInsertPoint(TrapBB); Intrinsic::ID IntrID = DebugTrapBB ? Intrinsic::ubsantrap : Intrinsic::trap; - auto *F = Intrinsic::getOrInsertDeclaration(Fn->getParent(), IntrID); CallInst *TrapCall; if (DebugTrapBB) { - TrapCall = - IRB.CreateCall(F, ConstantInt::get(IRB.getInt8Ty(), Fn->size())); + TrapCall = IRB.CreateIntrinsic( + IntrID, {}, ConstantInt::get(IRB.getInt8Ty(), Fn->size())); } else { - TrapCall = IRB.CreateCall(F, {}); + TrapCall = IRB.CreateIntrinsic(IntrID, {}, {}); } TrapCall->setDoesNotReturn(); diff --git a/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp index 577647cac3f58ce4c9e4ee0348d158252ffa8692..e226727e64d350faa22c8537cc956fd559f1daf0 100644 --- a/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp @@ -1305,8 +1305,8 @@ DataFlowSanitizer::buildWrapperFunction(Function *F, StringRef NewFName, Function *NewF = Function::Create(NewFT, NewFLink, F->getAddressSpace(), NewFName, F->getParent()); NewF->copyAttributesFrom(F); - NewF->removeRetAttrs( - AttributeFuncs::typeIncompatible(NewFT->getReturnType())); + NewF->removeRetAttrs(AttributeFuncs::typeIncompatible( + NewFT->getReturnType(), NewF->getAttributes().getRetAttrs())); BasicBlock *BB = BasicBlock::Create(*Ctx, "entry", NewF); if (F->isVarArg()) { diff --git a/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp b/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp index a409f6150a71c15dc53ac1799d7c96a3e078b14d..2ea89be40a3d46c9f44f8d2cfd766d4f07ba2eae 100644 --- a/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp +++ b/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp @@ -29,6 +29,7 @@ #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Module.h" +#include "llvm/ProfileData/InstrProf.h" #include "llvm/Support/CRC.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" @@ -121,8 +122,11 @@ private: Function *createInternalFunction(FunctionType *FTy, StringRef Name, 
StringRef MangledType = ""); + void emitGlobalConstructor( SmallVectorImpl> &CountersBySP); + void emitModuleInitFunctionPtrs( + SmallVectorImpl> &CountersBySP); bool isFunctionInstrumented(const Function &F); std::vector createRegexesFromString(StringRef RegexesStr); @@ -913,6 +917,9 @@ bool GCOVProfiler::emitProfileNotes( GlobalVariable *Counters = new GlobalVariable( *M, CounterTy, false, GlobalValue::InternalLinkage, Constant::getNullValue(CounterTy), "__llvm_gcov_ctr"); + const llvm::Triple &Triple = llvm::Triple(M->getTargetTriple()); + if (Triple.getObjectFormat() == llvm::Triple::XCOFF) + Counters->setSection("__llvm_gcov_ctr_section"); CountersBySP.emplace_back(Counters, SP); for (size_t I : llvm::seq(0, Measured)) { @@ -979,7 +986,11 @@ bool GCOVProfiler::emitProfileNotes( } if (EmitGCDA) { - emitGlobalConstructor(CountersBySP); + const llvm::Triple &Triple = llvm::Triple(M->getTargetTriple()); + if (Triple.getObjectFormat() == llvm::Triple::XCOFF) + emitModuleInitFunctionPtrs(CountersBySP); + else + emitGlobalConstructor(CountersBySP); EmitGCDA = false; } } @@ -1028,6 +1039,40 @@ void GCOVProfiler::emitGlobalConstructor( appendToGlobalCtors(*M, F, 0); } +void GCOVProfiler::emitModuleInitFunctionPtrs( + SmallVectorImpl> &CountersBySP) { + Function *WriteoutF = insertCounterWriteout(CountersBySP); + Function *ResetF = insertReset(CountersBySP); + + // Instead of creating a function call and add it to the constructors list, + // create a global variable in the __llvm_covinit section so the functions + // can be registered by a constructor in the runtime. + + auto &Ctx = M->getContext(); + + Type *InitFuncDataTy[] = { +#define COVINIT_FUNC(Type, LLVMType, Name, Init) LLVMType, +#include "llvm/ProfileData/InstrProfData.inc" + }; + + auto STy = StructType::get(Ctx, ArrayRef(InitFuncDataTy)); + + Constant *InitFuncPtrs[] = { +#define COVINIT_FUNC(Type, LLVMType, Name, Init) Init, +#include "llvm/ProfileData/InstrProfData.inc" + }; + + auto *CovInitGV = + new GlobalVariable(*M, STy, false, GlobalValue::PrivateLinkage, nullptr, + "__llvm_covinit_functions"); + CovInitGV->setInitializer(ConstantStruct::get(STy, InitFuncPtrs)); + CovInitGV->setVisibility(GlobalValue::VisibilityTypes::DefaultVisibility); + CovInitGV->setSection(getInstrProfSectionName( + IPSK_covinit, Triple(M->getTargetTriple()).getObjectFormat())); + CovInitGV->setAlignment(Align(INSTR_PROF_DATA_ALIGNMENT)); + CovInitGV->setConstant(true); +} + FunctionCallee GCOVProfiler::getStartFileFunc(const TargetLibraryInfo *TLI) { Type *Args[] = { PointerType::getUnqual(*Ctx), // const char *orig_filename diff --git a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp index 5ec4973ea03d8ff28fed216a7c5534384a9164cb..21d4d37d7e6cdb58506d05dbca54c0943d63b6a8 100644 --- a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp @@ -1025,7 +1025,6 @@ void HWAddressSanitizer::instrumentMemAccessOutline(Value *Ptr, bool IsWrite, insertShadowTagCheck(Ptr, InsertBefore, DTU, LI).TagMismatchTerm; IRBuilder<> IRB(InsertBefore); - Module *M = IRB.GetInsertBlock()->getParent()->getParent(); bool UseFixedShadowIntrinsic = false; // The memaccess fixed shadow intrinsic is only supported on AArch64, // which allows a 16-bit immediate to be left-shifted by 32. 
@@ -1041,19 +1040,18 @@ void HWAddressSanitizer::instrumentMemAccessOutline(Value *Ptr, bool IsWrite, } if (UseFixedShadowIntrinsic) { - IRB.CreateCall( - Intrinsic::getOrInsertDeclaration( - M, UseShortGranules - ? Intrinsic::hwasan_check_memaccess_shortgranules_fixedshadow - : Intrinsic::hwasan_check_memaccess_fixedshadow), + IRB.CreateIntrinsic( + UseShortGranules + ? Intrinsic::hwasan_check_memaccess_shortgranules_fixedshadow + : Intrinsic::hwasan_check_memaccess_fixedshadow, + {}, {Ptr, ConstantInt::get(Int32Ty, AccessInfo), ConstantInt::get(Int64Ty, Mapping.offset())}); } else { - IRB.CreateCall(Intrinsic::getOrInsertDeclaration( - M, UseShortGranules - ? Intrinsic::hwasan_check_memaccess_shortgranules - : Intrinsic::hwasan_check_memaccess), - {ShadowBase, Ptr, ConstantInt::get(Int32Ty, AccessInfo)}); + IRB.CreateIntrinsic( + UseShortGranules ? Intrinsic::hwasan_check_memaccess_shortgranules + : Intrinsic::hwasan_check_memaccess, + {}, {ShadowBase, Ptr, ConstantInt::get(Int32Ty, AccessInfo)}); } } diff --git a/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp b/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp index 86637109d94083877c1e02a5fc4be89821c2bfbf..43b8d5e6a8ce5bd86b120b9625d34b4b8aad852f 100644 --- a/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp +++ b/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp @@ -944,7 +944,7 @@ computeVirtualCallSiteTypeInfoMap(Module &M, ModuleAnalysisManager &MAM, // Find out virtual calls by looking at users of llvm.type.checked.load in // that case. Function *TypeTestFunc = - M.getFunction(Intrinsic::getName(Intrinsic::type_test)); + Intrinsic::getDeclarationIfExists(&M, Intrinsic::type_test); if (!TypeTestFunc || TypeTestFunc->use_empty()) return; diff --git a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp index 929c787442057a3753fecdfc1a8d535cfb742f13..d7d809dfdd5f6599c2b4d838fa6d83ba074eff1b 100644 --- a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp +++ b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp @@ -902,15 +902,15 @@ static bool needsRuntimeHookUnconditionally(const Triple &TT) { /// Check if the module contains uses of any profiling intrinsics. 
static bool containsProfilingIntrinsics(Module &M) { auto containsIntrinsic = [&](int ID) { - if (auto *F = M.getFunction(Intrinsic::getName(ID))) + if (auto *F = Intrinsic::getDeclarationIfExists(&M, ID)) return !F->use_empty(); return false; }; - return containsIntrinsic(llvm::Intrinsic::instrprof_cover) || - containsIntrinsic(llvm::Intrinsic::instrprof_increment) || - containsIntrinsic(llvm::Intrinsic::instrprof_increment_step) || - containsIntrinsic(llvm::Intrinsic::instrprof_timestamp) || - containsIntrinsic(llvm::Intrinsic::instrprof_value_profile); + return containsIntrinsic(Intrinsic::instrprof_cover) || + containsIntrinsic(Intrinsic::instrprof_increment) || + containsIntrinsic(Intrinsic::instrprof_increment_step) || + containsIntrinsic(Intrinsic::instrprof_timestamp) || + containsIntrinsic(Intrinsic::instrprof_value_profile); } bool InstrLowerer::lower() { diff --git a/llvm/lib/Transforms/Instrumentation/KCFI.cpp b/llvm/lib/Transforms/Instrumentation/KCFI.cpp index bbe0f4c6178192d9d8af3abd017614a1dbdb7c3b..4b653a83a896e7b5b5697c60b82660e15652693a 100644 --- a/llvm/lib/Transforms/Instrumentation/KCFI.cpp +++ b/llvm/lib/Transforms/Instrumentation/KCFI.cpp @@ -110,8 +110,7 @@ PreservedAnalyses KCFIPass::run(Function &F, FunctionAnalysisManager &AM) { Instruction *ThenTerm = SplitBlockAndInsertIfThen(Test, Call, false, VeryUnlikelyWeights); Builder.SetInsertPoint(ThenTerm); - Builder.CreateCall( - Intrinsic::getOrInsertDeclaration(&M, Intrinsic::debugtrap)); + Builder.CreateIntrinsic(Intrinsic::debugtrap, {}, {}); ++NumKCFIChecks; } diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp index 19ec97c17f31c6609b8a19989deaf9ce2bf560d5..9e174e2415e71971db39d04a0a620d0cb175941c 100644 --- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp @@ -2853,9 +2853,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { Value *S2Conv = IRB.CreateSExt(IRB.CreateICmpNE(S2, getCleanShadow(S2)), S2->getType()); Value *V2 = I.getOperand(2); - Function *Intrin = Intrinsic::getOrInsertDeclaration( - I.getModule(), I.getIntrinsicID(), S2Conv->getType()); - Value *Shift = IRB.CreateCall(Intrin, {S0, S1, V2}); + Value *Shift = IRB.CreateIntrinsic(I.getIntrinsicID(), S2Conv->getType(), + {S0, S1, V2}); setShadow(&I, IRB.CreateOr(Shift, S2Conv)); setOriginForNaryOp(I); } @@ -3057,9 +3056,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { IRBuilder<> IRB(&I); Value *Op = I.getArgOperand(0); Type *OpType = Op->getType(); - Function *BswapFunc = Intrinsic::getOrInsertDeclaration( - F.getParent(), Intrinsic::bswap, ArrayRef(&OpType, 1)); - setShadow(&I, IRB.CreateCall(BswapFunc, getShadow(Op))); + setShadow(&I, IRB.CreateIntrinsic(Intrinsic::bswap, ArrayRef(&OpType, 1), + getShadow(Op))); setOrigin(&I, getOrigin(Op)); } @@ -3287,11 +3285,9 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { S2_ext = IRB.CreateBitCast(S2_ext, getMMXVectorTy(64)); } - Function *ShadowFn = Intrinsic::getOrInsertDeclaration( - F.getParent(), getSignedPackIntrinsic(I.getIntrinsicID())); - - Value *S = - IRB.CreateCall(ShadowFn, {S1_ext, S2_ext}, "_msprop_vector_pack"); + Value *S = IRB.CreateIntrinsic(getSignedPackIntrinsic(I.getIntrinsicID()), + {}, {S1_ext, S2_ext}, /*FMFSource=*/nullptr, + "_msprop_vector_pack"); if (MMXEltSizeInBits) S = IRB.CreateBitCast(S, getShadowTy(&I)); setShadow(&I, S); diff --git a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp 
b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp index dbe908bb5e72f3fbeae9708318e12e78f9a0f0ca..919660e7a040f21734134e356da151d4da8e73d0 100644 --- a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp +++ b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp @@ -918,8 +918,8 @@ void FunctionInstrumenter::instrument() { IRBuilder<> Builder(&EntryBB, EntryBB.getFirstInsertionPt()); // llvm.instrprof.cover(i8* <name>, i64 <hash>, i32 <num-counters>, // i32 <index>) - Builder.CreateCall( - Intrinsic::getOrInsertDeclaration(&M, Intrinsic::instrprof_cover), + Builder.CreateIntrinsic( + Intrinsic::instrprof_cover, {}, {NormalizedNamePtr, CFGHash, Builder.getInt32(1), Builder.getInt32(0)}); return; } @@ -971,10 +971,10 @@ void FunctionInstrumenter::instrument() { IRBuilder<> Builder(&EntryBB, EntryBB.getFirstInsertionPt()); // llvm.instrprof.timestamp(i8* <name>, i64 <hash>, i32 <num-counters>, // i32 <index>) - Builder.CreateCall( - Intrinsic::getOrInsertDeclaration(&M, Intrinsic::instrprof_timestamp), - {NormalizedNamePtr, CFGHash, Builder.getInt32(NumCounters), - Builder.getInt32(I)}); + Builder.CreateIntrinsic(Intrinsic::instrprof_timestamp, {}, + {NormalizedNamePtr, CFGHash, + Builder.getInt32(NumCounters), + Builder.getInt32(I)}); I += PGOBlockCoverage ? 8 : 1; } @@ -984,12 +984,12 @@ void FunctionInstrumenter::instrument() { "Cannot get the Instrumentation point"); // llvm.instrprof.increment(i8* <name>, i64 <hash>, i32 <num-counters>, // i32 <index>) - Builder.CreateCall(Intrinsic::getOrInsertDeclaration( - &M, PGOBlockCoverage - ? Intrinsic::instrprof_cover - : Intrinsic::instrprof_increment), - {NormalizedNamePtr, CFGHash, - Builder.getInt32(NumCounters), Builder.getInt32(I++)}); + Builder.CreateIntrinsic(PGOBlockCoverage ? Intrinsic::instrprof_cover + : Intrinsic::instrprof_increment, + {}, + {NormalizedNamePtr, CFGHash, + Builder.getInt32(NumCounters), + Builder.getInt32(I++)}); } // Now instrument select instructions: @@ -1726,10 +1726,10 @@ void SelectInstVisitor::instrumentOneSelectInst(SelectInst &SI) { auto *NormalizedFuncNameVarPtr = ConstantExpr::getPointerBitCastOrAddrSpaceCast( FuncNameVar, PointerType::get(M->getContext(), 0)); - Builder.CreateCall( - Intrinsic::getOrInsertDeclaration(M, Intrinsic::instrprof_increment_step), - {NormalizedFuncNameVarPtr, Builder.getInt64(FuncHash), - Builder.getInt32(TotalNumCtrs), Builder.getInt32(*CurCtrIdx), Step}); + Builder.CreateIntrinsic(Intrinsic::instrprof_increment_step, {}, + {NormalizedFuncNameVarPtr, Builder.getInt64(FuncHash), + Builder.getInt32(TotalNumCtrs), + Builder.getInt32(*CurCtrIdx), Step}); ++(*CurCtrIdx); } @@ -1916,7 +1916,6 @@ static bool InstrumentAllFunctions( std::unordered_multimap<Comdat *, GlobalValue *> ComdatMembers; collectComdatMembers(M, ComdatMembers); - bool AnythingInstrumented = false; for (auto &F : M) { if (skipPGOGen(F)) continue; @@ -1926,9 +1925,8 @@ static bool InstrumentAllFunctions( FunctionInstrumenter FI(M, F, TLI, ComdatMembers, BPI, BFI, InstrumentationType); FI.instrument(); - AnythingInstrumented = true; } - return AnythingInstrumented; + return true; } PreservedAnalyses diff --git a/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp b/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp index 719806fdf37f58a364168dfb2a2e7f9dda549f11..f7461127ec51cfb8cd3e33e727f4d6400eb4a4b6 100644 --- a/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp +++ b/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp @@ -16,6 +16,7 @@ #include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/PostDominators.h" #include "llvm/IR/Constant.h" +#include "llvm/IR/Constants.h" 
#include "llvm/IR/DataLayout.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/EHPersonalities.h" @@ -28,6 +29,8 @@ #include "llvm/IR/MDBuilder.h" #include "llvm/IR/Module.h" #include "llvm/IR/Type.h" +#include "llvm/IR/ValueSymbolTable.h" +#include "llvm/InitializePasses.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/SpecialCaseList.h" #include "llvm/Support/VirtualFileSystem.h" @@ -82,8 +85,10 @@ const char SanCovCountersSectionName[] = "sancov_cntrs"; const char SanCovBoolFlagSectionName[] = "sancov_bools"; const char SanCovPCsSectionName[] = "sancov_pcs"; const char SanCovCFsSectionName[] = "sancov_cfs"; +const char SanCovCallbackGateSectionName[] = "sancov_gate"; const char SanCovLowestStackName[] = "__sancov_lowest_stack"; +const char SanCovCallbackGateName[] = "__sancov_should_track"; static cl::opt<int> ClCoverageLevel( "sanitizer-coverage-level", @@ -152,6 +157,12 @@ static cl::opt<bool> ClCollectCF("sanitizer-coverage-control-flow", cl::desc("collect control flow for each function"), cl::Hidden); +static cl::opt<bool> ClGatedCallbacks( + "sanitizer-coverage-gated-trace-callbacks", + cl::desc("Gate the invocation of the tracing callbacks on a global " + "variable. Currently only supported for trace-pc-guard."), + cl::Hidden, cl::init(false)); + namespace { SanitizerCoverageOptions getOptions(int LegacyCoverageLevel) { @@ -194,6 +205,7 @@ SanitizerCoverageOptions OverrideFromCL(SanitizerCoverageOptions Options) { Options.StackDepth |= ClStackDepth; Options.TraceLoads |= ClLoadTracing; Options.TraceStores |= ClStoreTracing; + Options.GatedCallbacks |= ClGatedCallbacks; if (!Options.TracePCGuard && !Options.TracePC && !Options.Inline8bitCounters && !Options.StackDepth && !Options.InlineBoolFlag && !Options.TraceLoads && !Options.TraceStores) @@ -239,8 +251,9 @@ private: const char *Section); GlobalVariable *CreatePCArray(Function &F, ArrayRef<BasicBlock *> AllBlocks); void CreateFunctionLocalArrays(Function &F, ArrayRef<BasicBlock *> AllBlocks); + Value *CreateFunctionLocalGateCmp(IRBuilder<> &IRB); void InjectCoverageAtBlock(Function &F, BasicBlock &BB, size_t Idx, - bool IsLeafFunc = true); + Value *&FunctionGateCmp, bool IsLeafFunc = true); Function *CreateInitCallsForSections(Module &M, const char *CtorName, const char *InitFunctionName, Type *Ty, const char *Section); @@ -265,6 +278,7 @@ private: FunctionCallee SanCovTraceGepFunction; FunctionCallee SanCovTraceSwitchFunction; GlobalVariable *SanCovLowestStack; + GlobalVariable *SanCovCallbackGate; Type *PtrTy, *IntptrTy, *Int64Ty, *Int32Ty, *Int16Ty, *Int8Ty, *Int1Ty; Module *CurModule; std::string CurModuleUniqueId; @@ -478,6 +492,23 @@ bool ModuleSanitizerCoverage::instrumentModule() { if (Options.StackDepth && !SanCovLowestStack->isDeclaration()) SanCovLowestStack->setInitializer(Constant::getAllOnesValue(IntptrTy)); + if (Options.GatedCallbacks) { + if (!Options.TracePCGuard) { + C->emitError(StringRef("'") + ClGatedCallbacks.ArgStr + + "' is only supported with trace-pc-guard"); + return true; + } + + SanCovCallbackGate = cast<GlobalVariable>( + M.getOrInsertGlobal(SanCovCallbackGateName, Int64Ty)); + SanCovCallbackGate->setSection( + getSectionName(SanCovCallbackGateSectionName)); + SanCovCallbackGate->setInitializer(Constant::getNullValue(Int64Ty)); + SanCovCallbackGate->setLinkage(GlobalVariable::LinkOnceAnyLinkage); + SanCovCallbackGate->setVisibility(GlobalVariable::HiddenVisibility); + appendToCompilerUsed(M, SanCovCallbackGate); + } + SanCovTracePC = M.getOrInsertFunction(SanCovTracePCName, VoidTy); SanCovTracePCGuard = 
M.getOrInsertFunction(SanCovTracePCGuardName, VoidTy, PtrTy); @@ -777,13 +808,22 @@ void ModuleSanitizerCoverage::CreateFunctionLocalArrays( FunctionPCsArray = CreatePCArray(F, AllBlocks); } +Value *ModuleSanitizerCoverage::CreateFunctionLocalGateCmp(IRBuilder<> &IRB) { + auto Load = IRB.CreateLoad(Int64Ty, SanCovCallbackGate); + Load->setNoSanitizeMetadata(); + auto Cmp = IRB.CreateIsNotNull(Load); + Cmp->setName("sancov gate cmp"); + return Cmp; +} + bool ModuleSanitizerCoverage::InjectCoverage(Function &F, ArrayRef<BasicBlock *> AllBlocks, bool IsLeafFunc) { if (AllBlocks.empty()) return false; CreateFunctionLocalArrays(F, AllBlocks); + Value *FunctionGateCmp = nullptr; for (size_t i = 0, N = AllBlocks.size(); i < N; i++) - InjectCoverageAtBlock(F, *AllBlocks[i], i, IsLeafFunc); + InjectCoverageAtBlock(F, *AllBlocks[i], i, FunctionGateCmp, IsLeafFunc); return true; } @@ -946,6 +986,7 @@ void ModuleSanitizerCoverage::InjectTraceForCmp( void ModuleSanitizerCoverage::InjectCoverageAtBlock(Function &F, BasicBlock &BB, size_t Idx, + Value *&FunctionGateCmp, bool IsLeafFunc) { BasicBlock::iterator IP = BB.getFirstInsertionPt(); bool IsEntryBB = &BB == &F.getEntryBlock(); @@ -971,7 +1012,23 @@ void ModuleSanitizerCoverage::InjectCoverageAtBlock(Function &F, BasicBlock &BB, IRB.CreateAdd(IRB.CreatePointerCast(FunctionGuardArray, IntptrTy), ConstantInt::get(IntptrTy, Idx * 4)), PtrTy); - IRB.CreateCall(SanCovTracePCGuard, GuardPtr)->setCannotMerge(); + if (Options.GatedCallbacks) { + if (!FunctionGateCmp) { + // Create this in the entry block. + assert(IsEntryBB); + FunctionGateCmp = CreateFunctionLocalGateCmp(IRB); + } + // Set the branch weights in order to minimize the price paid when the + // gate is turned off, allowing the default enablement of this + // instrumentation with as small a performance cost as possible. + auto Weights = MDBuilder(*C).createBranchWeights(1, 100000); + auto ThenTerm = + SplitBlockAndInsertIfThen(FunctionGateCmp, &*IP, false, Weights); + IRBuilder<> ThenIRB(ThenTerm); + ThenIRB.CreateCall(SanCovTracePCGuard, GuardPtr)->setCannotMerge(); + } else { + IRB.CreateCall(SanCovTracePCGuard, GuardPtr)->setCannotMerge(); + } } if (Options.Inline8bitCounters) { auto CounterPtr = IRB.CreateGEP( @@ -999,11 +1056,10 @@ void ModuleSanitizerCoverage::InjectCoverageAtBlock(Function &F, BasicBlock &BB, if (Options.StackDepth && IsEntryBB && !IsLeafFunc) { // Check stack depth. If it's the deepest so far, record it. 
Module *M = F.getParent(); - Function *GetFrameAddr = Intrinsic::getOrInsertDeclaration( - M, Intrinsic::frameaddress, - IRB.getPtrTy(M->getDataLayout().getAllocaAddrSpace())); - auto FrameAddrPtr = - IRB.CreateCall(GetFrameAddr, {Constant::getNullValue(Int32Ty)}); + auto FrameAddrPtr = IRB.CreateIntrinsic( + Intrinsic::frameaddress, + IRB.getPtrTy(M->getDataLayout().getAllocaAddrSpace()), + {Constant::getNullValue(Int32Ty)}); auto FrameAddrInt = IRB.CreatePtrToInt(FrameAddrPtr, IntptrTy); auto LowestStack = IRB.CreateLoad(IntptrTy, SanCovLowestStack); auto IsStackLower = IRB.CreateICmpULT(FrameAddrInt, LowestStack); diff --git a/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp index 388addfab181a45e262495b859d15fcaa1a4fc82..915dc70336ded928a5f2c9dc5d70c193347af1a7 100644 --- a/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp @@ -572,9 +572,7 @@ bool ThreadSanitizer::sanitizeFunction(Function &F, if ((Res || HasCalls) && ClInstrumentFuncEntryExit) { InstrumentationIRBuilder IRB(F.getEntryBlock().getFirstNonPHI()); Value *ReturnAddress = - IRB.CreateCall(Intrinsic::getOrInsertDeclaration( - F.getParent(), Intrinsic::returnaddress), - IRB.getInt32(0)); + IRB.CreateIntrinsic(Intrinsic::returnaddress, {}, IRB.getInt32(0)); IRB.CreateCall(TsanFuncEntry, ReturnAddress); EscapeEnumerator EE(F, "tsan_cleanup", ClHandleCxxExceptions); diff --git a/llvm/lib/Transforms/Scalar/Float2Int.cpp b/llvm/lib/Transforms/Scalar/Float2Int.cpp index 98ecbe400543e1680ec566d02becc577cec14863..9d23c899430095434a797e842a27d121d7e576a3 100644 --- a/llvm/lib/Transforms/Scalar/Float2Int.cpp +++ b/llvm/lib/Transforms/Scalar/Float2Int.cpp @@ -398,9 +398,9 @@ bool Float2IntPass::validateAndTransform(const DataLayout &DL) { } Value *Float2IntPass::convert(Instruction *I, Type *ToTy) { - if (ConvertedInsts.contains(I)) + if (auto It = ConvertedInsts.find(I); It != ConvertedInsts.end()) // Already converted this instruction. - return ConvertedInsts[I]; + return It->second; SmallVector<Value *, 4> NewOperands; for (Value *V : I->operands()) { diff --git a/llvm/lib/Transforms/Scalar/GuardWidening.cpp b/llvm/lib/Transforms/Scalar/GuardWidening.cpp index e7ff2a14469c5e22f595965d9ad97e4c350e7cf1..7fa9f42809091f6b072d924ce49d2a188b0675f1 100644 --- a/llvm/lib/Transforms/Scalar/GuardWidening.cpp +++ b/llvm/lib/Transforms/Scalar/GuardWidening.cpp @@ -980,11 +980,11 @@ StringRef GuardWideningImpl::scoreTypeToString(WideningScore WS) { PreservedAnalyses GuardWideningPass::run(Function &F, FunctionAnalysisManager &AM) { // Avoid requesting analyses if there are no guards or widenable conditions. 
- auto *GuardDecl = F.getParent()->getFunction( - Intrinsic::getName(Intrinsic::experimental_guard)); + auto *GuardDecl = Intrinsic::getDeclarationIfExists( + F.getParent(), Intrinsic::experimental_guard); bool HasIntrinsicGuards = GuardDecl && !GuardDecl->use_empty(); - auto *WCDecl = F.getParent()->getFunction( - Intrinsic::getName(Intrinsic::experimental_widenable_condition)); + auto *WCDecl = Intrinsic::getDeclarationIfExists( + F.getParent(), Intrinsic::experimental_widenable_condition); bool HasWidenableConditions = WCDecl && !WCDecl->use_empty(); if (!HasIntrinsicGuards && !HasWidenableConditions) return PreservedAnalyses::all(); diff --git a/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp b/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp index 2668305e9c8447729a5e15123a2af8ed51b12e5f..ad68fc1f21e2c2573450e38ff3a89e80bd66bc4a 100644 --- a/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp +++ b/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp @@ -598,8 +598,8 @@ bool IndVarSimplify::simplifyAndExtend(Loop *L, LoopInfo *LI) { SmallVector<WideIVInfo, 8> WideIVs; - auto *GuardDecl = L->getBlocks()[0]->getModule()->getFunction( - Intrinsic::getName(Intrinsic::experimental_guard)); + auto *GuardDecl = Intrinsic::getDeclarationIfExists( + L->getBlocks()[0]->getModule(), Intrinsic::experimental_guard); bool HasGuards = GuardDecl && !GuardDecl->use_empty(); SmallVector<PHINode *, 4> LoopPhis; diff --git a/llvm/lib/Transforms/Scalar/InferAlignment.cpp b/llvm/lib/Transforms/Scalar/InferAlignment.cpp index 6e0c206bd198058e51ec8a8b971b2ff480380139..21d373790ac53d54ce5fc3d435e7de97bfc5bbc6 100644 --- a/llvm/lib/Transforms/Scalar/InferAlignment.cpp +++ b/llvm/lib/Transforms/Scalar/InferAlignment.cpp @@ -25,21 +25,14 @@ using namespace llvm; static bool tryToImproveAlign( const DataLayout &DL, Instruction *I, function_ref<Align(Value *PtrOp, Align OldAlign, Align PrefAlign)> Fn) { - if (auto *LI = dyn_cast<LoadInst>(I)) { Value *PtrOp = LI->getPointerOperand(); Align OldAlign = LI->getAlign(); Align NewAlign = Fn(PtrOp, OldAlign, DL.getPrefTypeAlign(LI->getType())); if (NewAlign > OldAlign) { LI->setAlignment(NewAlign); return true; } } else if (auto *SI = dyn_cast<StoreInst>(I)) { Value *PtrOp = SI->getPointerOperand(); Value *ValOp = SI->getValueOperand(); Align OldAlign = SI->getAlign(); Align NewAlign = Fn(PtrOp, OldAlign, DL.getPrefTypeAlign(ValOp->getType())); + + if (auto *PtrOp = getLoadStorePointerOperand(I)) { + Align OldAlign = getLoadStoreAlignment(I); + Align PrefAlign = DL.getPrefTypeAlign(getLoadStoreType(I)); + + Align NewAlign = Fn(PtrOp, OldAlign, PrefAlign); if (NewAlign > OldAlign) { - SI->setAlignment(NewAlign); + setLoadStoreAlignment(I, NewAlign); return true; } } diff --git a/llvm/lib/Transforms/Scalar/JumpThreading.cpp b/llvm/lib/Transforms/Scalar/JumpThreading.cpp index 7a0b661a07799a8ad7a17b08ddfdad742244c3f6..11fdc39464dfb6b4e689576f952786050003ed4b 100644 --- a/llvm/lib/Transforms/Scalar/JumpThreading.cpp +++ b/llvm/lib/Transforms/Scalar/JumpThreading.cpp @@ -296,8 +296,8 @@ bool JumpThreadingPass::runImpl(Function &F_, FunctionAnalysisManager *FAM_, DTU = std::move(DTU_); BFI = BFI_; BPI = BPI_; - auto *GuardDecl = F->getParent()->getFunction( - Intrinsic::getName(Intrinsic::experimental_guard)); + auto *GuardDecl = Intrinsic::getDeclarationIfExists( + F->getParent(), Intrinsic::experimental_guard); HasGuards = GuardDecl && !GuardDecl->use_empty(); // Reduce the number of instructions duplicated when optimizing strictly for diff --git a/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp b/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp index 
4043c0e9a7ddc4ec3751aedb08432b4f7b77d72d..1050cbaa07b82b7fcb0ea8c79bfe3d88649629da 100644 --- a/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp +++ b/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp @@ -403,15 +403,11 @@ bool LoopDataPrefetch::runOnLoop(Loop *L) { Value *PrefPtrValue = SCEVE.expandCodeFor(NextLSCEV, I8Ptr, P.InsertPt); IRBuilder<> Builder(P.InsertPt); - Module *M = BB->getParent()->getParent(); Type *I32 = Type::getInt32Ty(BB->getContext()); - Function *PrefetchFunc = Intrinsic::getOrInsertDeclaration( - M, Intrinsic::prefetch, PrefPtrValue->getType()); - Builder.CreateCall( - PrefetchFunc, - {PrefPtrValue, - ConstantInt::get(I32, P.Writes), - ConstantInt::get(I32, 3), ConstantInt::get(I32, 1)}); + Builder.CreateIntrinsic(Intrinsic::prefetch, PrefPtrValue->getType(), + {PrefPtrValue, ConstantInt::get(I32, P.Writes), + ConstantInt::get(I32, 3), + ConstantInt::get(I32, 1)}); ++NumPrefetches; LLVM_DEBUG(dbgs() << " Access: " << *P.MemI->getOperand(isa<LoadInst>(P.MemI) ? 0 : 1) diff --git a/llvm/lib/Transforms/Scalar/LoopFlatten.cpp b/llvm/lib/Transforms/Scalar/LoopFlatten.cpp index 30369ed7c245cf666a44a6f7896e0df395e0fb56..f3e992c03917898a0042a0a0c057ae98e48dfa06 100644 --- a/llvm/lib/Transforms/Scalar/LoopFlatten.cpp +++ b/llvm/lib/Transforms/Scalar/LoopFlatten.cpp @@ -978,10 +978,10 @@ static bool FlattenLoopPair(FlattenInfo &FI, DominatorTree *DT, LoopInfo *LI, assert(match(Br->getCondition(), m_Zero()) && "Expected branch condition to be false"); IRBuilder<> Builder(Br); - Function *F = Intrinsic::getOrInsertDeclaration( - M, Intrinsic::umul_with_overflow, FI.OuterTripCount->getType()); - Value *Call = Builder.CreateCall(F, {FI.OuterTripCount, FI.InnerTripCount}, - "flatten.mul"); + Value *Call = Builder.CreateIntrinsic( + Intrinsic::umul_with_overflow, FI.OuterTripCount->getType(), + {FI.OuterTripCount, FI.InnerTripCount}, + /*FMFSource=*/nullptr, "flatten.mul"); FI.NewTripCount = Builder.CreateExtractValue(Call, 0, "flatten.tripcount"); Value *Overflow = Builder.CreateExtractValue(Call, 1, "flatten.overflow"); Br->setCondition(Overflow); diff --git a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp index 56006d9ae6924ad3c009ae021bf77c5d8e30028f..2052fc6dadd09f4a94d78694630ebce5f5d4fee5 100644 --- a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp +++ b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp @@ -2121,9 +2121,7 @@ static CallInst *createPopcntIntrinsic(IRBuilder<> &IRBuilder, Value *Val, Value *Ops[] = {Val}; Type *Tys[] = {Val->getType()}; - Module *M = IRBuilder.GetInsertBlock()->getParent()->getParent(); - Function *Func = Intrinsic::getOrInsertDeclaration(M, Intrinsic::ctpop, Tys); - CallInst *CI = IRBuilder.CreateCall(Func, Ops); + CallInst *CI = IRBuilder.CreateIntrinsic(Intrinsic::ctpop, Tys, Ops); CI->setDebugLoc(DL); return CI; @@ -2135,9 +2133,7 @@ static CallInst *createFFSIntrinsic(IRBuilder<> &IRBuilder, Value *Val, Value *Ops[] = {Val, IRBuilder.getInt1(ZeroCheck)}; Type *Tys[] = {Val->getType()}; - Module *M = IRBuilder.GetInsertBlock()->getParent()->getParent(); - Function *Func = Intrinsic::getOrInsertDeclaration(M, IID, Tys); - CallInst *CI = IRBuilder.CreateCall(Func, Ops); + CallInst *CI = IRBuilder.CreateIntrinsic(IID, Tys, Ops); CI->setDebugLoc(DL); return CI; diff --git a/llvm/lib/Transforms/Scalar/LoopPredication.cpp b/llvm/lib/Transforms/Scalar/LoopPredication.cpp index 209b083a4e91a30c15457f58256eaeeca2a2dda1..31694ad1fa508acc646824aeb716bcc3b3588e97 100644 --- 
a/llvm/lib/Transforms/Scalar/LoopPredication.cpp +++ b/llvm/lib/Transforms/Scalar/LoopPredication.cpp @@ -1193,10 +1193,10 @@ bool LoopPredication::runOnLoop(Loop *Loop) { // There is nothing to do if the module doesn't use guards auto *GuardDecl = - M->getFunction(Intrinsic::getName(Intrinsic::experimental_guard)); + Intrinsic::getDeclarationIfExists(M, Intrinsic::experimental_guard); bool HasIntrinsicGuards = GuardDecl && !GuardDecl->use_empty(); - auto *WCDecl = M->getFunction( - Intrinsic::getName(Intrinsic::experimental_widenable_condition)); + auto *WCDecl = Intrinsic::getDeclarationIfExists( + M, Intrinsic::experimental_widenable_condition); bool HasWidenableConditions = PredicateWidenableBranchGuards && WCDecl && !WCDecl->use_empty(); if (!HasIntrinsicGuards && !HasWidenableConditions) diff --git a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp index 575395eda1c5bbe816863e8615d6b028befed8b7..e55b8f6652e319ae5d5f23954cd41524fe75e84c 100644 --- a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp +++ b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp @@ -622,6 +622,9 @@ static bool containsAddRecDependentOnLoop(const SCEV *S, const Loop &L) { /// representation. /// \see Formula::BaseRegs. bool Formula::isCanonical(const Loop &L) const { + assert((Scale == 0 || ScaledReg) && + "ScaledReg must be non-null if Scale is non-zero"); + if (!ScaledReg) return BaseRegs.size() <= 1; @@ -3973,9 +3976,10 @@ void LSRInstance::GenerateReassociationsImpl(LSRUse &LU, unsigned LUIdx, F.UnfoldedOffset = Immediate::getFixed((uint64_t)F.UnfoldedOffset.getFixedValue() + InnerSumSC->getValue()->getZExtValue()); - if (IsScaledReg) + if (IsScaledReg) { F.ScaledReg = nullptr; - else + F.Scale = 0; + } else F.BaseRegs.erase(F.BaseRegs.begin() + Idx); } else if (IsScaledReg) F.ScaledReg = InnerSum; diff --git a/llvm/lib/Transforms/Scalar/LowerGuardIntrinsic.cpp b/llvm/lib/Transforms/Scalar/LowerGuardIntrinsic.cpp index ce35349376c483ffb742bacf8d3d7a3f49bae4e5..5f3e612e73b66190a33b6e71d596b4be060c6410 100644 --- a/llvm/lib/Transforms/Scalar/LowerGuardIntrinsic.cpp +++ b/llvm/lib/Transforms/Scalar/LowerGuardIntrinsic.cpp @@ -27,8 +27,8 @@ using namespace llvm; static bool lowerGuardIntrinsic(Function &F) { // Check if we can cheaply rule out the possibility of not having any work to // do. - auto *GuardDecl = F.getParent()->getFunction( - Intrinsic::getName(Intrinsic::experimental_guard)); + auto *GuardDecl = Intrinsic::getDeclarationIfExists( + F.getParent(), Intrinsic::experimental_guard); if (!GuardDecl || GuardDecl->use_empty()) return false; diff --git a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp index f2ea9f8faf84cd708f9483f784de50797e7b7883..e323b391179ee0eca3b8f0714f4dae03fc8394a4 100644 --- a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp +++ b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp @@ -1290,9 +1290,8 @@ public: if (AllowContraction) { // Use fmuladd for floating point operations and let the backend decide // if that's profitable. 
- Function *FMulAdd = Intrinsic::getOrInsertDeclaration( - Func.getParent(), Intrinsic::fmuladd, A->getType()); - return Builder.CreateCall(FMulAdd, {A, B, Sum}); + return Builder.CreateIntrinsic(Intrinsic::fmuladd, A->getType(), + {A, B, Sum}); } NumComputeOps += getNumOps(A->getType()); Value *Mul = Builder.CreateFMul(A, B); diff --git a/llvm/lib/Transforms/Scalar/LowerWidenableCondition.cpp b/llvm/lib/Transforms/Scalar/LowerWidenableCondition.cpp index 3c977b816a05067cf5708d58517fbc4414dcfb21..ea2b419b17a59c1d2a0593ee6f91cc18f89f026c 100644 --- a/llvm/lib/Transforms/Scalar/LowerWidenableCondition.cpp +++ b/llvm/lib/Transforms/Scalar/LowerWidenableCondition.cpp @@ -26,8 +26,8 @@ using namespace llvm; static bool lowerWidenableCondition(Function &F) { // Check if we can cheaply rule out the possibility of not having any work to // do. - auto *WCDecl = F.getParent()->getFunction( - Intrinsic::getName(Intrinsic::experimental_widenable_condition)); + auto *WCDecl = Intrinsic::getDeclarationIfExists( + F.getParent(), Intrinsic::experimental_widenable_condition); if (!WCDecl || WCDecl->use_empty()) return false; diff --git a/llvm/lib/Transforms/Scalar/MakeGuardsExplicit.cpp b/llvm/lib/Transforms/Scalar/MakeGuardsExplicit.cpp index b9f88ba4e0780e00b3fa367f6aa1e19576418fc5..948466c675e974395ef02ae71fa74b802a870954 100644 --- a/llvm/lib/Transforms/Scalar/MakeGuardsExplicit.cpp +++ b/llvm/lib/Transforms/Scalar/MakeGuardsExplicit.cpp @@ -56,8 +56,8 @@ static void turnToExplicitForm(CallInst *Guard, Function *DeoptIntrinsic) { static bool explicifyGuards(Function &F) { // Check if we can cheaply rule out the possibility of not having any work to // do. - auto *GuardDecl = F.getParent()->getFunction( - Intrinsic::getName(Intrinsic::experimental_guard)); + auto *GuardDecl = Intrinsic::getDeclarationIfExists( + F.getParent(), Intrinsic::experimental_guard); if (!GuardDecl || GuardDecl->use_empty()) return false; diff --git a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp index f3f5ffb6b61b475be71eaa37e45e855b0530e47c..aa3cbc5e4bddc2eab6e5b8be8d10637b7b341192 100644 --- a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp +++ b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp @@ -2920,8 +2920,8 @@ static bool collectUnswitchCandidates( // Whether or not we should also collect guards in the loop. bool CollectGuards = false; if (UnswitchGuards) { - auto *GuardDecl = L.getHeader()->getParent()->getParent()->getFunction( - Intrinsic::getName(Intrinsic::experimental_guard)); + auto *GuardDecl = Intrinsic::getDeclarationIfExists( + L.getHeader()->getParent()->getParent(), Intrinsic::experimental_guard); if (GuardDecl && !GuardDecl->use_empty()) CollectGuards = true; } diff --git a/llvm/lib/Transforms/Utils/CallPromotionUtils.cpp b/llvm/lib/Transforms/Utils/CallPromotionUtils.cpp index 4ead5cdcf29c01482c78135934070399630e7eac..17cba2e642a19a59f97e14cc38c6cf4a7ee809ee 100644 --- a/llvm/lib/Transforms/Utils/CallPromotionUtils.cpp +++ b/llvm/lib/Transforms/Utils/CallPromotionUtils.cpp @@ -529,7 +529,8 @@ CallBase &llvm::promoteCall(CallBase &CB, Function *Callee, // Remove any incompatible attributes for the argument. AttrBuilder ArgAttrs(Ctx, CallerPAL.getParamAttrs(ArgNo)); - ArgAttrs.remove(AttributeFuncs::typeIncompatible(FormalTy)); + ArgAttrs.remove(AttributeFuncs::typeIncompatible( + FormalTy, CallerPAL.getParamAttrs(ArgNo))); // We may have a different byval/inalloca type. 
if (ArgAttrs.getByValType()) @@ -549,7 +550,8 @@ CallBase &llvm::promoteCall(CallBase &CB, Function *Callee, AttrBuilder RAttrs(Ctx, CallerPAL.getRetAttrs()); if (!CallSiteRetTy->isVoidTy() && CallSiteRetTy != CalleeRetTy) { createRetBitCast(CB, CallSiteRetTy, RetBitCast); - RAttrs.remove(AttributeFuncs::typeIncompatible(CalleeRetTy)); + RAttrs.remove( + AttributeFuncs::typeIncompatible(CalleeRetTy, CallerPAL.getRetAttrs())); AttributeChanged = true; } diff --git a/llvm/lib/Transforms/Utils/CloneFunction.cpp b/llvm/lib/Transforms/Utils/CloneFunction.cpp index c6ba85bd9e57d458af92f0db3e628d926f9f03a9..5dc82a8dfb2dbed92f11f09140dbc2e13e9997b8 100644 --- a/llvm/lib/Transforms/Utils/CloneFunction.cpp +++ b/llvm/lib/Transforms/Utils/CloneFunction.cpp @@ -819,9 +819,9 @@ void llvm::CloneAndPruneIntoFromInst(Function *NewFunc, const Function *OldFunc, // Drop all incompatible return attributes that cannot be applied to NewFunc // during cloning, so as to allow instruction simplification to reason on the // old state of the function. The original attributes are restored later. - AttributeMask IncompatibleAttrs = - AttributeFuncs::typeIncompatible(OldFunc->getReturnType()); AttributeList Attrs = NewFunc->getAttributes(); + AttributeMask IncompatibleAttrs = AttributeFuncs::typeIncompatible( + OldFunc->getReturnType(), Attrs.getRetAttrs()); NewFunc->removeRetAttrs(IncompatibleAttrs); // As phi-nodes have been now remapped, allow incremental simplification of diff --git a/llvm/lib/Transforms/Utils/CtorUtils.cpp b/llvm/lib/Transforms/Utils/CtorUtils.cpp index 507729bc5ebc060e04f61bc4c1c6fe78e67243d6..968446c4eee1177be8f62d2ab023ee8c8bad7b5d 100644 --- a/llvm/lib/Transforms/Utils/CtorUtils.cpp +++ b/llvm/lib/Transforms/Utils/CtorUtils.cpp @@ -45,9 +45,9 @@ static void removeGlobalCtors(GlobalVariable *GCL, const BitVector &CtorsToRemov } // Create the new global and insert it next to the existing list. - GlobalVariable *NGV = - new GlobalVariable(CA->getType(), GCL->isConstant(), GCL->getLinkage(), - CA, "", GCL->getThreadLocalMode()); + GlobalVariable *NGV = new GlobalVariable( + CA->getType(), GCL->isConstant(), GCL->getLinkage(), CA, "", + GCL->getThreadLocalMode(), GCL->getAddressSpace()); GCL->getParent()->insertGlobalVariable(GCL->getIterator(), NGV); NGV->takeName(GCL); diff --git a/llvm/lib/Transforms/Utils/InlineFunction.cpp b/llvm/lib/Transforms/Utils/InlineFunction.cpp index 55ad2b6d6200036754de70b5b9b0f432cd3661a9..4ad426285ce2f016dd0d7a27a676c0e9c2c3c815 100644 --- a/llvm/lib/Transforms/Utils/InlineFunction.cpp +++ b/llvm/lib/Transforms/Utils/InlineFunction.cpp @@ -34,6 +34,7 @@ #include "llvm/Analysis/VectorUtils.h" #include "llvm/IR/Argument.h" #include "llvm/IR/AttributeMask.h" +#include "llvm/IR/Attributes.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/CFG.h" #include "llvm/IR/Constant.h" @@ -59,6 +60,7 @@ #include "llvm/IR/MDBuilder.h" #include "llvm/IR/Metadata.h" #include "llvm/IR/Module.h" +#include "llvm/IR/PatternMatch.h" #include "llvm/IR/ProfDataUtils.h" #include "llvm/IR/Type.h" #include "llvm/IR/User.h" @@ -1358,18 +1360,36 @@ static void AddParamAndFnBasicAttributes(const CallBase &CB, auto &Context = CalledFunction->getContext(); // Collect valid attributes for all params. - SmallVector<AttrBuilder> ValidParamAttrs; + SmallVector<AttrBuilder> ValidObjParamAttrs, ValidExactParamAttrs; bool HasAttrToPropagate = false; + // Attributes we can only propagate if the exact parameter is forwarded. + // We can propagate both poison generating and UB generating attributes + // without any extra checks. 
The only attribute that is tricky to propagate + // is `noundef` (skipped for now) as that can create new UB where previous + // behavior was just using a poison value. + static const Attribute::AttrKind ExactAttrsToPropagate[] = { + Attribute::Dereferenceable, Attribute::DereferenceableOrNull, + Attribute::NonNull, Attribute::Alignment, Attribute::Range}; + for (unsigned I = 0, E = CB.arg_size(); I < E; ++I) { - ValidParamAttrs.emplace_back(AttrBuilder{CB.getContext()}); + ValidObjParamAttrs.emplace_back(AttrBuilder{CB.getContext()}); + ValidExactParamAttrs.emplace_back(AttrBuilder{CB.getContext()}); // Access attributes can be propagated to any param with the same underlying // object as the argument. if (CB.paramHasAttr(I, Attribute::ReadNone)) - ValidParamAttrs.back().addAttribute(Attribute::ReadNone); + ValidObjParamAttrs.back().addAttribute(Attribute::ReadNone); if (CB.paramHasAttr(I, Attribute::ReadOnly)) - ValidParamAttrs.back().addAttribute(Attribute::ReadOnly); - HasAttrToPropagate |= ValidParamAttrs.back().hasAttributes(); + ValidObjParamAttrs.back().addAttribute(Attribute::ReadOnly); + + for (Attribute::AttrKind AK : ExactAttrsToPropagate) { + Attribute Attr = CB.getParamAttr(I, AK); + if (Attr.isValid()) + ValidExactParamAttrs.back().addAttribute(Attr); + } + + HasAttrToPropagate |= ValidObjParamAttrs.back().hasAttributes(); + HasAttrToPropagate |= ValidExactParamAttrs.back().hasAttributes(); } // Won't be able to propagate anything. @@ -1391,22 +1411,60 @@ AttributeList AL = NewInnerCB->getAttributes(); for (unsigned I = 0, E = InnerCB->arg_size(); I < E; ++I) { - // Check if the underlying value for the parameter is an argument. - const Value *UnderlyingV = - getUnderlyingObject(InnerCB->getArgOperand(I)); - const Argument *Arg = dyn_cast<Argument>(UnderlyingV); - if (!Arg) + // It's unsound or requires special handling to propagate + // attributes to byval arguments. Even if CalledFunction + // doesn't e.g. write to the argument (readonly), the call to + // NewInnerCB may write to its by-value copy. + if (NewInnerCB->paramHasAttr(I, Attribute::ByVal)) continue; - if (NewInnerCB->paramHasAttr(I, Attribute::ByVal)) - // It's unsound to propagate memory attributes to byval arguments. - // Even if CalledFunction doesn't e.g. write to the argument, - // the call to NewInnerCB may write to its by-value copy. + // Don't bother propagating attrs to constants. + if (match(NewInnerCB->getArgOperand(I), + llvm::PatternMatch::m_ImmConstant())) continue; - unsigned ArgNo = Arg->getArgNo(); + // Check if the underlying value for the parameter is an argument. + const Argument *Arg = dyn_cast<Argument>(InnerCB->getArgOperand(I)); + unsigned ArgNo; + if (Arg) { + ArgNo = Arg->getArgNo(); + // For dereferenceable, dereferenceable_or_null, align, etc... + // we don't want to propagate if the existing param has the same + // attribute with "better" constraints. So remove from the + // new AL if the region of the existing param is larger than + // what we can propagate. 
+ AttrBuilder NewAB{ + Context, AttributeSet::get(Context, ValidExactParamAttrs[ArgNo])}; + if (AL.getParamDereferenceableBytes(I) > + NewAB.getDereferenceableBytes()) + NewAB.removeAttribute(Attribute::Dereferenceable); + if (AL.getParamDereferenceableOrNullBytes(I) > + NewAB.getDereferenceableOrNullBytes()) + NewAB.removeAttribute(Attribute::DereferenceableOrNull); + if (AL.getParamAlignment(I).valueOrOne() > + NewAB.getAlignment().valueOrOne()) + NewAB.removeAttribute(Attribute::Alignment); + if (auto ExistingRange = AL.getParamRange(I)) { + if (auto NewRange = NewAB.getRange()) { + ConstantRange CombinedRange = + ExistingRange->intersectWith(*NewRange); + NewAB.removeAttribute(Attribute::Range); + NewAB.addRangeAttr(CombinedRange); + } + } + AL = AL.addParamAttributes(Context, I, NewAB); + } else { + // Check if the underlying value for the parameter is an argument. + const Value *UnderlyingV = + getUnderlyingObject(InnerCB->getArgOperand(I)); + Arg = dyn_cast<Argument>(UnderlyingV); + if (!Arg) + continue; + ArgNo = Arg->getArgNo(); + } + // If so, propagate its access attributes. - AL = AL.addParamAttributes(Context, I, ValidParamAttrs[ArgNo]); + AL = AL.addParamAttributes(Context, I, ValidObjParamAttrs[ArgNo]); // We can have conflicting attributes from the inner callsite and // to-be-inlined callsite. In that case, choose the most // restrictive. @@ -2057,7 +2115,6 @@ void llvm::updateProfileCallee( static void inlineRetainOrClaimRVCalls(CallBase &CB, objcarc::ARCInstKind RVCallKind, const SmallVectorImpl<ReturnInst *> &Returns) { - Module *Mod = CB.getModule(); assert(objcarc::isRetainOrClaimRV(RVCallKind) && "unexpected ARC function"); bool IsRetainRV = RVCallKind == objcarc::ARCInstKind::RetainRV, IsUnsafeClaimRV = !IsRetainRV; @@ -2089,9 +2146,7 @@ inlineRetainOrClaimRVCalls(CallBase &CB, objcarc::ARCInstKind RVCallKind, // call. if (IsUnsafeClaimRV) { Builder.SetInsertPoint(II); - Function *IFn = - Intrinsic::getOrInsertDeclaration(Mod, Intrinsic::objc_release); - Builder.CreateCall(IFn, RetOpnd, ""); + Builder.CreateIntrinsic(Intrinsic::objc_release, {}, RetOpnd); } II->eraseFromParent(); InsertRetainCall = false; @@ -2125,9 +2180,7 @@ inlineRetainOrClaimRVCalls(CallBase &CB, objcarc::ARCInstKind RVCallKind, // matching autoreleaseRV or an annotated call in the callee. Emit a call // to objc_retain. Builder.SetInsertPoint(RI); - Function *IFn = - Intrinsic::getOrInsertDeclaration(Mod, Intrinsic::objc_retain); - Builder.CreateCall(IFn, RetOpnd, ""); + Builder.CreateIntrinsic(Intrinsic::objc_retain, {}, RetOpnd); } } } @@ -3062,8 +3115,8 @@ llvm::InlineResult llvm::InlineFunction(CallBase &CB, InlineFunctionInfo &IFI, else Builder.CreateRet(NewDeoptCall); // Since the ret type is changed, remove the incompatible attributes. - NewDeoptCall->removeRetAttrs( - AttributeFuncs::typeIncompatible(NewDeoptCall->getType())); + NewDeoptCall->removeRetAttrs(AttributeFuncs::typeIncompatible( + NewDeoptCall->getType(), NewDeoptCall->getRetAttributes())); } // Leave behind the normal returns so we can merge control flow. 
diff --git a/llvm/lib/Transforms/Utils/LoopPeel.cpp b/llvm/lib/Transforms/Utils/LoopPeel.cpp index 760f1619e030c36a92d3ab1976d249bc9b2faf74..3cbde39b30b4e41841833b96a0713b18e9583e26 100644 --- a/llvm/lib/Transforms/Utils/LoopPeel.cpp +++ b/llvm/lib/Transforms/Utils/LoopPeel.cpp @@ -206,13 +206,11 @@ PhiAnalyzer::PhiAnalyzer(const Loop &L, unsigned MaxIterations) // G(%y) = Unknown otherwise (including phi not in header block) PhiAnalyzer::PeelCounter PhiAnalyzer::calculate(const Value &V) { // If we already know the answer, take it from the map. - auto I = IterationsToInvariance.find(&V); - if (I != IterationsToInvariance.end()) - return I->second; - - // Place Unknown to map to avoid infinite recursion. Such + // Otherwise, place Unknown to map to avoid infinite recursion. Such // cycles can never stop on an invariant. - IterationsToInvariance[&V] = Unknown; + auto [I, Inserted] = IterationsToInvariance.try_emplace(&V, Unknown); + if (!Inserted) + return I->second; if (L.isLoopInvariant(&V)) // Loop invariant so known at start. diff --git a/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp b/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp index 77abf160dc70f9e349162c62d2c9d83e7393fa48..cccb9dae17df6a7ac63b90f4fe74431b29c32ed2 100644 --- a/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp +++ b/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp @@ -268,12 +268,11 @@ bool isLifetimeIntrinsic(Value *V) { Value *readRegister(IRBuilder<> &IRB, StringRef Name) { Module *M = IRB.GetInsertBlock()->getParent()->getParent(); - Function *ReadRegister = Intrinsic::getOrInsertDeclaration( - M, Intrinsic::read_register, IRB.getIntPtrTy(M->getDataLayout())); MDNode *MD = MDNode::get(M->getContext(), {MDString::get(M->getContext(), Name)}); Value *Args[] = {MetadataAsValue::get(M->getContext(), MD)}; - return IRB.CreateCall(ReadRegister, Args); + return IRB.CreateIntrinsic(Intrinsic::read_register, + IRB.getIntPtrTy(M->getDataLayout()), Args); } Value *getPC(const Triple &TargetTriple, IRBuilder<> &IRB) { @@ -287,12 +286,10 @@ Value *getPC(const Triple &TargetTriple, IRBuilder<> &IRB) { Value *getFP(IRBuilder<> &IRB) { Function *F = IRB.GetInsertBlock()->getParent(); Module *M = F->getParent(); - auto *GetStackPointerFn = Intrinsic::getOrInsertDeclaration( - M, Intrinsic::frameaddress, - IRB.getPtrTy(M->getDataLayout().getAllocaAddrSpace())); return IRB.CreatePtrToInt( - IRB.CreateCall(GetStackPointerFn, - {Constant::getNullValue(IRB.getInt32Ty())}), + IRB.CreateIntrinsic(Intrinsic::frameaddress, + IRB.getPtrTy(M->getDataLayout().getAllocaAddrSpace()), + {Constant::getNullValue(IRB.getInt32Ty())}), IRB.getIntPtrTy(M->getDataLayout())); } diff --git a/llvm/lib/Transforms/Utils/SCCPSolver.cpp b/llvm/lib/Transforms/Utils/SCCPSolver.cpp index 101d60525f41619fa486cbf4b38c80b346b35eb6..c65710ea7551ac9e2d5d909c647ffc19bb32d2f2 100644 --- a/llvm/lib/Transforms/Utils/SCCPSolver.cpp +++ b/llvm/lib/Transforms/Utils/SCCPSolver.cpp @@ -630,10 +630,7 @@ private: } // Add U as additional user of V. - void addAdditionalUser(Value *V, User *U) { - auto Iter = AdditionalUsers.insert({V, {}}); - Iter.first->second.insert(U); - } + void addAdditionalUser(Value *V, User *U) { AdditionalUsers[V].insert(U); } // Mark I's users as changed, including AdditionalUsers. 
void markUsersAsChanged(Value *I) { diff --git a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp index bc619e5098e09ff90b4b8999eb7757824afc44fe..39da38e4918176c6297226d33fbc9bf513f91ffa 100644 --- a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp +++ b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp @@ -681,7 +681,21 @@ Value *SCEVExpander::visitUDivExpr(const SCEVUDivExpr *S) { SCEV::FlagAnyWrap, /*IsSafeToHoist*/ true); } - Value *RHS = expand(S->getRHS()); + const SCEV *RHSExpr = S->getRHS(); + Value *RHS = expand(RHSExpr); + if (SafeUDivMode) { + bool GuaranteedNotPoison = + ScalarEvolution::isGuaranteedNotToBePoison(RHSExpr); + if (!GuaranteedNotPoison) + RHS = Builder.CreateFreeze(RHS); + + // We need an umax if either RHSExpr is not known to be non-zero, or if it + // is not guaranteed to be non-poison. In the latter case, the frozen poison + // may be 0. + if (!SE.isKnownNonZero(RHSExpr) || !GuaranteedNotPoison) + RHS = Builder.CreateIntrinsic(RHS->getType(), Intrinsic::umax, + {RHS, ConstantInt::get(RHS->getType(), 1)}); + } return InsertBinop(Instruction::UDiv, LHS, RHS, SCEV::FlagAnyWrap, /*IsSafeToHoist*/ SE.isKnownNonZero(S->getRHS())); } @@ -1376,11 +1390,14 @@ Value *SCEVExpander::visitSignExtendExpr(const SCEVSignExtendExpr *S) { Value *SCEVExpander::expandMinMaxExpr(const SCEVNAryExpr *S, Intrinsic::ID IntrinID, Twine Name, bool IsSequential) { + bool PrevSafeMode = SafeUDivMode; + SafeUDivMode |= IsSequential; Value *LHS = expand(S->getOperand(S->getNumOperands() - 1)); Type *Ty = LHS->getType(); if (IsSequential) LHS = Builder.CreateFreeze(LHS); for (int i = S->getNumOperands() - 2; i >= 0; --i) { + SafeUDivMode = (IsSequential && i != 0) || PrevSafeMode; Value *RHS = expand(S->getOperand(i)); if (IsSequential && i != 0) RHS = Builder.CreateFreeze(RHS); @@ -1395,6 +1412,7 @@ Value *SCEVExpander::expandMinMaxExpr(const SCEVNAryExpr *S, } LHS = Sel; } + SafeUDivMode = PrevSafeMode; return LHS; } @@ -2139,10 +2157,9 @@ Value *SCEVExpander::generateOverflowCheck(const SCEVAddRecExpr *AR, MulV = TruncTripCount; OfMul = ConstantInt::getFalse(MulV->getContext()); } else { - auto *MulF = Intrinsic::getOrInsertDeclaration( - Loc->getModule(), Intrinsic::umul_with_overflow, Ty); - CallInst *Mul = - Builder.CreateCall(MulF, {AbsStep, TruncTripCount}, "mul"); + CallInst *Mul = Builder.CreateIntrinsic(Intrinsic::umul_with_overflow, Ty, + {AbsStep, TruncTripCount}, + /*FMFSource=*/nullptr, "mul"); MulV = Builder.CreateExtractValue(Mul, 0, "mul.result"); OfMul = Builder.CreateExtractValue(Mul, 1, "mul.overflow"); } diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp index 566ae2cf1936e9c83c7be4ae388504ff65794191..72228b445a8b6eb2eeab207a606c79ee66845c0f 100644 --- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp @@ -7243,7 +7243,7 @@ static bool reduceSwitchRange(SwitchInst *SI, IRBuilder<> &Builder, for (auto Case : SI->cases()) { auto *Orig = Case.getCaseValue(); - auto Sub = Orig->getValue() - APInt(Ty->getBitWidth(), Base); + auto Sub = Orig->getValue() - APInt(Ty->getBitWidth(), Base, true); Case.setValue(cast<ConstantInt>(ConstantInt::get(Ty, Sub.lshr(Shift)))); } return true; } diff --git a/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp b/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp index db2acb9eed0938eaacf9673a3420d386b05a18b1..79e91ad097cf00e00345ba0198ce480b59b7f890 100644 --- a/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp +++ 
b/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp @@ -342,7 +342,13 @@ static Value *copyFlags(const CallInst &Old, Value *New) { static Value *mergeAttributesAndFlags(CallInst *NewCI, const CallInst &Old) { NewCI->setAttributes(AttributeList::get( NewCI->getContext(), {NewCI->getAttributes(), Old.getAttributes()})); - NewCI->removeRetAttrs(AttributeFuncs::typeIncompatible(NewCI->getType())); + NewCI->removeRetAttrs(AttributeFuncs::typeIncompatible( + NewCI->getType(), NewCI->getRetAttributes())); + for (unsigned I = 0; I < NewCI->arg_size(); ++I) + NewCI->removeParamAttrs( + I, AttributeFuncs::typeIncompatible(NewCI->getArgOperand(I)->getType(), + NewCI->getParamAttributes(I))); + return copyFlags(Old, NewCI); } @@ -1958,10 +1964,9 @@ static Value *optimizeDoubleFP(CallInst *CI, IRBuilderBase &B, // g((double) float) -> (double) gf(float) Value *R; if (IsIntrinsic) { - Module *M = CI->getModule(); Intrinsic::ID IID = CalleeFn->getIntrinsicID(); - Function *Fn = Intrinsic::getOrInsertDeclaration(M, IID, B.getFloatTy()); - R = isBinary ? B.CreateCall(Fn, V) : B.CreateCall(Fn, V[0]); + R = isBinary ? B.CreateIntrinsic(IID, B.getFloatTy(), V) + : B.CreateIntrinsic(IID, B.getFloatTy(), V[0]); } else { AttributeList CalleeAttrs = CalleeFn->getAttributes(); R = isBinary ? emitBinaryFloatFnCall(V[0], V[1], TLI, CalleeName, B, diff --git a/llvm/lib/Transforms/Utils/VecCloneVP.cpp b/llvm/lib/Transforms/Utils/VecCloneVP.cpp index 6c08bf488376715889666b0f6d3318108aaac4d5..74b0cc7dd5aaca43e8cb41a96265b7e6f5286de9 100644 --- a/llvm/lib/Transforms/Utils/VecCloneVP.cpp +++ b/llvm/lib/Transforms/Utils/VecCloneVP.cpp @@ -295,16 +295,16 @@ Function *VecCloneVPPass::cloneFunction(Module &M, Function &F, // does not apply to its vector counterpart). This must be done after cloning // the function because CloneFunctionInto() transfers parameter attributes // from the original parameters in the Vmap. - uint64_t Idx = 0; - for (auto &Arg : Clone->args()) { + for (auto [Idx, Arg] : llvm::enumerate(Clone->args())) { Type *ArgType = Arg.getType(); - auto AM = AttributeFuncs::typeIncompatible(ArgType); + AttributeMask AM = + AttributeFuncs::typeIncompatible(ArgType, Arg.getAttributes()); Clone->removeParamAttrs(Idx, AM); - ++Idx; } - auto AM = AttributeFuncs::typeIncompatible(ReturnType); - Clone->removeRetAttrs(AM); + AttributeMask IncompatibleAttrs = AttributeFuncs::typeIncompatible( + ReturnType, Clone->getAttributes().getRetAttrs()); + Clone->removeRetAttrs(IncompatibleAttrs); // Add zeroext attribute to VL operand. 
Clone->addParamAttr(Clone->arg_size() - 1, Attribute::ZExt); diff --git a/llvm/lib/Transforms/Vectorize/CMakeLists.txt b/llvm/lib/Transforms/Vectorize/CMakeLists.txt index 9c2e7c1e0c5bc7b146ec89675bf1d82be7dfc13e..f4e98e576379a4ed90582376972322e9f78fbec0 100644 --- a/llvm/lib/Transforms/Vectorize/CMakeLists.txt +++ b/llvm/lib/Transforms/Vectorize/CMakeLists.txt @@ -6,7 +6,9 @@ add_llvm_component_library(LLVMVectorize SandboxVectorizer/DependencyGraph.cpp SandboxVectorizer/Interval.cpp SandboxVectorizer/Passes/BottomUpVec.cpp + SandboxVectorizer/Passes/RegionsFromMetadata.cpp SandboxVectorizer/SandboxVectorizer.cpp + SandboxVectorizer/SandboxVectorizerPassBuilder.cpp SandboxVectorizer/SeedCollector.cpp SLPVectorizer.cpp Vectorize.cpp diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 91befc5a6aa5b5adbc4d32327d86aa620bfdb49b..b08d70a0354d19bb6350740adb96b7a81ada60fa 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -680,9 +680,6 @@ protected: /// there can be multiple exiting edges reaching this block. BasicBlock *LoopExitBlock; - /// The scalar loop body. - BasicBlock *LoopScalarBody; - /// A list of all bypass blocks. The first block is the entry of the loop. SmallVector LoopBypassBlocks; @@ -861,8 +858,7 @@ protected: }; } // end namespace llvm -/// Look for a meaningful debug location on the instruction or it's -/// operands. +/// Look for a meaningful debug location on the instruction or its operands. static DebugLoc getDebugLocFromInstOrOperands(Instruction *I) { if (!I) return DebugLoc(); @@ -1910,7 +1906,7 @@ public: /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can /// accurately estimate the cost of the runtime checks. The blocks are - /// un-linked from the IR and is added back during vector code generation. If + /// un-linked from the IR and are added back during vector code generation. If /// there is no vector code generation, the check blocks are removed /// completely. void create(Loop *L, LoopVectorizationLegality &LVL, @@ -2678,7 +2674,6 @@ BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(BasicBlock *Bypass) { } void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) { - LoopScalarBody = OrigLoop->getHeader(); LoopVectorPreHeader = OrigLoop->getLoopPreheader(); assert(LoopVectorPreHeader && "Invalid loop structure"); LoopExitBlock = OrigLoop->getUniqueExitBlock(); // may be nullptr @@ -2728,7 +2723,7 @@ PHINode *InnerLoopVectorizer::createInductionResumeValue( } } - // Create phi nodes to merge from the backedge-taken check block. + // Create phi nodes to merge from the backedge-taken check block. PHINode *BCResumeVal = PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val", LoopScalarPreHeader->getFirstNonPHIIt()); @@ -3162,7 +3157,7 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State, // Set/update profile weights for the vector and remainder loops as original // loop iterations are now distributed among them. Note that original loop - // represented by LoopScalarBody becomes remainder loop after vectorization. + // becomes the scalar remainder loop after vectorization. 
// // For cases like foldTailByMasking() and requiresScalarEpiloque() we may // end up getting slightly roughened result but that should be OK since @@ -3170,12 +3165,11 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State, // vector code caused by legality checks is ignored, assigning all the weight // to the vector loop, optimistically. // - // For scalable vectorization we can't know at compile time how many iterations - // of the loop are handled in one vector iteration, so instead assume a pessimistic - // vscale of '1'. - Loop *ScalarLoop = LI->getLoopFor(LoopScalarBody); + // For scalable vectorization we can't know at compile time how many + // iterations of the loop are handled in one vector iteration, so instead + // assume a pessimistic vscale of '1'. Loop *VectorLoop = LI->getLoopFor(HeaderBB); - setProfileInfoAfterUnrolling(ScalarLoop, VectorLoop, ScalarLoop, + setProfileInfoAfterUnrolling(OrigLoop, VectorLoop, OrigLoop, VF.getKnownMinValue() * UF); } @@ -3219,7 +3213,8 @@ void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) { // We can't sink an instruction if it is a phi node, is not in the loop, // may have side effects or may read from memory. - // TODO Could dor more granular checking to allow sinking a load past non-store instructions. + // TODO: Could do more granular checking to allow sinking + // a load past non-store instructions. if (!I || isa(I) || !VectorLoop->contains(I) || I->mayHaveSideEffects() || I->mayReadFromMemory()) continue; @@ -3637,9 +3632,8 @@ void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { // (2) Add to the worklist all bitcast and getelementptr instructions used by // memory accesses requiring a scalar use. The pointer operands of loads and - // stores will be scalar as long as the memory accesses is not a gather or - // scatter operation. The value operand of a store will remain scalar if the - // store is scalarized. + // stores will be scalar unless the operation is a gather or scatter. + // The value operand of a store will remain scalar if the store is scalarized. for (auto *BB : TheLoop->blocks()) for (auto &I : *BB) { if (auto *Load = dyn_cast(&I)) { @@ -3916,7 +3910,7 @@ bool LoopVectorizationCostModel::interleavedAccessCanBeWidened( assert(Group && "Must have a group."); unsigned InterleaveFactor = Group->getFactor(); - // If the instruction's allocated size doesn't equal it's type size, it + // If the instruction's allocated size doesn't equal its type size, it // requires padding and will be scalarized. auto &DL = I->getDataLayout(); auto *ScalarTy = getLoadStoreType(I); @@ -4016,11 +4010,11 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { assert(VF.isVector() && !Uniforms.contains(VF) && "This function should not be visited twice for the same VF"); - // Visit the list of Uniforms. If we'll not find any uniform value, we'll - // not analyze again. Uniforms.count(VF) will return 1. + // Visit the list of Uniforms. If we find no uniform value, we won't + // analyze again. Uniforms.count(VF) will return 1. Uniforms[VF].clear(); - // We now know that the loop is vectorizable! + // Now we know that the loop is vectorizable! // Collect instructions inside the loop that will remain uniform after // vectorization. @@ -4067,7 +4061,7 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { auto PrevVF = VF.divideCoefficientBy(2); // Return true if all lanes perform the same memory operation, and we can - // thus chose to execute only one. 
+ // thus choose to execute only one. auto IsUniformMemOpUse = [&](Instruction *I) { // If the value was already known to not be uniform for the previous // (smaller VF), it cannot be uniform for the larger VF. @@ -4475,7 +4469,7 @@ FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF( FixedScalableVFPair LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) { - // TODO: It may by useful to do since it's still likely to be dynamically + // TODO: It may be useful to do since it's still likely to be dynamically // uniform if the target can skip. reportVectorizationFailure( "Not inserting runtime ptr check for divergent target", @@ -4557,7 +4551,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() && "No decisions should have been taken at this point"); // Note: There is no need to invalidate any cost modeling decisions here, as - // non where taken so far. + // none were taken so far. InterleaveInfo.invalidateGroupsRequiringScalarEpilogue(); } @@ -8700,7 +8694,7 @@ EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck( BasicBlock *Bypass, BasicBlock *Insert) { assert(EPI.TripCount && - "Expected trip count to have been safed in the first pass."); + "Expected trip count to have been saved in the first pass."); assert( (!isa(EPI.TripCount) || DT->dominates(cast(EPI.TripCount)->getParent(), Insert)) && diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index b6ad94ec7f03d0bad337ace2fa5075bef5c84346..3d07a97b65eea708d749caed9a5cec60c35ddc5f 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -291,8 +291,6 @@ getFloorFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty, if (NumParts == 0 || NumParts >= Sz) return bit_floor(Sz); unsigned RegVF = bit_ceil(divideCeil(Sz, NumParts)); - if (RegVF > Sz) - return bit_floor(Sz); return (Sz / RegVF) * RegVF; } @@ -18218,13 +18216,22 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef VL, BoUpSLP &R, if ((VF > MinVF && ActualVF <= VF / 2) || (VF == MinVF && ActualVF < 2)) break; - ArrayRef Ops = VL.slice(I, ActualVF); - // Check that a previous iteration of this loop did not delete the Value. - if (llvm::any_of(Ops, [&R](Value *V) { - auto *I = dyn_cast(V); - return I && R.isDeleted(I); - })) - continue; + SmallVector Ops(ActualVF, nullptr); + unsigned Idx = 0; + for (Value *V : VL.drop_front(I)) { + // Check that a previous iteration of this loop did not delete the + // Value. + if (auto *Inst = dyn_cast(V); + !Inst || !R.isDeleted(Inst)) { + Ops[Idx] = V; + ++Idx; + if (Idx == ActualVF) + break; + } + } + // Not enough vectorizable instructions - exit. + if (Idx != ActualVF) + break; LLVM_DEBUG(dbgs() << "SLP: Analyzing " << ActualVF << " operations " << "\n"); @@ -18292,7 +18299,8 @@ bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) { // Vectorize in current basic block only. 
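// The new gather loop in tryToVectorizeList above replaces a fixed-size
// slice with "take the first ActualVF values that are still alive, or give
// up". The same shape in standalone form, with a plain predicate standing in
// for BoUpSLP::isDeleted (toy names, for illustration only):
#include <functional>
#include <vector>

// Returns the first K survivors of IsDeleted from Values, or an empty vector
// if fewer than K survive (mirroring the "not enough instructions" bail-out).
std::vector<int> takeKSurvivors(const std::vector<int> &Values, unsigned K,
                                const std::function<bool(int)> &IsDeleted) {
  std::vector<int> Ops;
  Ops.reserve(K);
  for (int V : Values) {
    if (IsDeleted(V))
      continue; // Removed by an earlier vectorization attempt; skip it.
    Ops.push_back(V);
    if (Ops.size() == K)
      return Ops; // Collected a full bundle.
  }
  return {}; // Not enough vectorizable values.
}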
auto *Op0 = dyn_cast(I->getOperand(0)); auto *Op1 = dyn_cast(I->getOperand(1)); - if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P) + if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P || + R.isDeleted(Op0) || R.isDeleted(Op1)) return false; // First collect all possible candidates @@ -18305,18 +18313,18 @@ bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) { if (A && B && B->hasOneUse()) { auto *B0 = dyn_cast(B->getOperand(0)); auto *B1 = dyn_cast(B->getOperand(1)); - if (B0 && B0->getParent() == P) + if (B0 && B0->getParent() == P && !R.isDeleted(B0)) Candidates.emplace_back(A, B0); - if (B1 && B1->getParent() == P) + if (B1 && B1->getParent() == P && !R.isDeleted(B1)) Candidates.emplace_back(A, B1); } // Try to skip A. if (B && A && A->hasOneUse()) { auto *A0 = dyn_cast(A->getOperand(0)); auto *A1 = dyn_cast(A->getOperand(1)); - if (A0 && A0->getParent() == P) + if (A0 && A0->getParent() == P && !R.isDeleted(A0)) Candidates.emplace_back(A0, B); - if (A1 && A1->getParent() == P) + if (A1 && A1->getParent() == P && !R.isDeleted(A1)) Candidates.emplace_back(A1, B); } @@ -19069,8 +19077,7 @@ public: unsigned ReduxWidth = NumReducedVals; if (!VectorizeNonPowerOf2 || !has_single_bit(ReduxWidth + 1)) - ReduxWidth = getFloorFullVectorNumberOfElements( - *TTI, Candidates.front()->getType(), ReduxWidth); + ReduxWidth = bit_floor(ReduxWidth); ReduxWidth = std::min(ReduxWidth, MaxElts); unsigned Start = 0; @@ -19078,7 +19085,10 @@ public: // Restarts vectorization attempt with lower vector factor. unsigned PrevReduxWidth = ReduxWidth; bool CheckForReusedReductionOpsLocal = false; - auto AdjustReducedVals = [&](bool IgnoreVL = false) { + auto &&AdjustReducedVals = [&Pos, &Start, &ReduxWidth, NumReducedVals, + &CheckForReusedReductionOpsLocal, + &PrevReduxWidth, &V, + &IgnoreList](bool IgnoreVL = false) { bool IsAnyRedOpGathered = !IgnoreVL && V.isAnyGathered(IgnoreList); if (!CheckForReusedReductionOpsLocal && PrevReduxWidth == ReduxWidth) { // Check if any of the reduction ops are gathered. 
If so, worth @@ -19089,10 +19099,7 @@ public: if (Pos < NumReducedVals - ReduxWidth + 1) return IsAnyRedOpGathered; Pos = Start; - --ReduxWidth; - if (ReduxWidth > 1) - ReduxWidth = getFloorFullVectorNumberOfElements( - *TTI, Candidates.front()->getType(), ReduxWidth); + ReduxWidth = bit_ceil(ReduxWidth) / 2; return IsAnyRedOpGathered; }; bool AnyVectorized = false; @@ -19324,10 +19331,7 @@ public: } Pos += ReduxWidth; Start = Pos; - ReduxWidth = NumReducedVals - Pos; - if (ReduxWidth > 1) - ReduxWidth = getFloorFullVectorNumberOfElements( - *TTI, Candidates.front()->getType(), NumReducedVals - Pos); + ReduxWidth = llvm::bit_floor(NumReducedVals - Pos); AnyVectorized = true; } if (OptReusedScalars && !AnyVectorized) { @@ -19778,16 +19782,16 @@ static void findBuildAggregate_rec(Instruction *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl &BuildVectorOpds, SmallVectorImpl &InsertElts, - unsigned OperandOffset) { + unsigned OperandOffset, const BoUpSLP &R) { do { Value *InsertedOperand = LastInsertInst->getOperand(1); std::optional OperandIndex = getElementIndex(LastInsertInst, OperandOffset); - if (!OperandIndex) + if (!OperandIndex || R.isDeleted(LastInsertInst)) return; if (isa(InsertedOperand)) { findBuildAggregate_rec(cast(InsertedOperand), TTI, - BuildVectorOpds, InsertElts, *OperandIndex); + BuildVectorOpds, InsertElts, *OperandIndex, R); } else { BuildVectorOpds[*OperandIndex] = InsertedOperand; @@ -19816,7 +19820,8 @@ static void findBuildAggregate_rec(Instruction *LastInsertInst, static bool findBuildAggregate(Instruction *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl &BuildVectorOpds, - SmallVectorImpl &InsertElts) { + SmallVectorImpl &InsertElts, + const BoUpSLP &R) { assert((isa(LastInsertInst) || isa(LastInsertInst)) && @@ -19831,7 +19836,8 @@ static bool findBuildAggregate(Instruction *LastInsertInst, BuildVectorOpds.resize(*AggregateSize); InsertElts.resize(*AggregateSize); - findBuildAggregate_rec(LastInsertInst, TTI, BuildVectorOpds, InsertElts, 0); + findBuildAggregate_rec(LastInsertInst, TTI, BuildVectorOpds, InsertElts, 0, + R); llvm::erase(BuildVectorOpds, nullptr); llvm::erase(InsertElts, nullptr); if (BuildVectorOpds.size() >= 2) @@ -19952,7 +19958,7 @@ static bool isReductionCandidate(Instruction *I) { } bool SLPVectorizerPass::vectorizeHorReduction( - PHINode *P, Instruction *Root, BasicBlock *BB, BoUpSLP &R, TargetTransformInfo *TTI, + PHINode *P, Instruction *Root, BasicBlock *BB, BoUpSLP &R, SmallVectorImpl &PostponedInsts) { if (!ShouldVectorizeHor) return false; @@ -19985,7 +19991,7 @@ bool SLPVectorizerPass::vectorizeHorReduction( Stack.emplace(SelectRoot(), 0); SmallPtrSet VisitedInstrs; bool Res = false; - auto &&TryToReduce = [this, TTI, &R](Instruction *Inst) -> Value * { + auto &&TryToReduce = [this, &R](Instruction *Inst) -> Value * { if (R.isAnalyzedReductionRoot(Inst)) return nullptr; if (!isReductionCandidate(Inst)) @@ -20052,10 +20058,9 @@ bool SLPVectorizerPass::vectorizeHorReduction( } bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Instruction *Root, - BasicBlock *BB, BoUpSLP &R, - TargetTransformInfo *TTI) { + BasicBlock *BB, BoUpSLP &R) { SmallVector PostponedInsts; - bool Res = vectorizeHorReduction(P, Root, BB, R, TTI, PostponedInsts); + bool Res = vectorizeHorReduction(P, Root, BB, R, PostponedInsts); Res |= tryToVectorize(PostponedInsts, R); return Res; } @@ -20077,7 +20082,7 @@ bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI, SmallVector BuildVectorOpds; SmallVector BuildVectorInsts; 
- if (!findBuildAggregate(IVI, TTI, BuildVectorOpds, BuildVectorInsts)) + if (!findBuildAggregate(IVI, TTI, BuildVectorOpds, BuildVectorInsts, R)) return false; if (MaxVFOnly && BuildVectorOpds.size() == 2) { @@ -20099,7 +20104,7 @@ bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI, SmallVector BuildVectorInsts; SmallVector BuildVectorOpds; SmallVector Mask; - if (!findBuildAggregate(IEI, TTI, BuildVectorOpds, BuildVectorInsts) || + if (!findBuildAggregate(IEI, TTI, BuildVectorOpds, BuildVectorInsts, R) || (llvm::all_of(BuildVectorOpds, IsaPred) && isFixedVectorShuffle(BuildVectorOpds, Mask))) return false; @@ -20320,7 +20325,7 @@ bool SLPVectorizerPass::vectorizeCmpInsts(iterator_range CmpInsts, continue; for (Value *Op : I->operands()) if (auto *RootOp = dyn_cast(Op)) - Changed |= vectorizeRootInstruction(nullptr, RootOp, BB, R, TTI); + Changed |= vectorizeRootInstruction(nullptr, RootOp, BB, R); } // Try to vectorize operands as vector bundles. for (CmpInst *I : CmpInsts) { @@ -20387,7 +20392,7 @@ bool SLPVectorizerPass::vectorizeInserts(InstSetVector &Instructions, // pass2 - try to vectorize reductions only if (R.isDeleted(I)) continue; - OpsChanged |= vectorizeHorReduction(nullptr, I, BB, R, TTI, PostponedInsts); + OpsChanged |= vectorizeHorReduction(nullptr, I, BB, R, PostponedInsts); if (R.isDeleted(I) || isa(I)) continue; // pass3 - try to match and vectorize a buildvector sequence. @@ -20647,7 +20652,7 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { if (P->getNumIncomingValues() == 2) { // Try to match and vectorize a horizontal reduction. Instruction *Root = getReductionInstr(DT, P, BB, LI); - if (Root && vectorizeRootInstruction(P, Root, BB, R, TTI)) { + if (Root && vectorizeRootInstruction(P, Root, BB, R)) { Changed = true; It = BB->begin(); E = BB->end(); @@ -20669,8 +20674,8 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { // vectorization. if (auto *PI = dyn_cast(P->getIncomingValue(I)); PI && !IsInPostProcessInstrs(PI)) { - bool Res = vectorizeRootInstruction(nullptr, PI, - P->getIncomingBlock(I), R, TTI); + bool Res = + vectorizeRootInstruction(nullptr, PI, P->getIncomingBlock(I), R); Changed |= Res; if (Res && R.isDeleted(P)) { It = BB->begin(); @@ -20704,7 +20709,7 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { if (auto *VI = dyn_cast(V); VI && !IsInPostProcessInstrs(VI)) // Try to match and vectorize a horizontal reduction. 
- OpsChanged |= vectorizeRootInstruction(nullptr, VI, BB, R, TTI); + OpsChanged |= vectorizeRootInstruction(nullptr, VI, BB, R); } } // Start vectorization of post-process list of instructions from the diff --git a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.cpp b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.cpp index 77198f932a3ec009009d4ede658cd9cc1ced7635..6171d5e52b5869e357e9e5e01cdb5054ba07caf0 100644 --- a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.cpp +++ b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.cpp @@ -7,43 +7,17 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.h" + #include "llvm/ADT/SmallVector.h" #include "llvm/SandboxIR/Function.h" #include "llvm/SandboxIR/Instruction.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Transforms/Vectorize/SandboxVectorizer/Passes/NullPass.h" +#include "llvm/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizerPassBuilder.h" namespace llvm::sandboxir { -static cl::opt - PrintPassPipeline("sbvec-print-pass-pipeline", cl::init(false), cl::Hidden, - cl::desc("Prints the pass pipeline and returns.")); - -/// A magic string for the default pass pipeline. -static const char *DefaultPipelineMagicStr = "*"; - -static cl::opt UserDefinedPassPipeline( - "sbvec-passes", cl::init(DefaultPipelineMagicStr), cl::Hidden, - cl::desc("Comma-separated list of vectorizer passes. If not set " - "we run the predefined pipeline.")); - -static std::unique_ptr createRegionPass(StringRef Name) { -#define REGION_PASS(NAME, CREATE_PASS) \ - if (Name == NAME) \ - return std::make_unique(CREATE_PASS); -#include "PassRegistry.def" - return nullptr; -} - -BottomUpVec::BottomUpVec() : FunctionPass("bottom-up-vec"), RPM("rpm") { - // Create a pipeline to be run on each Region created by BottomUpVec. - if (UserDefinedPassPipeline == DefaultPipelineMagicStr) { - // TODO: Add default passes to RPM. - } else { - // Create the user-defined pipeline. - RPM.setPassPipeline(UserDefinedPassPipeline, createRegionPass); - } -} +BottomUpVec::BottomUpVec(StringRef Pipeline) + : FunctionPass("bottom-up-vec"), + RPM("rpm", Pipeline, SandboxVectorizerPassBuilder::createRegionPass) {} // TODO: This is a temporary function that returns some seeds. // Replace this with SeedCollector's function when it lands. @@ -82,11 +56,6 @@ void BottomUpVec::vectorizeRec(ArrayRef Bndl) { void BottomUpVec::tryVectorize(ArrayRef Bndl) { vectorizeRec(Bndl); } bool BottomUpVec::runOnFunction(Function &F) { - if (PrintPassPipeline) { - RPM.printPipeline(outs()); - return false; - } - Change = false; // TODO: Start from innermost BBs first for (auto &BB : F) { diff --git a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/PassRegistry.def b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/PassRegistry.def index bbb0dcba1ea51642ab4df1ed2600d381b92dc250..0dc72842f1abe0ed6ae42b1b210c1460e3b4d428 100644 --- a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/PassRegistry.def +++ b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/PassRegistry.def @@ -14,9 +14,19 @@ // NOTE: NO INCLUDE GUARD DESIRED! 
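// PassRegistry.def below is an X-macro table: it deliberately has no include
// guard so each includer can define REGION_PASS / FUNCTION_PASS_WITH_PARAMS
// to expand the same rows into whatever it needs (a factory here, a string
// table elsewhere). A self-contained sketch of the technique with made-up
// passes, inlining the rows that would normally live in the shared .def file:
#include <memory>
#include <string>

struct RegionPass { virtual ~RegionPass() = default; };
struct NullPass : RegionPass {};
struct PrintCountPass : RegionPass {};

// One expansion of the table: map a pass name to a freshly built pass object.
std::unique_ptr<RegionPass> createPass(const std::string &Name) {
#define REGION_PASS(NAME, CLASS_NAME)                                         \
  if (Name == NAME)                                                           \
    return std::make_unique<CLASS_NAME>();
  REGION_PASS("null", NullPass)
  REGION_PASS("print-count", PrintCountPass)
#undef REGION_PASS
  return nullptr; // Unknown pass name.
}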
#ifndef REGION_PASS -#define REGION_PASS(NAME, CREATE_PASS) +#define REGION_PASS(NAME, CLASS_NAME) #endif -REGION_PASS("null", NullPass()) +REGION_PASS("null", ::llvm::sandboxir::NullPass) +REGION_PASS("print-instruction-count", ::llvm::sandboxir::PrintInstructionCount) #undef REGION_PASS + +#ifndef FUNCTION_PASS_WITH_PARAMS +#define FUNCTION_PASS_WITH_PARAMS(NAME, CLASS_NAME) +#endif + +FUNCTION_PASS_WITH_PARAMS("bottom-up-vec", ::llvm::sandboxir::BottomUpVec) +FUNCTION_PASS_WITH_PARAMS("regions-from-metadata", ::llvm::sandboxir::RegionsFromMetadata) + +#undef FUNCTION_PASS_WITH_PARAMS diff --git a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/RegionsFromMetadata.cpp b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/RegionsFromMetadata.cpp new file mode 100644 index 0000000000000000000000000000000000000000..5887d5e8bc2683c66cfd1bc19918fa2e7ffc0be4 --- /dev/null +++ b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/RegionsFromMetadata.cpp @@ -0,0 +1,29 @@ +//===- RegionsFromMetadata.cpp - A helper to test RegionPasses -----------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Vectorize/SandboxVectorizer/Passes/RegionsFromMetadata.h" + +#include "llvm/SandboxIR/Region.h" +#include "llvm/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizerPassBuilder.h" + +namespace llvm::sandboxir { + +RegionsFromMetadata::RegionsFromMetadata(StringRef Pipeline) + : FunctionPass("regions-from-metadata"), + RPM("rpm", Pipeline, SandboxVectorizerPassBuilder::createRegionPass) {} + +bool RegionsFromMetadata::runOnFunction(Function &F) { + SmallVector> Regions = + sandboxir::Region::createRegionsFromMD(F); + for (auto &R : Regions) { + RPM.runOnRegion(*R); + } + return false; +} + +} // namespace llvm::sandboxir diff --git a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizer.cpp index ba4899cc624e9957a3bb61d7ffaa14c1cf7dc046..c68f9482e337dd5c8633cc0796289e6443e4897a 100644 --- a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizer.cpp @@ -9,14 +9,39 @@ #include "llvm/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizer.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/SandboxIR/Constant.h" -#include "llvm/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizerPassBuilder.h" using namespace llvm; #define SV_NAME "sandbox-vectorizer" #define DEBUG_TYPE SV_NAME -SandboxVectorizerPass::SandboxVectorizerPass() = default; +static cl::opt + PrintPassPipeline("sbvec-print-pass-pipeline", cl::init(false), cl::Hidden, + cl::desc("Prints the pass pipeline and returns.")); + +/// A magic string for the default pass pipeline. +static const char *DefaultPipelineMagicStr = "*"; + +static cl::opt UserDefinedPassPipeline( + "sbvec-passes", cl::init(DefaultPipelineMagicStr), cl::Hidden, + cl::desc("Comma-separated list of vectorizer passes. 
If not set " + "we run the predefined pipeline.")); + +SandboxVectorizerPass::SandboxVectorizerPass() : FPM("fpm") { + if (UserDefinedPassPipeline == DefaultPipelineMagicStr) { + // TODO: Add region passes to the default pipeline. + FPM.setPassPipeline( + "bottom-up-vec<>", + sandboxir::SandboxVectorizerPassBuilder::createFunctionPass); + } else { + // Create the user-defined pipeline. + FPM.setPassPipeline( + UserDefinedPassPipeline, + sandboxir::SandboxVectorizerPassBuilder::createFunctionPass); + } +} SandboxVectorizerPass::SandboxVectorizerPass(SandboxVectorizerPass &&) = default; @@ -37,6 +62,11 @@ PreservedAnalyses SandboxVectorizerPass::run(Function &F, } bool SandboxVectorizerPass::runImpl(Function &LLVMF) { + if (PrintPassPipeline) { + FPM.printPipeline(outs()); + return false; + } + // If the target claims to have no vector registers early return. if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true))) { LLVM_DEBUG(dbgs() << "SBVec: Target has no vector registers, return.\n"); @@ -52,5 +82,5 @@ bool SandboxVectorizerPass::runImpl(Function &LLVMF) { // Create SandboxIR for LLVMF and run BottomUpVec on it. sandboxir::Context Ctx(LLVMF.getContext()); sandboxir::Function &F = *Ctx.createFunction(&LLVMF); - return BottomUpVecPass.runOnFunction(F); + return FPM.runOnFunction(F); } diff --git a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizerPassBuilder.cpp b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizerPassBuilder.cpp new file mode 100644 index 0000000000000000000000000000000000000000..5ecf7b2ed0d258e1c39ac405a4d92016f53a12ac --- /dev/null +++ b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizerPassBuilder.cpp @@ -0,0 +1,32 @@ +#include "llvm/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizerPassBuilder.h" + +#include "llvm/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.h" +#include "llvm/Transforms/Vectorize/SandboxVectorizer/Passes/NullPass.h" +#include "llvm/Transforms/Vectorize/SandboxVectorizer/Passes/PrintInstructionCount.h" +#include "llvm/Transforms/Vectorize/SandboxVectorizer/Passes/RegionsFromMetadata.h" + +namespace llvm::sandboxir { + +std::unique_ptr +SandboxVectorizerPassBuilder::createRegionPass(StringRef Name, StringRef Args) { +#define REGION_PASS(NAME, CLASS_NAME) \ + if (Name == NAME) { \ + assert(Args.empty() && "Unexpected arguments for pass '" NAME "'."); \ + return std::make_unique(); \ + } +// TODO: Support region passes with params. 
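// The "bottom-up-vec<>" string handed to setPassPipeline above is a pipeline
// description: a comma-separated list of pass names, each optionally followed
// by <args>. A rough standalone sketch of that parse, assuming this exact
// grammar with no nesting (the real parsing is done by the sandbox pass
// manager, not this code):
#include <string>
#include <utility>
#include <vector>

// Splits "a<x>,b" into {("a", "x"), ("b", "")}.
std::vector<std::pair<std::string, std::string>>
parsePipeline(const std::string &Pipeline) {
  std::vector<std::pair<std::string, std::string>> Passes;
  std::size_t Pos = 0;
  while (Pos < Pipeline.size()) {
    std::size_t Comma = Pipeline.find(',', Pos);
    std::size_t End = Comma == std::string::npos ? Pipeline.size() : Comma;
    std::string Elem = Pipeline.substr(Pos, End - Pos);
    std::size_t LT = Elem.find('<');
    if (LT == std::string::npos)
      Passes.emplace_back(Elem, ""); // Bare pass name, no arguments.
    else // Strip "name<" and the trailing ">".
      Passes.emplace_back(Elem.substr(0, LT),
                          Elem.substr(LT + 1, Elem.size() - LT - 2));
    Pos = End + 1;
  }
  return Passes;
}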
+#include "Passes/PassRegistry.def" + return nullptr; +} + +std::unique_ptr +SandboxVectorizerPassBuilder::createFunctionPass(StringRef Name, + StringRef Args) { +#define FUNCTION_PASS_WITH_PARAMS(NAME, CLASS_NAME) \ + if (Name == NAME) \ + return std::make_unique(Args); +#include "Passes/PassRegistry.def" + return nullptr; +} + +} // namespace llvm::sandboxir diff --git a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/SeedCollector.cpp b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/SeedCollector.cpp index 20df9e344b61cda8dc7d5068ed9552dea81282c2..66fac080a7b7ccec2bc943f40c6b22464e847516 100644 --- a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/SeedCollector.cpp +++ b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/SeedCollector.cpp @@ -112,21 +112,23 @@ template void SeedContainer::insert(LoadInst *); template void SeedContainer::insert(StoreInst *); #ifndef NDEBUG -void SeedContainer::dump() const { +void SeedContainer::print(raw_ostream &OS) const { for (const auto &Pair : Bundles) { auto [I, Ty, Opc] = Pair.first; const auto &SeedsVec = Pair.second; std::string RefType = dyn_cast(I) ? "Load" : dyn_cast(I) ? "Store" : "Other"; - dbgs() << "[Inst=" << *I << " Ty=" << Ty << " " << RefType << "]\n"; + OS << "[Inst=" << *I << " Ty=" << Ty << " " << RefType << "]\n"; for (const auto &SeedPtr : SeedsVec) { - SeedPtr->dump(dbgs()); - dbgs() << "\n"; + SeedPtr->dump(OS); + OS << "\n"; } } - dbgs() << "\n"; + OS << "\n"; } + +LLVM_DUMP_METHOD void SeedContainer::dump() const { print(dbgs()); } #endif // NDEBUG } // namespace llvm::sandboxir diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index d9254b61ed6d7c0df2c2cff10b32336752736a4b..4150caf95c33a95baf459c1524216b2f23c592f2 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -1585,7 +1585,8 @@ VPInterleavedAccessInfo::VPInterleavedAccessInfo(VPlan &Plan, void VPSlotTracker::assignName(const VPValue *V) { assert(!VPValue2Name.contains(V) && "VPValue already has a name!"); auto *UV = V->getUnderlyingValue(); - if (!UV) { + auto *VPI = dyn_cast_or_null(V->getDefiningRecipe()); + if (!UV && !(VPI && !VPI->getName().empty())) { VPValue2Name[V] = (Twine("vp<%") + Twine(NextSlot) + ">").str(); NextSlot++; return; @@ -1594,10 +1595,15 @@ void VPSlotTracker::assignName(const VPValue *V) { // Use the name of the underlying Value, wrapped in "ir<>", and versioned by // appending ".Number" to the name if there are multiple uses. std::string Name; - raw_string_ostream S(Name); - UV->printAsOperand(S, false); + if (UV) { + raw_string_ostream S(Name); + UV->printAsOperand(S, false); + } else + Name = VPI->getName(); + assert(!Name.empty() && "Name cannot be empty."); - std::string BaseName = (Twine("ir<") + Name + Twine(">")).str(); + StringRef Prefix = UV ? "ir<" : "vp<%"; + std::string BaseName = (Twine(Prefix) + Name + Twine(">")).str(); // First assign the base name for V. const auto &[A, _] = VPValue2Name.insert({V, BaseName}); diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 72ff5a7ee1f23d71fa998d62f00a9189d7065deb..98cb78c60b68fa603793b26e9cc2673f44e453fe 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -982,6 +982,11 @@ public: /// Return the cost of this VPSingleDefRecipe. 
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override; + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + /// Print this VPSingleDefRecipe to dbgs() (for debugging). + LLVM_DUMP_METHOD void dump() const; +#endif }; /// Class to record LLVM IR flag for a recipe along with it. @@ -1460,6 +1465,9 @@ public: /// Returns true if this VPInstruction's operands are single scalars and the /// result is also a single scalar. bool isSingleScalar() const; + + /// Returns the symbolic name assigned to the VPInstruction. + StringRef getName() const { return Name; } }; /// A recipe to wrap on original IR instruction not to be modified during @@ -1769,6 +1777,16 @@ public: MayWriteToMemory(CI.mayWriteToMemory()), MayHaveSideEffects(CI.mayHaveSideEffects()) {} + VPWidenIntrinsicRecipe(Intrinsic::ID VectorIntrinsicID, + ArrayRef CallArguments, Type *Ty, + bool MayReadFromMemory, bool MayWriteToMemory, + bool MayHaveSideEffects, DebugLoc DL = {}) + : VPRecipeWithIRFlags(VPDef::VPWidenIntrinsicSC, CallArguments), + VectorIntrinsicID(VectorIntrinsicID), ResultTy(Ty), + MayReadFromMemory(MayReadFromMemory), + MayWriteToMemory(MayWriteToMemory), + MayHaveSideEffects(MayHaveSideEffects) {} + ~VPWidenIntrinsicRecipe() override = default; VPWidenIntrinsicRecipe *clone() override { @@ -1806,6 +1824,8 @@ public: void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override; #endif + + bool onlyFirstLaneUsed(const VPValue *Op) const override; }; /// A recipe for widening Call instructions using library calls. diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp index cc76726c81ff96c194b66969fc8783edaf44df02..72b6ff1e7ccd6b26f515a7fccb20ee8e363171a2 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp @@ -61,6 +61,8 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) { case Instruction::ICmp: case VPInstruction::ActiveLaneMask: return inferScalarType(R->getOperand(1)); + case VPInstruction::ExplicitVectorLength: + return Type::getIntNTy(Ctx, 32); case VPInstruction::FirstOrderRecurrenceSplice: case VPInstruction::Not: return SetResultTyFromOp(); diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 497a86c4beb7115e6027a11fc02c98e1e1a348a9..43d363d75be270f27200c4031a0a846cb40cb437 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -86,7 +86,6 @@ bool VPRecipeBase::mayWriteToMemory() const { return !cast(this) ->getCalledScalarFunction() ->onlyReadsMemory(); - case VPWidenIntrinsicSC: return cast(this)->mayWriteToMemory(); case VPBranchOnMaskSC: case VPScalarIVStepsSC: @@ -95,6 +94,7 @@ bool VPRecipeBase::mayWriteToMemory() const { case VPBlendSC: case VPReductionEVLSC: case VPReductionSC: + case VPVectorPointerSC: case VPWidenCanonicalIVSC: case VPWidenCastSC: case VPWidenGEPSC: @@ -169,6 +169,7 @@ bool VPRecipeBase::mayReadFromMemory() const { case VPBlendSC: case VPReductionEVLSC: case VPReductionSC: + case VPVectorPointerSC: case VPWidenCanonicalIVSC: case VPWidenCastSC: case VPWidenGEPSC: @@ -222,6 +223,7 @@ bool VPRecipeBase::mayHaveSideEffects() const { case VPReductionEVLSC: case VPReductionSC: case VPScalarIVStepsSC: + case VPVectorPointerSC: case VPWidenCanonicalIVSC: case VPWidenCastSC: case VPWidenGEPSC: @@ -417,6 +419,10 @@ FastMathFlags VPRecipeWithIRFlags::getFastMathFlags() const { 
   return Res;
 }
 
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void VPSingleDefRecipe::dump() const { VPDef::dump(); }
+#endif
+
 template <unsigned PartOpIdx>
 VPValue *
 VPUnrollPartAccessor<PartOpIdx>::getUnrollPartOperand(VPUser &U) const {
@@ -1271,6 +1277,14 @@ StringRef VPWidenIntrinsicRecipe::getIntrinsicName() const {
   return Intrinsic::getBaseName(VectorIntrinsicID);
 }
 
+bool VPWidenIntrinsicRecipe::onlyFirstLaneUsed(const VPValue *Op) const {
+  assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
+  // Vector predication intrinsics only demand the first lane of the last
+  // operand (the EVL operand).
+  return VPIntrinsic::isVPIntrinsic(VectorIntrinsicID) &&
+         Op == getOperand(getNumOperands() - 1);
+}
+
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 void VPWidenIntrinsicRecipe::print(raw_ostream &O, const Twine &Indent,
                                    VPSlotTracker &SlotTracker) const {
@@ -3900,7 +3914,33 @@ void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent,
 
 InstructionCost VPInterleaveRecipe::computeCost(ElementCount VF,
                                                 VPCostContext &Ctx) const {
-  return Ctx.getLegacyCost(IG->getInsertPos(), VF);
+  Instruction *I = getInsertPos();
+  Type *ValTy = Ctx.Types.inferScalarType(
+      getNumDefinedValues() > 0 ? getVPValue(0) : getStoredValues()[0]);
+  auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
+  unsigned AS = getLoadStoreAddressSpace(I);
+  enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+
+  unsigned InterleaveFactor = IG->getFactor();
+  auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
+
+  // Holds the indices of existing members in the interleaved group.
+  SmallVector<unsigned, 4> Indices;
+  for (unsigned IF = 0; IF < InterleaveFactor; IF++)
+    if (IG->getMember(IF))
+      Indices.push_back(IF);
+
+  // Calculate the cost of the whole interleaved group.
+  InstructionCost Cost = Ctx.TTI.getInterleavedMemoryOpCost(
+      I->getOpcode(), WideVecTy, IG->getFactor(), Indices, IG->getAlign(), AS,
+      CostKind, getMask(), NeedsMaskForGaps);
+
+  if (!IG->isReverse())
+    return Cost;
+
+  return Cost + IG->getNumMembers() *
+                    Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse,
+                                           VectorTy, std::nullopt, CostKind, 0);
 }
 
 void VPCanonicalIVPHIRecipe::execute(VPTransformState &State) {
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 662a96ea5ccbd42b9953769177ddb297805bc366..443aa7686ea3c473cc5235deb76a9ccf19408d12 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1824,6 +1824,7 @@ void VPlanTransforms::addActiveLaneMask(
 /// Replace recipes with their EVL variants.
 static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
   SmallVector<VPValue *> HeaderMasks = collectAllHeaderMasks(Plan);
+  VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType());
   for (VPValue *HeaderMask : collectAllHeaderMasks(Plan)) {
     for (VPUser *U : collectUsersRecursively(HeaderMask)) {
       auto *CurRecipe = dyn_cast<VPRecipeBase>(U);
@@ -1855,6 +1856,14 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
               VPValue *NewMask = GetNewMask(Red->getCondOp());
               return new VPReductionEVLRecipe(*Red, EVL, NewMask);
             })
+            .Case<VPWidenSelectRecipe>([&](VPWidenSelectRecipe *Sel) {
+              SmallVector<VPValue *> Ops(Sel->operands());
+              Ops.push_back(&EVL);
+              return new VPWidenIntrinsicRecipe(Intrinsic::vp_select, Ops,
+                                                TypeInfo.inferScalarType(Sel),
+                                                false, false, false);
+            })
+
             .Default([&](VPRecipeBase *R) { return nullptr; });
 
     if (!NewRecipe)
@@ -2116,8 +2125,9 @@ void VPlanTransforms::createInterleaveGroups(
       // zero.
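// The hunk just below swaps getScalarSizeInBits() / 8 for
// DataLayout::getTypeAllocSize() when computing the insert position's byte
// offset within its interleave group. The difference matters for padded
// types: members sit one *alloc* size apart in memory, not one store size
// apart. A toy illustration with plain integers standing in for the
// DataLayout queries (hypothetical helper, not LLVM API):
#include <cstdint>
#include <vector>

// Member i of an interleave group over elements occupying AllocSize bytes in
// memory starts AllocSize * i bytes past member 0. E.g. x86_fp80 on x86-64
// has a 10-byte store size but a 16-byte alloc size, so bit-size / 8 would
// misplace every member after the first.
std::vector<uint64_t> memberOffsets(uint64_t AllocSize, unsigned Factor) {
  std::vector<uint64_t> Offsets;
  for (unsigned I = 0; I != Factor; ++I)
    Offsets.push_back(AllocSize * I);
  return Offsets;
}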
assert(IG->getIndex(IRInsertPos) != 0 && "index of insert position shouldn't be zero"); + auto &DL = IRInsertPos->getDataLayout(); APInt Offset(32, - getLoadStoreType(IRInsertPos)->getScalarSizeInBits() / 8 * + DL.getTypeAllocSize(getLoadStoreType(IRInsertPos)) * IG->getIndex(IRInsertPos), /*IsSigned=*/true); VPValue *OffsetVPV = Plan.getOrAddLiveIn( diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp index 983760451bf182508e10e7cf0a75173382b30a56..0011b6c8d2f0c131cb542228d367deb08f2bef9a 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp @@ -141,6 +141,10 @@ bool VPlanVerifier::verifyEVLRecipe(const VPInstruction &EVL) const { }; for (const VPUser *U : EVL.users()) { if (!TypeSwitch(U) + .Case( + [&](const VPWidenIntrinsicRecipe *S) { + return VerifyEVLUse(*S, S->getNumOperands() - 1); + }) .Case([&](const VPWidenStoreEVLRecipe *S) { return VerifyEVLUse(*S, 2); }) diff --git a/llvm/test/Analysis/CostModel/RISCV/arith-fp.ll b/llvm/test/Analysis/CostModel/RISCV/arith-fp.ll index 91ad0bce3e536ac16b0e52d0388c771433557d4d..eb094b06ccb1683805224cec27e4e0cbee8fd19e 100644 --- a/llvm/test/Analysis/CostModel/RISCV/arith-fp.ll +++ b/llvm/test/Analysis/CostModel/RISCV/arith-fp.ll @@ -9,11 +9,11 @@ define void @fadd() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %BF16 = fadd bfloat undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F32 = fadd float undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F64 = fadd double undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1BF16 = fadd <1 x bfloat> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2BF16 = fadd <2 x bfloat> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4BF16 = fadd <4 x bfloat> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8BF16 = fadd <8 x bfloat> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16BF16 = fadd <16 x bfloat> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V1BF16 = fadd <1 x bfloat> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2BF16 = fadd <2 x bfloat> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4BF16 = fadd <4 x bfloat> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8BF16 = fadd <8 x bfloat> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16BF16 = fadd <16 x bfloat> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %NXV1BF16 = fadd undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %NXV2BF16 = fadd undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %NXV4BF16 = fadd undef, undef @@ -137,11 +137,11 @@ define void @fsub() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F16 = fsub half undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F32 = fsub float undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F64 = fsub double undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1BF16 = 
fsub <1 x bfloat> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2BF16 = fsub <2 x bfloat> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4BF16 = fsub <4 x bfloat> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8BF16 = fsub <8 x bfloat> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16BF16 = fsub <16 x bfloat> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V1BF16 = fsub <1 x bfloat> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2BF16 = fsub <2 x bfloat> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4BF16 = fsub <4 x bfloat> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8BF16 = fsub <8 x bfloat> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16BF16 = fsub <16 x bfloat> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %NXV1BF16 = fsub undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %NXV2BF16 = fsub undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %NXV4BF16 = fsub undef, undef @@ -265,11 +265,11 @@ define void @fmul() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %BF16 = fmul bfloat undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F32 = fmul float undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F64 = fmul double undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1BF16 = fmul <1 x bfloat> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2BF16 = fmul <2 x bfloat> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4BF16 = fmul <4 x bfloat> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8BF16 = fmul <8 x bfloat> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16BF16 = fmul <16 x bfloat> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V1BF16 = fmul <1 x bfloat> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2BF16 = fmul <2 x bfloat> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4BF16 = fmul <4 x bfloat> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8BF16 = fmul <8 x bfloat> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16BF16 = fmul <16 x bfloat> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %NXV1BF16 = fmul undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %NXV2BF16 = fmul undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %NXV4BF16 = fmul undef, undef @@ -393,11 +393,11 @@ define void @fdiv() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %BF16 = fdiv bfloat undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F32 = fdiv float undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F64 = fdiv double undef, undef -; CHECK-NEXT: Cost 
Model: Found an estimated cost of 2 for instruction: %V1BF16 = fdiv <1 x bfloat> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2BF16 = fdiv <2 x bfloat> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4BF16 = fdiv <4 x bfloat> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8BF16 = fdiv <8 x bfloat> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16BF16 = fdiv <16 x bfloat> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V1BF16 = fdiv <1 x bfloat> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2BF16 = fdiv <2 x bfloat> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4BF16 = fdiv <4 x bfloat> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8BF16 = fdiv <8 x bfloat> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16BF16 = fdiv <16 x bfloat> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %NXV1BF16 = fdiv undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %NXV2BF16 = fdiv undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %NXV4BF16 = fdiv undef, undef @@ -526,11 +526,11 @@ define void @frem() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4BF16 = frem <4 x bfloat> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8BF16 = frem <8 x bfloat> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %V16BF16 = frem <16 x bfloat> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV1BF16 = frem undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV2BF16 = frem undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV4BF16 = frem undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV8BF16 = frem undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV16BF16 = frem undef, undef +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %NXV1BF16 = frem undef, undef +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %NXV2BF16 = frem undef, undef +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %NXV4BF16 = frem undef, undef +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %NXV8BF16 = frem undef, undef +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %NXV16BF16 = frem undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F32 = frem <1 x float> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2F32 = frem <2 x float> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4F32 = frem <4 x float> undef, undef @@ -593,37 +593,21 @@ define void @frem() { } define void @frem_f16() { -; ZVFH-LABEL: 'frem_f16' -; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F16 = frem half undef, undef -; ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F16 = frem <1 x half> undef, undef -; ZVFH-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2F16 = frem <2 x half> undef, undef -; ZVFH-NEXT: Cost Model: 
Found an estimated cost of 12 for instruction: %V4F16 = frem <4 x half> undef, undef -; ZVFH-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8F16 = frem <8 x half> undef, undef -; ZVFH-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %V16F16 = frem <16 x half> undef, undef -; ZVFH-NEXT: Cost Model: Found an estimated cost of 127 for instruction: %V32F16 = frem <32 x half> undef, undef -; ZVFH-NEXT: Cost Model: Invalid cost for instruction: %NXV1F16 = frem undef, undef -; ZVFH-NEXT: Cost Model: Invalid cost for instruction: %NXV2F16 = frem undef, undef -; ZVFH-NEXT: Cost Model: Invalid cost for instruction: %NXV4F16 = frem undef, undef -; ZVFH-NEXT: Cost Model: Invalid cost for instruction: %NXV8F16 = frem undef, undef -; ZVFH-NEXT: Cost Model: Invalid cost for instruction: %NXV16F16 = frem undef, undef -; ZVFH-NEXT: Cost Model: Invalid cost for instruction: %NXV32F16 = frem undef, undef -; ZVFH-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void -; -; ZVFHMIN-LABEL: 'frem_f16' -; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F16 = frem half undef, undef -; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F16 = frem <1 x half> undef, undef -; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2F16 = frem <2 x half> undef, undef -; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4F16 = frem <4 x half> undef, undef -; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8F16 = frem <8 x half> undef, undef -; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %V16F16 = frem <16 x half> undef, undef -; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 127 for instruction: %V32F16 = frem <32 x half> undef, undef -; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV1F16 = frem undef, undef -; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV2F16 = frem undef, undef -; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV4F16 = frem undef, undef -; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV8F16 = frem undef, undef -; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV16F16 = frem undef, undef -; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV32F16 = frem undef, undef -; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; CHECK-LABEL: 'frem_f16' +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F16 = frem half undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F16 = frem <1 x half> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2F16 = frem <2 x half> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4F16 = frem <4 x half> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8F16 = frem <8 x half> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %V16F16 = frem <16 x half> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 127 for instruction: %V32F16 = frem <32 x half> undef, undef +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %NXV1F16 = frem undef, undef +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %NXV2F16 = frem undef, undef +; CHECK-NEXT: Cost Model: 
Invalid cost for instruction: %NXV4F16 = frem undef, undef +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %NXV8F16 = frem undef, undef +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %NXV16F16 = frem undef, undef +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %NXV32F16 = frem undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %F16 = frem half undef, undef @@ -889,11 +873,11 @@ define void @fma() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %BF16 = call bfloat @llvm.fma.bf16(bfloat undef, bfloat undef, bfloat undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %F32 = call float @llvm.fma.f32(float undef, float undef, float undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %F64 = call double @llvm.fma.f64(double undef, double undef, double undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V1BF16 = call <1 x bfloat> @llvm.fma.v1bf16(<1 x bfloat> undef, <1 x bfloat> undef, <1 x bfloat> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2BF16 = call <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef, <2 x bfloat> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4BF16 = call <4 x bfloat> @llvm.fma.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef, <4 x bfloat> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8BF16 = call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> undef, <8 x bfloat> undef, <8 x bfloat> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V16BF16 = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef, <16 x bfloat> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1BF16 = call <1 x bfloat> @llvm.fma.v1bf16(<1 x bfloat> undef, <1 x bfloat> undef, <1 x bfloat> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2BF16 = call <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef, <2 x bfloat> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4BF16 = call <4 x bfloat> @llvm.fma.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef, <4 x bfloat> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8BF16 = call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> undef, <8 x bfloat> undef, <8 x bfloat> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16BF16 = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef, <16 x bfloat> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV1BF16 = call @llvm.fma.nxv1bf16( undef, undef, undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV2BF16 = call @llvm.fma.nxv2bf16( undef, undef, undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV4BF16 = call @llvm.fma.nxv4bf16( undef, undef, undef) @@ -1017,10 +1001,10 @@ define void @fmuladd() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %1 = call bfloat @llvm.fmuladd.bf16(bfloat undef, bfloat undef, bfloat undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %2 = call float @llvm.fmuladd.f32(float undef, float undef, float undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %3 = call double 
@llvm.fmuladd.f64(double undef, double undef, double undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %4 = call <2 x bfloat> @llvm.fmuladd.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef, <2 x bfloat> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %5 = call <4 x bfloat> @llvm.fmuladd.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef, <4 x bfloat> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %6 = call <8 x bfloat> @llvm.fmuladd.v8bf16(<8 x bfloat> undef, <8 x bfloat> undef, <8 x bfloat> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %7 = call <16 x bfloat> @llvm.fmuladd.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef, <16 x bfloat> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %4 = call <2 x bfloat> @llvm.fmuladd.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef, <2 x bfloat> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %5 = call <4 x bfloat> @llvm.fmuladd.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef, <4 x bfloat> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %6 = call <8 x bfloat> @llvm.fmuladd.v8bf16(<8 x bfloat> undef, <8 x bfloat> undef, <8 x bfloat> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %7 = call <16 x bfloat> @llvm.fmuladd.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef, <16 x bfloat> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %8 = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> undef, <2 x float> undef, <2 x float> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %9 = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %10 = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef) diff --git a/llvm/test/Analysis/CostModel/RISCV/cast.ll b/llvm/test/Analysis/CostModel/RISCV/cast.ll index 1f0bc5980c7fec5591d7ef2837a8ac603ec67b4f..bfa7d172f5229341e4ca333fe7733d655559df4b 100644 --- a/llvm/test/Analysis/CostModel/RISCV/cast.ll +++ b/llvm/test/Analysis/CostModel/RISCV/cast.ll @@ -660,16 +660,16 @@ define void @sext() { %v2i1_v2i32 = sext <2 x i1> undef to <2 x i32> %v2i1_v2i64 = sext <2 x i1> undef to <2 x i64> - %vp_v2i8_v2i16 = call <2 x i16> @llvm.vp.sext.v2i8.v2i16(<2 x i8> undef, <2 x i1> undef, i32 undef) - %vp_v2i8_v2i32 = call <2 x i32> @llvm.vp.sext.v2i8.v2i32(<2 x i8> undef, <2 x i1> undef, i32 undef) - %vp_v2i8_v2i64 = call <2 x i64> @llvm.vp.sext.v2i8.v2i64(<2 x i8> undef, <2 x i1> undef, i32 undef) - %vp_v2i16_v2i32 = call <2 x i32> @llvm.vp.sext.v2i16.v2i32(<2 x i16> undef, <2 x i1> undef, i32 undef) - %vp_v2i16_v2i64 = call <2 x i64> @llvm.vp.sext.v2i16.v2i64(<2 x i16> undef, <2 x i1> undef, i32 undef) - %vp_v2i32_v2i64 = call <2 x i64> @llvm.vp.sext.v2i32.v2i64(<2 x i32> undef, <2 x i1> undef, i32 undef) - %vp_v2i1_v2i8 = call <2 x i8> @llvm.vp.sext.v2i1.v2i8(<2 x i1> undef, <2 x i1> undef, i32 undef) - %vp_v2i1_v2i16 = call <2 x i16> @llvm.vp.sext.v2i1.v2i16(<2 x i1> undef, <2 x i1> undef, i32 undef) - %vp_v2i1_v2i32 = call <2 x i32> @llvm.vp.sext.v2i1.v2i32(<2 x i1> undef, <2 x i1> undef, i32 undef) - %vp_v2i1_v2i64 = call <2 x i64> @llvm.vp.sext.v2i1.v2i64(<2 x i1> undef, <2 x i1> undef, i32 undef) + %vp_v2i8_v2i16 = call <2 x i16> @llvm.vp.sext.v2i16.v2i8(<2 x i8> undef, <2 x i1> undef, i32 
undef) + %vp_v2i8_v2i32 = call <2 x i32> @llvm.vp.sext.v2i32.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef) + %vp_v2i8_v2i64 = call <2 x i64> @llvm.vp.sext.v2i64.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef) + %vp_v2i16_v2i32 = call <2 x i32> @llvm.vp.sext.v2i32.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef) + %vp_v2i16_v2i64 = call <2 x i64> @llvm.vp.sext.v2i64.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef) + %vp_v2i32_v2i64 = call <2 x i64> @llvm.vp.sext.v2i64.v2i32(<2 x i32> undef, <2 x i1> undef, i32 undef) + %vp_v2i1_v2i8 = call <2 x i8> @llvm.vp.sext.v2i8.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef) + %vp_v2i1_v2i16 = call <2 x i16> @llvm.vp.sext.v2i16.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef) + %vp_v2i1_v2i32 = call <2 x i32> @llvm.vp.sext.v2i32.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef) + %vp_v2i1_v2i64 = call <2 x i64> @llvm.vp.sext.v2i64.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef) %v4i8_v4i16 = sext <4 x i8> undef to <4 x i16> %v4i8_v4i32 = sext <4 x i8> undef to <4 x i32> @@ -682,16 +682,16 @@ define void @sext() { %v4i1_v4i32 = sext <4 x i1> undef to <4 x i32> %v4i1_v4i64 = sext <4 x i1> undef to <4 x i64> - %vp_v4i8_v4i16 = call <4 x i16> @llvm.vp.sext.v4i8.v4i16(<4 x i8> undef, <4 x i1> undef, i32 undef) - %vp_v4i8_v4i32 = call <4 x i32> @llvm.vp.sext.v4i8.v4i32(<4 x i8> undef, <4 x i1> undef, i32 undef) - %vp_v4i8_v4i64 = call <4 x i64> @llvm.vp.sext.v4i8.v4i64(<4 x i8> undef, <4 x i1> undef, i32 undef) - %vp_v4i16_v4i32 = call <4 x i32> @llvm.vp.sext.v4i16.v4i32(<4 x i16> undef, <4 x i1> undef, i32 undef) - %vp_v4i16_v4i64 = call <4 x i64> @llvm.vp.sext.v4i16.v4i64(<4 x i16> undef, <4 x i1> undef, i32 undef) - %vp_v4i32_v4i64 = call <4 x i64> @llvm.vp.sext.v4i32.v4i64(<4 x i32> undef, <4 x i1> undef, i32 undef) - %vp_v4i1_v4i8 = call <4 x i8> @llvm.vp.sext.v4i1.v4i8(<4 x i1> undef, <4 x i1> undef, i32 undef) - %vp_v4i1_v4i16 = call <4 x i16> @llvm.vp.sext.v4i1.v4i16(<4 x i1> undef, <4 x i1> undef, i32 undef) - %vp_v4i1_v4i32 = call <4 x i32> @llvm.vp.sext.v4i1.v4i32(<4 x i1> undef, <4 x i1> undef, i32 undef) - %vp_v4i1_v4i64 = call <4 x i64> @llvm.vp.sext.v4i1.v4i64(<4 x i1> undef, <4 x i1> undef, i32 undef) + %vp_v4i8_v4i16 = call <4 x i16> @llvm.vp.sext.v4i16.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef) + %vp_v4i8_v4i32 = call <4 x i32> @llvm.vp.sext.v4i32.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef) + %vp_v4i8_v4i64 = call <4 x i64> @llvm.vp.sext.v4i64.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef) + %vp_v4i16_v4i32 = call <4 x i32> @llvm.vp.sext.v4i32.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef) + %vp_v4i16_v4i64 = call <4 x i64> @llvm.vp.sext.v4i64.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef) + %vp_v4i32_v4i64 = call <4 x i64> @llvm.vp.sext.v4i64.v4i32(<4 x i32> undef, <4 x i1> undef, i32 undef) + %vp_v4i1_v4i8 = call <4 x i8> @llvm.vp.sext.v4i8.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef) + %vp_v4i1_v4i16 = call <4 x i16> @llvm.vp.sext.v4i16.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef) + %vp_v4i1_v4i32 = call <4 x i32> @llvm.vp.sext.v4i32.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef) + %vp_v4i1_v4i64 = call <4 x i64> @llvm.vp.sext.v4i64.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef) %v8i8_v8i16 = sext <8 x i8> undef to <8 x i16> %v8i8_v8i32 = sext <8 x i8> undef to <8 x i32> @@ -704,16 +704,16 @@ define void @sext() { %v8i1_v8i32 = sext <8 x i1> undef to <8 x i32> %v8i1_v8i64 = sext <8 x i1> undef to <8 x i64> - %vp_v8i8_v8i16 = call <8 x i16> @llvm.vp.sext.v8i8.v8i16(<8 x i8> undef, <8 x i1> undef, i32 undef) - 
%vp_v8i8_v8i32 = call <8 x i32> @llvm.vp.sext.v8i8.v8i32(<8 x i8> undef, <8 x i1> undef, i32 undef) - %vp_v8i8_v8i64 = call <8 x i64> @llvm.vp.sext.v8i8.v8i64(<8 x i8> undef, <8 x i1> undef, i32 undef) - %vp_v8i16_v8i32 = call <8 x i32> @llvm.vp.sext.v8i16.v8i32(<8 x i16> undef, <8 x i1> undef, i32 undef) - %vp_v8i16_v8i64 = call <8 x i64> @llvm.vp.sext.v8i16.v8i64(<8 x i16> undef, <8 x i1> undef, i32 undef) - %vp_v8i32_v8i64 = call <8 x i64> @llvm.vp.sext.v8i32.v8i64(<8 x i32> undef, <8 x i1> undef, i32 undef) - %vp_v8i1_v8i8 = call <8 x i8> @llvm.vp.sext.v8i1.v8i8(<8 x i1> undef, <8 x i1> undef, i32 undef) - %vp_v8i1_v8i16 = call <8 x i16> @llvm.vp.sext.v8i1.v8i16(<8 x i1> undef, <8 x i1> undef, i32 undef) - %vp_v8i1_v8i32 = call <8 x i32> @llvm.vp.sext.v8i1.v8i32(<8 x i1> undef, <8 x i1> undef, i32 undef) - %vp_v8i1_v8i64 = call <8 x i64> @llvm.vp.sext.v8i1.v8i64(<8 x i1> undef, <8 x i1> undef, i32 undef) + %vp_v8i8_v8i16 = call <8 x i16> @llvm.vp.sext.v8i16.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef) + %vp_v8i8_v8i32 = call <8 x i32> @llvm.vp.sext.v8i32.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef) + %vp_v8i8_v8i64 = call <8 x i64> @llvm.vp.sext.v8i64.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef) + %vp_v8i16_v8i32 = call <8 x i32> @llvm.vp.sext.v8i32.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef) + %vp_v8i16_v8i64 = call <8 x i64> @llvm.vp.sext.v8i64.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef) + %vp_v8i32_v8i64 = call <8 x i64> @llvm.vp.sext.v8i64.v8i32(<8 x i32> undef, <8 x i1> undef, i32 undef) + %vp_v8i1_v8i8 = call <8 x i8> @llvm.vp.sext.v8i8.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef) + %vp_v8i1_v8i16 = call <8 x i16> @llvm.vp.sext.v8i16.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef) + %vp_v8i1_v8i32 = call <8 x i32> @llvm.vp.sext.v8i32.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef) + %vp_v8i1_v8i64 = call <8 x i64> @llvm.vp.sext.v8i64.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef) %v16i8_v16i16 = sext <16 x i8> undef to <16 x i16> %v16i8_v16i32 = sext <16 x i8> undef to <16 x i32> @@ -726,16 +726,16 @@ define void @sext() { %v16i1_v16i32 = sext <16 x i1> undef to <16 x i32> %v16i1_v16i64 = sext <16 x i1> undef to <16 x i64> - %vp_v16i8_v16i16 = call <16 x i16> @llvm.vp.sext.v16i8.v16i16(<16 x i8> undef, <16 x i1> undef, i32 undef) - %vp_v16i8_v16i32 = call <16 x i32> @llvm.vp.sext.v16i8.v16i32(<16 x i8> undef, <16 x i1> undef, i32 undef) - %vp_v16i8_v16i64 = call <16 x i64> @llvm.vp.sext.v16i8.v16i64(<16 x i8> undef, <16 x i1> undef, i32 undef) - %vp_v16i16_v16i32 = call <16 x i32> @llvm.vp.sext.v16i16.v16i32(<16 x i16> undef, <16 x i1> undef, i32 undef) - %vp_v16i16_v16i64 = call <16 x i64> @llvm.vp.sext.v16i16.v16i64(<16 x i16> undef, <16 x i1> undef, i32 undef) - %vp_v16i32_v16i64 = call <16 x i64> @llvm.vp.sext.v16i32.v16i64(<16 x i32> undef, <16 x i1> undef, i32 undef) - %vp_v16i1_v16i8 = call <16 x i8> @llvm.vp.sext.v16i1.v16i8(<16 x i1> undef, <16 x i1> undef, i32 undef) - %vp_v16i1_v16i16 = call <16 x i16> @llvm.vp.sext.v16i1.v16i16(<16 x i1> undef, <16 x i1> undef, i32 undef) - %vp_v16i1_v16i32 = call <16 x i32> @llvm.vp.sext.v16i1.v16i32(<16 x i1> undef, <16 x i1> undef, i32 undef) - %vp_v16i1_v16i64 = call <16 x i64> @llvm.vp.sext.v16i1.v16i64(<16 x i1> undef, <16 x i1> undef, i32 undef) + %vp_v16i8_v16i16 = call <16 x i16> @llvm.vp.sext.v16i16.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef) + %vp_v16i8_v16i32 = call <16 x i32> @llvm.vp.sext.v16i32.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef) + %vp_v16i8_v16i64 = call <16 x i64> 
@llvm.vp.sext.v16i64.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef) + %vp_v16i16_v16i32 = call <16 x i32> @llvm.vp.sext.v16i32.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef) + %vp_v16i16_v16i64 = call <16 x i64> @llvm.vp.sext.v16i64.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef) + %vp_v16i32_v16i64 = call <16 x i64> @llvm.vp.sext.v16i64.v16i32(<16 x i32> undef, <16 x i1> undef, i32 undef) + %vp_v16i1_v16i8 = call <16 x i8> @llvm.vp.sext.v16i8.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef) + %vp_v16i1_v16i16 = call <16 x i16> @llvm.vp.sext.v16i16.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef) + %vp_v16i1_v16i32 = call <16 x i32> @llvm.vp.sext.v16i32.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef) + %vp_v16i1_v16i64 = call <16 x i64> @llvm.vp.sext.v16i64.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef) %v32i8_v32i16 = sext <32 x i8> undef to <32 x i16> %v32i8_v32i32 = sext <32 x i8> undef to <32 x i32> @@ -748,16 +748,16 @@ define void @sext() { %v32i1_v32i32 = sext <32 x i1> undef to <32 x i32> %v32i1_v32i64 = sext <32 x i1> undef to <32 x i64> - %vp_v32i8_v32i16 = call <32 x i16> @llvm.vp.sext.v32i8.v32i16(<32 x i8> undef, <32 x i1> undef, i32 undef) - %vp_v32i8_v32i32 = call <32 x i32> @llvm.vp.sext.v32i8.v32i32(<32 x i8> undef, <32 x i1> undef, i32 undef) - %vp_v32i8_v32i64 = call <32 x i64> @llvm.vp.sext.v32i8.v32i64(<32 x i8> undef, <32 x i1> undef, i32 undef) - %vp_v32i16_v32i32 = call <32 x i32> @llvm.vp.sext.v32i16.v32i32(<32 x i16> undef, <32 x i1> undef, i32 undef) - %vp_v32i16_v32i64 = call <32 x i64> @llvm.vp.sext.v32i16.v32i64(<32 x i16> undef, <32 x i1> undef, i32 undef) - %vp_v32i32_v32i64 = call <32 x i64> @llvm.vp.sext.v32i32.v32i64(<32 x i32> undef, <32 x i1> undef, i32 undef) - %vp_v32i1_v32i8 = call <32 x i8> @llvm.vp.sext.v32i1.v32i8(<32 x i1> undef, <32 x i1> undef, i32 undef) - %vp_v32i1_v32i16 = call <32 x i16> @llvm.vp.sext.v32i1.v32i16(<32 x i1> undef, <32 x i1> undef, i32 undef) - %vp_v32i1_v32i32 = call <32 x i32> @llvm.vp.sext.v32i1.v32i32(<32 x i1> undef, <32 x i1> undef, i32 undef) - %vp_v32i1_v32i64 = call <32 x i64> @llvm.vp.sext.v32i1.v32i64(<32 x i1> undef, <32 x i1> undef, i32 undef) + %vp_v32i8_v32i16 = call <32 x i16> @llvm.vp.sext.v32i16.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef) + %vp_v32i8_v32i32 = call <32 x i32> @llvm.vp.sext.v32i32.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef) + %vp_v32i8_v32i64 = call <32 x i64> @llvm.vp.sext.v32i64.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef) + %vp_v32i16_v32i32 = call <32 x i32> @llvm.vp.sext.v32i32.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef) + %vp_v32i16_v32i64 = call <32 x i64> @llvm.vp.sext.v32i64.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef) + %vp_v32i32_v32i64 = call <32 x i64> @llvm.vp.sext.v32i64.v32i32(<32 x i32> undef, <32 x i1> undef, i32 undef) + %vp_v32i1_v32i8 = call <32 x i8> @llvm.vp.sext.v32i8.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef) + %vp_v32i1_v32i16 = call <32 x i16> @llvm.vp.sext.v32i16.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef) + %vp_v32i1_v32i32 = call <32 x i32> @llvm.vp.sext.v32i32.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef) + %vp_v32i1_v32i64 = call <32 x i64> @llvm.vp.sext.v32i64.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef) %v64i8_v64i16 = sext <64 x i8> undef to <64 x i16> %v64i8_v64i32 = sext <64 x i8> undef to <64 x i32> @@ -770,16 +770,16 @@ define void @sext() { %v64i1_v64i32 = sext <64 x i1> undef to <64 x i32> %v64i1_v64i64 = sext <64 x i1> undef to <64 x i64> - %vp_v64i8_v64i16 = call 
<64 x i16> @llvm.vp.sext.v64i8.v64i16(<64 x i8> undef, <64 x i1> undef, i32 undef) - %vp_v64i8_v64i32 = call <64 x i32> @llvm.vp.sext.v64i8.v64i32(<64 x i8> undef, <64 x i1> undef, i32 undef) - %vp_v64i8_v64i64 = call <64 x i64> @llvm.vp.sext.v64i8.v64i64(<64 x i8> undef, <64 x i1> undef, i32 undef) - %vp_v64i16_v64i32 = call <64 x i32> @llvm.vp.sext.v64i16.v64i32(<64 x i16> undef, <64 x i1> undef, i32 undef) - %vp_v64i16_v64i64 = call <64 x i64> @llvm.vp.sext.v64i16.v64i64(<64 x i16> undef, <64 x i1> undef, i32 undef) - %vp_v64i32_v64i64 = call <64 x i64> @llvm.vp.sext.v64i32.v64i64(<64 x i32> undef, <64 x i1> undef, i32 undef) - %vp_v64i1_v64i8 = call <64 x i8> @llvm.vp.sext.v64i1.v64i8(<64 x i1> undef, <64 x i1> undef, i32 undef) - %vp_v64i1_v64i16 = call <64 x i16> @llvm.vp.sext.v64i1.v64i16(<64 x i1> undef, <64 x i1> undef, i32 undef) - %vp_v64i1_v64i32 = call <64 x i32> @llvm.vp.sext.v64i1.v64i32(<64 x i1> undef, <64 x i1> undef, i32 undef) - %vp_v64i1_v64i64 = call <64 x i64> @llvm.vp.sext.v64i1.v64i64(<64 x i1> undef, <64 x i1> undef, i32 undef) + %vp_v64i8_v64i16 = call <64 x i16> @llvm.vp.sext.v64i16.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef) + %vp_v64i8_v64i32 = call <64 x i32> @llvm.vp.sext.v64i32.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef) + %vp_v64i8_v64i64 = call <64 x i64> @llvm.vp.sext.v64i64.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef) + %vp_v64i16_v64i32 = call <64 x i32> @llvm.vp.sext.v64i32.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef) + %vp_v64i16_v64i64 = call <64 x i64> @llvm.vp.sext.v64i64.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef) + %vp_v64i32_v64i64 = call <64 x i64> @llvm.vp.sext.v64i64.v64i32(<64 x i32> undef, <64 x i1> undef, i32 undef) + %vp_v64i1_v64i8 = call <64 x i8> @llvm.vp.sext.v64i8.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef) + %vp_v64i1_v64i16 = call <64 x i16> @llvm.vp.sext.v64i16.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef) + %vp_v64i1_v64i32 = call <64 x i32> @llvm.vp.sext.v64i32.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef) + %vp_v64i1_v64i64 = call <64 x i64> @llvm.vp.sext.v64i64.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef) %v128i8_v128i16 = sext <128 x i8> undef to <128 x i16> %v128i8_v128i32 = sext <128 x i8> undef to <128 x i32> @@ -792,16 +792,16 @@ define void @sext() { %v128i1_v128i32 = sext <128 x i1> undef to <128 x i32> %v128i1_v128i64 = sext <128 x i1> undef to <128 x i64> - %vp_v128i8_v128i16 = call <128 x i16> @llvm.vp.sext.v128i8.v128i16(<128 x i8> undef, <128 x i1> undef, i32 undef) - %vp_v128i8_v128i32 = call <128 x i32> @llvm.vp.sext.v128i8.v128i32(<128 x i8> undef, <128 x i1> undef, i32 undef) - %vp_v128i8_v128i64 = call <128 x i64> @llvm.vp.sext.v128i8.v128i64(<128 x i8> undef, <128 x i1> undef, i32 undef) - %vp_v128i16_v128i32 = call <128 x i32> @llvm.vp.sext.v128i16.v128i32(<128 x i16> undef, <128 x i1> undef, i32 undef) - %vp_v128i16_v128i64 = call <128 x i64> @llvm.vp.sext.v128i16.v128i64(<128 x i16> undef, <128 x i1> undef, i32 undef) - %vp_v128i32_v128i64 = call <128 x i64> @llvm.vp.sext.v128i32.v128i64(<128 x i32> undef, <128 x i1> undef, i32 undef) - %vp_v128i1_v128i8 = call <128 x i8> @llvm.vp.sext.v128i1.v128i8(<128 x i1> undef, <128 x i1> undef, i32 undef) - %vp_v128i1_v128i16 = call <128 x i16> @llvm.vp.sext.v128i1.v128i16(<128 x i1> undef, <128 x i1> undef, i32 undef) - %vp_v128i1_v128i32 = call <128 x i32> @llvm.vp.sext.v128i1.v128i32(<128 x i1> undef, <128 x i1> undef, i32 undef) - %vp_v128i1_v128i64 = call <128 x i64> @llvm.vp.sext.v128i1.v128i64(<128 
x i1> undef, <128 x i1> undef, i32 undef) + %vp_v128i8_v128i16 = call <128 x i16> @llvm.vp.sext.v128i16.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef) + %vp_v128i8_v128i32 = call <128 x i32> @llvm.vp.sext.v128i32.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef) + %vp_v128i8_v128i64 = call <128 x i64> @llvm.vp.sext.v128i64.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef) + %vp_v128i16_v128i32 = call <128 x i32> @llvm.vp.sext.v128i32.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef) + %vp_v128i16_v128i64 = call <128 x i64> @llvm.vp.sext.v128i64.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef) + %vp_v128i32_v128i64 = call <128 x i64> @llvm.vp.sext.v128i64.v128i32(<128 x i32> undef, <128 x i1> undef, i32 undef) + %vp_v128i1_v128i8 = call <128 x i8> @llvm.vp.sext.v128i8.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef) + %vp_v128i1_v128i16 = call <128 x i16> @llvm.vp.sext.v128i16.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef) + %vp_v128i1_v128i32 = call <128 x i32> @llvm.vp.sext.v128i32.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef) + %vp_v128i1_v128i64 = call <128 x i64> @llvm.vp.sext.v128i64.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef) %v256i8_v256i16 = sext <256 x i8> undef to <256 x i16> %v256i8_v256i32 = sext <256 x i8> undef to <256 x i32> @@ -814,16 +814,16 @@ define void @sext() { %v256i1_v256i32 = sext <256 x i1> undef to <256 x i32> %v256i1_v256i64 = sext <256 x i1> undef to <256 x i64> - %vp_v256i8_v256i16 = call <256 x i16> @llvm.vp.sext.v256i8.v256i16(<256 x i8> undef, <256 x i1> undef, i32 undef) - %vp_v256i8_v256i32 = call <256 x i32> @llvm.vp.sext.v256i8.v256i32(<256 x i8> undef, <256 x i1> undef, i32 undef) - %vp_v256i8_v256i64 = call <256 x i64> @llvm.vp.sext.v256i8.v256i64(<256 x i8> undef, <256 x i1> undef, i32 undef) - %vp_v256i16_v256i32 = call <256 x i32> @llvm.vp.sext.v256i16.v256i32(<256 x i16> undef, <256 x i1> undef, i32 undef) - %vp_v256i16_v256i64 = call <256 x i64> @llvm.vp.sext.v256i16.v256i64(<256 x i16> undef, <256 x i1> undef, i32 undef) - %vp_v256i32_v256i64 = call <256 x i64> @llvm.vp.sext.v256i32.v256i64(<256 x i32> undef, <256 x i1> undef, i32 undef) - %vp_v256i1_v256i8 = call <256 x i8> @llvm.vp.sext.v256i1.v256i8(<256 x i1> undef, <256 x i1> undef, i32 undef) - %vp_v256i1_v256i16 = call <256 x i16> @llvm.vp.sext.v256i1.v256i16(<256 x i1> undef, <256 x i1> undef, i32 undef) - %vp_v256i1_v256i32 = call <256 x i32> @llvm.vp.sext.v256i1.v256i32(<256 x i1> undef, <256 x i1> undef, i32 undef) - %vp_v256i1_v256i64 = call <256 x i64> @llvm.vp.sext.v256i1.v256i64(<256 x i1> undef, <256 x i1> undef, i32 undef) + %vp_v256i8_v256i16 = call <256 x i16> @llvm.vp.sext.v256i16.v256i8(<256 x i8> undef, <256 x i1> undef, i32 undef) + %vp_v256i8_v256i32 = call <256 x i32> @llvm.vp.sext.v256i32.v256i8(<256 x i8> undef, <256 x i1> undef, i32 undef) + %vp_v256i8_v256i64 = call <256 x i64> @llvm.vp.sext.v256i64.v256i8(<256 x i8> undef, <256 x i1> undef, i32 undef) + %vp_v256i16_v256i32 = call <256 x i32> @llvm.vp.sext.v256i32.v256i16(<256 x i16> undef, <256 x i1> undef, i32 undef) + %vp_v256i16_v256i64 = call <256 x i64> @llvm.vp.sext.v256i64.v256i16(<256 x i16> undef, <256 x i1> undef, i32 undef) + %vp_v256i32_v256i64 = call <256 x i64> @llvm.vp.sext.v256i64.v256i32(<256 x i32> undef, <256 x i1> undef, i32 undef) + %vp_v256i1_v256i8 = call <256 x i8> @llvm.vp.sext.v256i8.v256i1(<256 x i1> undef, <256 x i1> undef, i32 undef) + %vp_v256i1_v256i16 = call <256 x i16> @llvm.vp.sext.v256i16.v256i1(<256 x i1> undef, <256 x i1> 
undef, i32 undef)
+ %vp_v256i1_v256i32 = call <256 x i32> @llvm.vp.sext.v256i32.v256i1(<256 x i1> undef, <256 x i1> undef, i32 undef)
+ %vp_v256i1_v256i64 = call <256 x i64> @llvm.vp.sext.v256i64.v256i1(<256 x i1> undef, <256 x i1> undef, i32 undef)
 %nxv1i8_nxv1i16 = sext <vscale x 1 x i8> undef to <vscale x 1 x i16>
 %nxv1i8_nxv1i32 = sext <vscale x 1 x i8> undef to <vscale x 1 x i32>
@@ -836,16 +836,16 @@ define void @sext() {
 %nxv1i1_nxv1i32 = sext <vscale x 1 x i1> undef to <vscale x 1 x i32>
 %nxv1i1_nxv1i64 = sext <vscale x 1 x i1> undef to <vscale x 1 x i64>
- %vp_nxv1i8_nxv1i16 = call <vscale x 1 x i16> @llvm.vp.sext.nxv1i8.nxv1i16(<vscale x 1 x i8> undef, <vscale x 1 x i1> undef, i32 undef)
- %vp_nxv1i8_nxv1i32 = call <vscale x 1 x i32> @llvm.vp.sext.nxv1i8.nxv1i32(<vscale x 1 x i8> undef, <vscale x 1 x i1> undef, i32 undef)
- %vp_nxv1i8_nxv1i64 = call <vscale x 1 x i64> @llvm.vp.sext.nxv1i8.nxv1i64(<vscale x 1 x i8> undef, <vscale x 1 x i1> undef, i32 undef)
- %vp_nxv1i16_nxv1i32 = call <vscale x 1 x i32> @llvm.vp.sext.nxv1i16.nxv1i32(<vscale x 1 x i16> undef, <vscale x 1 x i1> undef, i32 undef)
- %vp_nxv1i16_nxv1i64 = call <vscale x 1 x i64> @llvm.vp.sext.nxv1i16.nxv1i64(<vscale x 1 x i16> undef, <vscale x 1 x i1> undef, i32 undef)
- %vp_nxv1i32_nxv1i64 = call <vscale x 1 x i64> @llvm.vp.sext.nxv1i32.nxv1i64(<vscale x 1 x i32> undef, <vscale x 1 x i1> undef, i32 undef)
- %vp_nxv1i1_nxv1i8 = call <vscale x 1 x i8> @llvm.vp.sext.nxv1i1.nxv1i8(<vscale x 1 x i1> undef, <vscale x 1 x i1> undef, i32 undef)
- %vp_nxv1i1_nxv1i16 = call <vscale x 1 x i16> @llvm.vp.sext.nxv1i1.nxv1i16(<vscale x 1 x i1> undef, <vscale x 1 x i1> undef, i32 undef)
- %vp_nxv1i1_nxv1i32 = call <vscale x 1 x i32> @llvm.vp.sext.nxv1i1.nxv1i32(<vscale x 1 x i1> undef, <vscale x 1 x i1> undef, i32 undef)
- %vp_nxv1i1_nxv1i64 = call <vscale x 1 x i64> @llvm.vp.sext.nxv1i1.nxv1i64(<vscale x 1 x i1> undef, <vscale x 1 x i1> undef, i32 undef)
+ %vp_nxv1i8_nxv1i16 = call <vscale x 1 x i16> @llvm.vp.sext.nxv1i16.nxv1i8(<vscale x 1 x i8> undef, <vscale x 1 x i1> undef, i32 undef)
+ %vp_nxv1i8_nxv1i32 = call <vscale x 1 x i32> @llvm.vp.sext.nxv1i32.nxv1i8(<vscale x 1 x i8> undef, <vscale x 1 x i1> undef, i32 undef)
+ %vp_nxv1i8_nxv1i64 = call <vscale x 1 x i64> @llvm.vp.sext.nxv1i64.nxv1i8(<vscale x 1 x i8> undef, <vscale x 1 x i1> undef, i32 undef)
+ %vp_nxv1i16_nxv1i32 = call <vscale x 1 x i32> @llvm.vp.sext.nxv1i32.nxv1i16(<vscale x 1 x i16> undef, <vscale x 1 x i1> undef, i32 undef)
+ %vp_nxv1i16_nxv1i64 = call <vscale x 1 x i64> @llvm.vp.sext.nxv1i64.nxv1i16(<vscale x 1 x i16> undef, <vscale x 1 x i1> undef, i32 undef)
+ %vp_nxv1i32_nxv1i64 = call <vscale x 1 x i64> @llvm.vp.sext.nxv1i64.nxv1i32(<vscale x 1 x i32> undef, <vscale x 1 x i1> undef, i32 undef)
+ %vp_nxv1i1_nxv1i8 = call <vscale x 1 x i8> @llvm.vp.sext.nxv1i8.nxv1i1(<vscale x 1 x i1> undef, <vscale x 1 x i1> undef, i32 undef)
+ %vp_nxv1i1_nxv1i16 = call <vscale x 1 x i16> @llvm.vp.sext.nxv1i16.nxv1i1(<vscale x 1 x i1> undef, <vscale x 1 x i1> undef, i32 undef)
+ %vp_nxv1i1_nxv1i32 = call <vscale x 1 x i32> @llvm.vp.sext.nxv1i32.nxv1i1(<vscale x 1 x i1> undef, <vscale x 1 x i1> undef, i32 undef)
+ %vp_nxv1i1_nxv1i64 = call <vscale x 1 x i64> @llvm.vp.sext.nxv1i64.nxv1i1(<vscale x 1 x i1> undef, <vscale x 1 x i1> undef, i32 undef)
 %nxv2i8_nxv2i16 = sext <vscale x 2 x i8> undef to <vscale x 2 x i16>
 %nxv2i8_nxv2i32 = sext <vscale x 2 x i8> undef to <vscale x 2 x i32>
@@ -858,16 +858,16 @@ define void @sext() {
 %nxv2i1_nxv2i32 = sext <vscale x 2 x i1> undef to <vscale x 2 x i32>
 %nxv2i1_nxv2i64 = sext <vscale x 2 x i1> undef to <vscale x 2 x i64>
- %vp_nxv2i8_nxv2i16 = call <vscale x 2 x i16> @llvm.vp.sext.nxv2i8.nxv2i16(<vscale x 2 x i8> undef, <vscale x 2 x i1> undef, i32 undef)
- %vp_nxv2i8_nxv2i32 = call <vscale x 2 x i32> @llvm.vp.sext.nxv2i8.nxv2i32(<vscale x 2 x i8> undef, <vscale x 2 x i1> undef, i32 undef)
- %vp_nxv2i8_nxv2i64 = call <vscale x 2 x i64> @llvm.vp.sext.nxv2i8.nxv2i64(<vscale x 2 x i8> undef, <vscale x 2 x i1> undef, i32 undef)
- %vp_nxv2i16_nxv2i32 = call <vscale x 2 x i32> @llvm.vp.sext.nxv2i16.nxv2i32(<vscale x 2 x i16> undef, <vscale x 2 x i1> undef, i32 undef)
- %vp_nxv2i16_nxv2i64 = call <vscale x 2 x i64> @llvm.vp.sext.nxv2i16.nxv2i64(<vscale x 2 x i16> undef, <vscale x 2 x i1> undef, i32 undef)
- %vp_nxv2i32_nxv2i64 = call <vscale x 2 x i64> @llvm.vp.sext.nxv2i32.nxv2i64(<vscale x 2 x i32> undef, <vscale x 2 x i1> undef, i32 undef)
- %vp_nxv2i1_nxv2i8 = call <vscale x 2 x i8> @llvm.vp.sext.nxv2i1.nxv2i8(<vscale x 2 x i1> undef, <vscale x 2 x i1> undef, i32 undef)
- %vp_nxv2i1_nxv2i16 = call <vscale x 2 x i16> @llvm.vp.sext.nxv2i1.nxv2i16(<vscale x 2 x i1> undef, <vscale x 2 x i1> undef, i32 undef)
- %vp_nxv2i1_nxv2i32 = call <vscale x 2 x i32> @llvm.vp.sext.nxv2i1.nxv2i32(<vscale x 2 x i1> undef, <vscale x 2 x i1> undef, i32 undef)
- %vp_nxv2i1_nxv2i64 = call <vscale x 2 x i64> @llvm.vp.sext.nxv2i1.nxv2i64(<vscale x 2 x i1> undef, <vscale x 2 x i1> undef, i32 undef)
+ %vp_nxv2i8_nxv2i16 = call <vscale x 2 x i16> @llvm.vp.sext.nxv2i16.nxv2i8(<vscale x 2 x i8> undef, <vscale x 2 x i1> undef, i32 undef)
+ %vp_nxv2i8_nxv2i32 = call <vscale x 2 x i32> @llvm.vp.sext.nxv2i32.nxv2i8(<vscale x 2 x i8> undef, <vscale x 2 x i1> undef, i32 undef)
+ %vp_nxv2i8_nxv2i64 = call <vscale x 2 x i64> @llvm.vp.sext.nxv2i64.nxv2i8(<vscale x 2 x i8> undef, <vscale x 2 x i1> undef, i32 undef)
+ %vp_nxv2i16_nxv2i32 = call <vscale x 2 x i32> @llvm.vp.sext.nxv2i32.nxv2i16(<vscale x 2 x i16> undef, <vscale x 2 x i1> undef, i32 undef)
+ %vp_nxv2i16_nxv2i64 = call <vscale x 2 x i64> @llvm.vp.sext.nxv2i64.nxv2i16(<vscale x 2 x i16> undef, <vscale x 2 x i1> undef, i32 undef)
+ %vp_nxv2i32_nxv2i64 = call <vscale x 2 x i64> @llvm.vp.sext.nxv2i64.nxv2i32(<vscale x 2 x i32> undef, <vscale x 2 x i1> undef, i32 undef)
+ %vp_nxv2i1_nxv2i8 = call <vscale x 2 x i8> @llvm.vp.sext.nxv2i8.nxv2i1(<vscale x 2 x i1> undef, <vscale x 2 x i1> undef, i32 undef)
+ %vp_nxv2i1_nxv2i16 = call <vscale x 2 x i16> @llvm.vp.sext.nxv2i16.nxv2i1(<vscale x 2 x i1> undef, <vscale x 2 x i1> undef, i32 undef)
+ %vp_nxv2i1_nxv2i32 = call <vscale x 2 x i32> @llvm.vp.sext.nxv2i32.nxv2i1(<vscale x 2 x i1> undef, <vscale x 2 x i1> undef, i32 undef)
+ %vp_nxv2i1_nxv2i64 = call <vscale x 2 x i64> @llvm.vp.sext.nxv2i64.nxv2i1(<vscale x 2 x i1> undef, <vscale x 2 x i1> undef, i32 undef)
 %nxv4i8_nxv4i16 = sext <vscale x 4 x i8> undef to <vscale x 4 x i16>
 %nxv4i8_nxv4i32 = sext <vscale x 4 x i8> undef to <vscale x 4 x i32>
@@ -880,16 +880,16 @@ define void @sext() {
 %nxv4i1_nxv4i32 = sext <vscale x 4 x i1> undef to <vscale x 4 x i32>
 %nxv4i1_nxv4i64 = sext <vscale x 4 x i1> undef to <vscale x 4 x i64>
- %vp_nxv4i8_nxv4i16 = call <vscale x 4 x i16> @llvm.vp.sext.nxv4i8.nxv4i16(<vscale x 4 x i8> undef, <vscale x 4 x i1> undef, i32 undef)
- %vp_nxv4i8_nxv4i32 = call <vscale x 4 x i32> @llvm.vp.sext.nxv4i8.nxv4i32(<vscale x 4 x i8> undef, <vscale x 4 x i1> undef, i32 undef)
- %vp_nxv4i8_nxv4i64 = call <vscale x 4 x i64> @llvm.vp.sext.nxv4i8.nxv4i64(<vscale x 4 x i8> undef, <vscale x 4 x i1> undef, i32 undef)
- %vp_nxv4i16_nxv4i32 = call <vscale x 4 x i32> @llvm.vp.sext.nxv4i16.nxv4i32(<vscale x 4 x i16> undef, <vscale x 4 x i1> undef, i32 undef)
- %vp_nxv4i16_nxv4i64 = call <vscale x 4 x i64> @llvm.vp.sext.nxv4i16.nxv4i64(<vscale x 4 x i16> undef, <vscale x 4 x i1> undef, i32 undef)
- %vp_nxv4i32_nxv4i64 = call <vscale x 4 x i64> @llvm.vp.sext.nxv4i32.nxv4i64(<vscale x 4 x i32> undef, <vscale x 4 x i1> undef, i32 undef)
- %vp_nxv4i1_nxv4i8 = call <vscale x 4 x i8> @llvm.vp.sext.nxv4i1.nxv4i8(<vscale x 4 x i1> undef, <vscale x 4 x i1> undef, i32 undef)
- %vp_nxv4i1_nxv4i16 = call <vscale x 4 x i16> @llvm.vp.sext.nxv4i1.nxv4i16(<vscale x 4 x i1> undef, <vscale x 4 x i1> undef, i32 undef)
- %vp_nxv4i1_nxv4i32 = call <vscale x 4 x i32> @llvm.vp.sext.nxv4i1.nxv4i32(<vscale x 4 x i1> undef, <vscale x 4 x i1> undef, i32 undef)
- %vp_nxv4i1_nxv4i64 = call <vscale x 4 x i64> @llvm.vp.sext.nxv4i1.nxv4i64(<vscale x 4 x i1> undef, <vscale x 4 x i1> undef, i32 undef)
+ %vp_nxv4i8_nxv4i16 = call <vscale x 4 x i16> @llvm.vp.sext.nxv4i16.nxv4i8(<vscale x 4 x i8> undef, <vscale x 4 x i1> undef, i32 undef)
+ %vp_nxv4i8_nxv4i32 = call <vscale x 4 x i32> @llvm.vp.sext.nxv4i32.nxv4i8(<vscale x 4 x i8> undef, <vscale x 4 x i1> undef, i32 undef)
+ %vp_nxv4i8_nxv4i64 = call <vscale x 4 x i64> @llvm.vp.sext.nxv4i64.nxv4i8(<vscale x 4 x i8> undef, <vscale x 4 x i1> undef, i32 undef)
+ %vp_nxv4i16_nxv4i32 = call <vscale x 4 x i32> @llvm.vp.sext.nxv4i32.nxv4i16(<vscale x 4 x i16> undef, <vscale x 4 x i1> undef, i32 undef)
+ %vp_nxv4i16_nxv4i64 = call <vscale x 4 x i64> @llvm.vp.sext.nxv4i64.nxv4i16(<vscale x 4 x i16> undef, <vscale x 4 x i1> undef, i32 undef)
+ %vp_nxv4i32_nxv4i64 = call <vscale x 4 x i64> @llvm.vp.sext.nxv4i64.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i1> undef, i32 undef)
+ %vp_nxv4i1_nxv4i8 = call <vscale x 4 x i8> @llvm.vp.sext.nxv4i8.nxv4i1(<vscale x 4 x i1> undef, <vscale x 4 x i1> undef, i32 undef)
+ %vp_nxv4i1_nxv4i16 = call <vscale x 4 x i16> @llvm.vp.sext.nxv4i16.nxv4i1(<vscale x 4 x i1> undef, <vscale x 4 x i1> undef, i32 undef)
+ %vp_nxv4i1_nxv4i32 = call <vscale x 4 x i32> @llvm.vp.sext.nxv4i32.nxv4i1(<vscale x 4 x i1> undef, <vscale x 4 x i1> undef, i32 undef)
+ %vp_nxv4i1_nxv4i64 = call <vscale x 4 x i64> @llvm.vp.sext.nxv4i64.nxv4i1(<vscale x 4 x i1> undef, <vscale x 4 x i1> undef, i32 undef)
 %nxv8i8_nxv8i16 = sext <vscale x 8 x i8> undef to <vscale x 8 x i16>
 %nxv8i8_nxv8i32 = sext <vscale x 8 x i8> undef to <vscale x 8 x i32>
@@ -902,16 +902,16 @@ define void @sext() {
 %nxv8i1_nxv8i32 = sext <vscale x 8 x i1> undef to <vscale x 8 x i32>
 %nxv8i1_nxv8i64 = sext <vscale x 8 x i1> undef to <vscale x 8 x i64>
- %vp_nxv8i8_nxv8i16 = call <vscale x 8 x i16> @llvm.vp.sext.nxv8i8.nxv8i16(<vscale x 8 x i8> undef, <vscale x 8 x i1> undef, i32 undef)
- %vp_nxv8i8_nxv8i32 = call <vscale x 8 x i32> @llvm.vp.sext.nxv8i8.nxv8i32(<vscale x 8 x i8> undef, <vscale x 8 x i1> undef, i32 undef)
- %vp_nxv8i8_nxv8i64 = call <vscale x 8 x i64> @llvm.vp.sext.nxv8i8.nxv8i64(<vscale x 8 x i8> undef, <vscale x 8 x i1> undef, i32 undef)
- %vp_nxv8i16_nxv8i32 = call <vscale x 8 x i32> @llvm.vp.sext.nxv8i16.nxv8i32(<vscale x 8 x i16> undef, <vscale x 8 x i1> undef, i32 undef)
- %vp_nxv8i16_nxv8i64 = call <vscale x 8 x i64> @llvm.vp.sext.nxv8i16.nxv8i64(<vscale x 8 x i16> undef, <vscale x 8 x i1> undef, i32 undef)
- %vp_nxv8i32_nxv8i64 = call <vscale x 8 x i64> @llvm.vp.sext.nxv8i32.nxv8i64(<vscale x 8 x i32> undef, <vscale x 8 x i1> undef, i32 undef)
- %vp_nxv8i1_nxv8i8 = call <vscale x 8 x i8> @llvm.vp.sext.nxv8i1.nxv8i8(<vscale x 8 x i1> undef, <vscale x 8 x i1> undef, i32 undef)
- %vp_nxv8i1_nxv8i16 = call <vscale x 8 x i16> @llvm.vp.sext.nxv8i1.nxv8i16(<vscale x 8 x i1> undef, <vscale x 8 x i1> undef, i32 undef)
- %vp_nxv8i1_nxv8i32 = call <vscale x 8 x i32> @llvm.vp.sext.nxv8i1.nxv8i32(<vscale x 8 x i1> undef, <vscale x 8 x i1> undef, i32 undef)
- %vp_nxv8i1_nxv8i64 = call <vscale x 8 x i64> @llvm.vp.sext.nxv8i1.nxv8i64(<vscale x 8 x i1> undef, <vscale x 8 x i1> undef, i32 undef)
+ %vp_nxv8i8_nxv8i16 = call <vscale x 8 x i16> @llvm.vp.sext.nxv8i16.nxv8i8(<vscale x 8 x i8> undef, <vscale x 8 x i1> undef, i32 undef)
+ %vp_nxv8i8_nxv8i32 = call <vscale x 8 x i32> @llvm.vp.sext.nxv8i32.nxv8i8(<vscale x 8 x i8> undef, <vscale x 8 x i1> undef, i32 undef)
+ %vp_nxv8i8_nxv8i64 = call <vscale x 8 x i64> @llvm.vp.sext.nxv8i64.nxv8i8(<vscale x 8 x i8> undef, <vscale x 8 x i1> undef, i32 undef)
+ %vp_nxv8i16_nxv8i32 = call <vscale x 8 x i32> @llvm.vp.sext.nxv8i32.nxv8i16(<vscale x 8 x i16> undef, <vscale x 8 x i1> undef, i32 undef)
+ %vp_nxv8i16_nxv8i64 = call <vscale x 8 x i64> @llvm.vp.sext.nxv8i64.nxv8i16(<vscale x 8 x i16> undef, <vscale x 8 x i1> undef, i32 undef)
+ %vp_nxv8i32_nxv8i64 = call <vscale x 8 x i64> @llvm.vp.sext.nxv8i64.nxv8i32(<vscale x 8 x i32> undef, <vscale x 8 x i1> undef, i32 undef)
+ %vp_nxv8i1_nxv8i8 = call <vscale x 8 x i8> @llvm.vp.sext.nxv8i8.nxv8i1(<vscale x 8 x i1> undef, <vscale x 8 x i1> undef, i32 undef)
+ %vp_nxv8i1_nxv8i16 = call <vscale x 8 x i16> @llvm.vp.sext.nxv8i16.nxv8i1(<vscale x 8 x i1> undef, <vscale x 8 x i1> undef, i32 undef)
+ %vp_nxv8i1_nxv8i32 = call <vscale x 8 x i32> @llvm.vp.sext.nxv8i32.nxv8i1(<vscale x 8 x i1> undef, <vscale x 8 x i1> undef, i32 undef)
+ %vp_nxv8i1_nxv8i64 = call <vscale x 8 x i64> @llvm.vp.sext.nxv8i64.nxv8i1(<vscale x 8 x i1> undef, <vscale x 8 x i1> undef, i32 undef)
 %nxv16i8_nxv16i16 = sext <vscale x 16 x i8> undef to <vscale x 16 x i16>
 %nxv16i8_nxv16i32 = sext <vscale x 16 x i8> undef to <vscale x 16 x i32>
@@ -924,16 +924,16 @@ define void @sext() {
 %nxv16i1_nxv16i32 = sext <vscale x 16 x i1> undef to <vscale x 16 x i32>
 %nxv16i1_nxv16i64 = sext <vscale x 16 x i1> undef to <vscale x 16 x i64>
- %vp_nxv16i8_nxv16i16 = call <vscale x 16 x i16> @llvm.vp.sext.nxv16i8.nxv16i16(<vscale x 16 x i8> undef, <vscale x 16 x i1> undef, i32 undef)
- %vp_nxv16i8_nxv16i32 = call <vscale x 16 x i32> @llvm.vp.sext.nxv16i8.nxv16i32(<vscale x 16 x i8> undef, <vscale x 16 x i1> undef, i32 undef)
- %vp_nxv16i8_nxv16i64 = call <vscale x 16 x i64> @llvm.vp.sext.nxv16i8.nxv16i64(<vscale x 16 x i8> undef, <vscale x 16 x i1> undef, i32 undef)
- %vp_nxv16i16_nxv16i32 = call <vscale x 16 x i32> @llvm.vp.sext.nxv16i16.nxv16i32(<vscale x 16 x i16> undef, <vscale x 16 x i1> undef, i32 undef)
- %vp_nxv16i16_nxv16i64 = call <vscale x 16 x i64> @llvm.vp.sext.nxv16i16.nxv16i64(<vscale x 16 x i16> undef, <vscale x 16 x i1> undef, i32 undef)
- %vp_nxv16i32_nxv16i64 = call <vscale x 16 x i64> @llvm.vp.sext.nxv16i32.nxv16i64(<vscale x 16 x i32> undef, <vscale x 16 x i1> undef, i32 undef)
- %vp_nxv16i1_nxv16i8 = call <vscale x 16 x i8> @llvm.vp.sext.nxv16i1.nxv16i8(<vscale x 16 x i1> undef, <vscale x 16 x i1> undef, i32 undef)
- %vp_nxv16i1_nxv16i16 = call <vscale x 16 x i16> @llvm.vp.sext.nxv16i1.nxv16i16(<vscale x 16 x i1> undef, <vscale x 16 x i1> undef, i32 undef)
- %vp_nxv16i1_nxv16i32 = call <vscale x 16 x i32> @llvm.vp.sext.nxv16i1.nxv16i32(<vscale x 16 x i1> undef, <vscale x 16 x i1> undef, i32 undef)
- %vp_nxv16i1_nxv16i64 = call <vscale x 16 x i64> @llvm.vp.sext.nxv16i1.nxv16i64(<vscale x 16 x i1> undef, <vscale x 16 x i1> undef, i32 undef)
+ %vp_nxv16i8_nxv16i16 = call <vscale x 16 x i16> @llvm.vp.sext.nxv16i16.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i1> undef, i32 undef)
+ %vp_nxv16i8_nxv16i32 = call <vscale x 16 x i32> @llvm.vp.sext.nxv16i32.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i1> undef, i32 undef)
+ %vp_nxv16i8_nxv16i64 = call <vscale x 16 x i64> @llvm.vp.sext.nxv16i64.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i1> undef, i32 undef)
+ %vp_nxv16i16_nxv16i32 = call <vscale x 16 x i32> @llvm.vp.sext.nxv16i32.nxv16i16(<vscale x 16 x i16> undef, <vscale x 16 x i1> undef, i32 undef)
+ %vp_nxv16i16_nxv16i64 = call <vscale x 16 x i64> @llvm.vp.sext.nxv16i64.nxv16i16(<vscale x 16 x i16> undef, <vscale x 16 x i1> undef, i32 undef)
+ %vp_nxv16i32_nxv16i64 = call <vscale x 16 x i64> @llvm.vp.sext.nxv16i64.nxv16i32(<vscale x 16 x i32> undef, <vscale x 16 x i1> undef, i32 undef)
+ %vp_nxv16i1_nxv16i8 = call <vscale x 16 x i8> @llvm.vp.sext.nxv16i8.nxv16i1(<vscale x 16 x i1> undef, <vscale x 16 x i1> undef, i32 undef)
+ %vp_nxv16i1_nxv16i16 = call <vscale x 16 x i16> @llvm.vp.sext.nxv16i16.nxv16i1(<vscale x 16 x i1> undef, <vscale x 16 x i1> undef, i32 undef)
+ %vp_nxv16i1_nxv16i32 = call <vscale x 16 x i32> @llvm.vp.sext.nxv16i32.nxv16i1(<vscale x 16 x i1> undef, <vscale x 16 x i1> undef, i32 undef)
+ %vp_nxv16i1_nxv16i64 = call <vscale x 16 x i64> @llvm.vp.sext.nxv16i64.nxv16i1(<vscale x 16 x i1> undef, <vscale x 16 x i1> undef, i32 undef)
 %nxv32i8_nxv32i16 = sext <vscale x 32 x i8> undef to <vscale x 32 x i16>
 %nxv32i8_nxv32i32 = sext <vscale x 32 x i8> undef to <vscale x 32 x i32>
@@ -946,16 +946,16 @@ define void @sext() {
 %nxv32i1_nxv32i32 = sext <vscale x 32 x i1> undef to <vscale x 32 x i32>
 %nxv32i1_nxv32i64 = sext <vscale x 32 x i1> undef to <vscale x 32 x i64>
- %vp_nxv32i8_nxv32i16 = call <vscale x 32 x i16> @llvm.vp.sext.nxv32i8.nxv32i16(<vscale x 32 x i8> undef, <vscale x 32 x i1> undef, i32 undef)
- %vp_nxv32i8_nxv32i32 = call <vscale x 32 x i32> @llvm.vp.sext.nxv32i8.nxv32i32(<vscale x 32 x i8> undef, <vscale x 32 x i1> undef, i32 undef)
- %vp_nxv32i8_nxv32i64 = call <vscale x 32 x i64> @llvm.vp.sext.nxv32i8.nxv32i64(<vscale x 32 x i8> undef, <vscale x 32 x i1> undef, i32 undef)
- %vp_nxv32i16_nxv32i32 = call <vscale x 32 x i32> @llvm.vp.sext.nxv32i16.nxv32i32(<vscale x 32 x i16> undef, <vscale x 32 x i1> undef, i32 undef)
- %vp_nxv32i16_nxv32i64 = call <vscale x 32 x i64> @llvm.vp.sext.nxv32i16.nxv32i64(<vscale x 32 x i16> undef, <vscale x 32 x i1> undef, i32 undef)
- %vp_nxv32i32_nxv32i64 = call <vscale x 32 x i64> @llvm.vp.sext.nxv32i32.nxv32i64(<vscale x 32 x i32> undef, <vscale x 32 x i1> undef, i32 undef)
- %vp_nxv32i1_nxv32i8 = call <vscale x 32 x i8> @llvm.vp.sext.nxv32i1.nxv32i8(<vscale x 32 x i1> undef, <vscale x 32 x i1> undef, i32 undef)
- %vp_nxv32i1_nxv32i16 = call <vscale x 32 x i16> @llvm.vp.sext.nxv32i1.nxv32i16(<vscale x 32 x i1> undef, <vscale x 32 x i1> undef, i32 undef)
- %vp_nxv32i1_nxv32i32 = call <vscale x 32 x i32> @llvm.vp.sext.nxv32i1.nxv32i32(<vscale x 32 x i1> undef, <vscale x 32 x i1> undef, i32 undef)
- %vp_nxv32i1_nxv32i64 = call <vscale x 32 x i64> @llvm.vp.sext.nxv32i1.nxv32i64(<vscale x 32 x i1> undef, <vscale x 32 x i1> undef, i32 undef)
+ %vp_nxv32i8_nxv32i16 = call <vscale x 32 x i16> @llvm.vp.sext.nxv32i16.nxv32i8(<vscale x 32 x i8> undef, <vscale x 32 x i1> undef, i32 undef)
+ %vp_nxv32i8_nxv32i32 = call <vscale x 32 x i32> @llvm.vp.sext.nxv32i32.nxv32i8(<vscale x 32 x i8> undef, <vscale x 32 x i1> undef, i32 undef)
+ %vp_nxv32i8_nxv32i64 = call <vscale x 32 x i64> @llvm.vp.sext.nxv32i64.nxv32i8(<vscale x 32 x i8> undef, <vscale x 32 x i1> undef, i32 undef)
+ %vp_nxv32i16_nxv32i32 = call <vscale x 32 x i32> @llvm.vp.sext.nxv32i32.nxv32i16(<vscale x 32 x i16> undef, <vscale x 32 x i1> undef, i32 undef)
+ %vp_nxv32i16_nxv32i64 = call <vscale x 32 x i64> @llvm.vp.sext.nxv32i64.nxv32i16(<vscale x 32 x i16> undef, <vscale x 32 x i1> undef, i32 undef)
+ %vp_nxv32i32_nxv32i64 = call <vscale x 32 x i64> @llvm.vp.sext.nxv32i64.nxv32i32(<vscale x 32 x i32> undef, <vscale x 32 x i1> undef, i32 undef)
+ %vp_nxv32i1_nxv32i8 = call <vscale x 32 x i8> @llvm.vp.sext.nxv32i8.nxv32i1(<vscale x 32 x i1> undef, <vscale x 32 x i1> undef, i32 undef)
+ %vp_nxv32i1_nxv32i16 = call <vscale x 32 x i16> @llvm.vp.sext.nxv32i16.nxv32i1(<vscale x 32 x i1> undef, <vscale x 32 x i1> undef, i32 undef)
+ %vp_nxv32i1_nxv32i32 = call <vscale x 32 x i32> @llvm.vp.sext.nxv32i32.nxv32i1(<vscale x 32 x i1> undef, <vscale x 32 x i1> undef, i32 undef)
+ %vp_nxv32i1_nxv32i64 = call <vscale x 32 x i64> @llvm.vp.sext.nxv32i64.nxv32i1(<vscale x 32 x i1> undef, <vscale x 32 x i1> undef, i32 undef)
 %nxv64i8_nxv64i16 = sext <vscale x 64 x i8> undef to <vscale x 64 x i16>
 %nxv64i8_nxv64i32 = sext <vscale x 64 x i8> undef to <vscale x 64 x i32>
@@ -968,16 +968,16 @@ define void @sext() {
 %nxv64i1_nxv64i32 = sext <vscale x 64 x i1> undef to <vscale x 64 x i32>
 %nxv64i1_nxv64i64 = sext <vscale x 64 x i1> undef to <vscale x 64 x i64>
- %vp_nxv64i8_nxv64i16 = call <vscale x 64 x i16> @llvm.vp.sext.nxv64i8.nxv64i16(<vscale x 64 x i8> undef, <vscale x 64 x i1> undef, i32 undef)
- %vp_nxv64i8_nxv64i32 = call <vscale x 64 x i32> @llvm.vp.sext.nxv64i8.nxv64i32(<vscale x 64 x i8> undef, <vscale x 64 x i1> undef, i32 undef)
- %vp_nxv64i8_nxv64i64 = call <vscale x 64 x i64> @llvm.vp.sext.nxv64i8.nxv64i64(<vscale x 64 x i8> undef, <vscale x 64 x i1> undef, i32 undef)
- %vp_nxv64i16_nxv64i32 = call <vscale x 64 x i32> @llvm.vp.sext.nxv64i16.nxv64i32(<vscale x 64 x i16> undef, <vscale x 64 x i1> undef, i32 undef)
- %vp_nxv64i16_nxv64i64 = call <vscale x 64 x i64> @llvm.vp.sext.nxv64i16.nxv64i64(<vscale x 64 x i16> undef, <vscale x 64 x i1> undef, i32 undef)
- %vp_nxv64i32_nxv64i64 = call <vscale x 64 x i64> @llvm.vp.sext.nxv64i32.nxv64i64(<vscale x 64 x i32> undef, <vscale x 64 x i1> undef, i32 undef)
- %vp_nxv64i1_nxv64i8 = call <vscale x 64 x i8> @llvm.vp.sext.nxv64i1.nxv64i8(<vscale x 64 x i1> undef, <vscale x 64 x i1> undef, i32 undef)
- %vp_nxv64i1_nxv64i16 = call <vscale x 64 x i16> @llvm.vp.sext.nxv64i1.nxv64i16(<vscale x 64 x i1> undef, <vscale x 64 x i1> undef, i32 undef)
- %vp_nxv64i1_nxv64i32 = call <vscale x 64 x i32> @llvm.vp.sext.nxv64i1.nxv64i32(<vscale x 64 x i1> undef, <vscale x 64 x i1> undef, i32 undef)
- %vp_nxv64i1_nxv64i64 = call <vscale x 64 x i64> @llvm.vp.sext.nxv64i1.nxv64i64(<vscale x 64 x i1> undef, <vscale x 64 x i1> undef, i32 undef)
+ %vp_nxv64i8_nxv64i16 = call <vscale x 64 x i16> @llvm.vp.sext.nxv64i16.nxv64i8(<vscale x 64 x i8> undef, <vscale x 64 x i1> undef, i32 undef)
+ %vp_nxv64i8_nxv64i32 = call <vscale x 64 x i32> @llvm.vp.sext.nxv64i32.nxv64i8(<vscale x 64 x i8> undef, <vscale x 64 x i1> undef, i32 undef)
+ %vp_nxv64i8_nxv64i64 = call <vscale x 64 x i64> @llvm.vp.sext.nxv64i64.nxv64i8(<vscale x 64 x i8> undef, <vscale x 64 x i1> undef, i32 undef)
+ %vp_nxv64i16_nxv64i32 = call <vscale x 64 x i32> @llvm.vp.sext.nxv64i32.nxv64i16(<vscale x 64 x i16> undef, <vscale x 64 x i1> undef, i32 undef)
+ %vp_nxv64i16_nxv64i64 = call <vscale x 64 x i64> @llvm.vp.sext.nxv64i64.nxv64i16(<vscale x 64 x i16> undef, <vscale x 64 x i1> undef, i32 undef)
+ %vp_nxv64i32_nxv64i64 = call <vscale x 64 x i64> @llvm.vp.sext.nxv64i64.nxv64i32(<vscale x 64 x i32> undef, <vscale x 64 x i1> undef, i32 undef)
+ %vp_nxv64i1_nxv64i8 = call <vscale x 64 x i8> @llvm.vp.sext.nxv64i8.nxv64i1(<vscale x 64 x i1> undef, <vscale x 64 x i1> undef, i32 undef)
+ %vp_nxv64i1_nxv64i16 = call <vscale x 64 x i16> @llvm.vp.sext.nxv64i16.nxv64i1(<vscale x 64 x i1> undef, <vscale x 64 x i1> undef, i32 undef)
+ %vp_nxv64i1_nxv64i32 = call <vscale x 64 x i32> @llvm.vp.sext.nxv64i32.nxv64i1(<vscale x 64 x i1> undef, <vscale x 64 x i1> undef, i32 undef)
+ %vp_nxv64i1_nxv64i64 = call <vscale x 64 x i64> @llvm.vp.sext.nxv64i64.nxv64i1(<vscale x 64 x i1> undef, <vscale x 64 x i1> undef, i32 undef)
 %nxv128i8_nxv128i16 = sext <vscale x 128 x i8> undef to <vscale x 128 x i16>
 %nxv128i8_nxv128i32 = sext <vscale x 128 x i8> undef to <vscale x 128 x i32>
@@ -990,16 +990,16 @@ define void @sext() {
 %nxv128i1_nxv128i32 = sext <vscale x 128 x i1> undef to <vscale x 128 x i32>
 %nxv128i1_nxv128i128 = sext <vscale x 128 x i1> undef to <vscale x 128 x i128>
- %vp_nxv128i8_nxv128i16 = call <vscale x 128 x i16> @llvm.vp.sext.nxv128i8.nxv128i16(<vscale x 128 x i8> undef, <vscale x 128 x i1> undef, i32 undef)
- %vp_nxv128i8_nxv128i32 = call <vscale x 128 x i32> @llvm.vp.sext.nxv128i8.nxv128i32(<vscale x 128 x i8> undef, <vscale x 128 x i1> undef, i32 undef)
- %vp_nxv128i8_nxv128i128 = call <vscale x 128 x i128> @llvm.vp.sext.nxv128i8.nxv128i128(<vscale x 128 x i8> undef, <vscale x 128 x i1> undef, i32 undef)
- %vp_nxv128i16_nxv128i32 = call <vscale x 128 x i32> @llvm.vp.sext.nxv128i16.nxv128i32(<vscale x 128 x i16> undef, <vscale x 128 x i1> undef, i32 undef)
- %vp_nxv128i16_nxv128i128 = call <vscale x 128 x i128> @llvm.vp.sext.nxv128i16.nxv128i128(<vscale x 128 x i16> undef, <vscale x 128 x i1> undef, i32 undef)
- %vp_nxv128i32_nxv128i128 = call <vscale x 128 x i128> @llvm.vp.sext.nxv128i32.nxv128i128(<vscale x 128 x i32> undef, <vscale x 128 x i1> undef, i32 undef)
- %vp_nxv128i1_nxv128i8 = call <vscale x 128 x i8> @llvm.vp.sext.nxv128i1.nxv128i8(<vscale x 128 x i1> undef, <vscale x 128 x i1> undef, i32 undef)
- %vp_nxv128i1_nxv128i16 = call <vscale x 128 x i16> @llvm.vp.sext.nxv128i1.nxv128i16(<vscale x 128 x i1> undef, <vscale x 128 x i1> undef, i32 undef)
- %vp_nxv128i1_nxv128i32 = call <vscale x 128 x i32> @llvm.vp.sext.nxv128i1.nxv128i32(<vscale x 128 x i1> undef, <vscale x 128 x i1> undef, i32 undef)
- %vp_nxv128i1_nxv128i128 = call <vscale x 128 x i128> @llvm.vp.sext.nxv128i1.nxv128i128(<vscale x 128 x i1> undef, <vscale x 128 x i1> undef, i32 undef)
+ %vp_nxv128i8_nxv128i16 = call <vscale x 128 x i16> @llvm.vp.sext.nxv128i16.nxv128i8(<vscale x 128 x i8> undef, <vscale x 128 x i1> undef, i32 undef)
+ %vp_nxv128i8_nxv128i32 = call <vscale x 128 x i32> @llvm.vp.sext.nxv128i32.nxv128i8(<vscale x 128 x i8> undef, <vscale x 128 x i1> undef, i32 undef)
+ %vp_nxv128i8_nxv128i128 = call <vscale x 128 x i128> @llvm.vp.sext.nxv128i128.nxv128i8(<vscale x 128 x i8> undef, <vscale x 128 x i1> undef, i32 undef)
+ %vp_nxv128i16_nxv128i32 = call <vscale x 128 x i32> @llvm.vp.sext.nxv128i32.nxv128i16(<vscale x 128 x i16> undef, <vscale x 128 x i1> undef, i32 undef)
+ %vp_nxv128i16_nxv128i128 = call <vscale x 128 x i128> @llvm.vp.sext.nxv128i128.nxv128i16(<vscale x 128 x i16> undef, <vscale x 128 x i1> undef, i32 undef)
+ %vp_nxv128i32_nxv128i128 = call <vscale x 128 x i128> @llvm.vp.sext.nxv128i128.nxv128i32(<vscale x 128 x i32> undef, <vscale x 128 x i1> undef, i32 undef)
+ %vp_nxv128i1_nxv128i8 = call <vscale x 128 x i8> @llvm.vp.sext.nxv128i8.nxv128i1(<vscale x 128 x i1> undef, <vscale x 128 x i1> undef, i32 undef)
+ %vp_nxv128i1_nxv128i16 = call <vscale x 128 x i16> @llvm.vp.sext.nxv128i16.nxv128i1(<vscale x 128 x i1> undef, <vscale x 128 x i1> undef, i32 undef)
+ %vp_nxv128i1_nxv128i32 = call <vscale x 128 x i32> @llvm.vp.sext.nxv128i32.nxv128i1(<vscale x 128 x i1> undef, <vscale x 128 x i1> undef, i32 undef)
+ %vp_nxv128i1_nxv128i128 = call <vscale x 128 x i128> @llvm.vp.sext.nxv128i128.nxv128i1(<vscale x 128 x i1> undef, <vscale x 128 x i1> undef, i32 undef)
 ret void
 }
@@ -1662,16 +1662,16 @@ define void @zext() {
 %v2i1_v2i32 = zext <2 x i1> undef to <2 x i32>
 %v2i1_v2i64 = zext <2 x i1> undef to <2 x i64>
- %vp_v2i8_v2i16 = call <2 x i16> @llvm.vp.zext.v2i8.v2i16(<2 x i8> undef, <2 x i1> undef, i32 undef)
- %vp_v2i8_v2i32 = call <2 x i32> @llvm.vp.zext.v2i8.v2i32(<2 x i8> undef, <2 x i1> undef, i32 undef)
- %vp_v2i8_v2i64 = call <2 x i64> @llvm.vp.zext.v2i8.v2i64(<2 x i8> undef, <2 x i1> undef, i32 undef)
- %vp_v2i16_v2i32 = call <2 x i32> @llvm.vp.zext.v2i16.v2i32(<2 x i16> undef, <2 x i1> undef, i32 undef)
- %vp_v2i16_v2i64 = call <2 x i64> @llvm.vp.zext.v2i16.v2i64(<2 x i16> undef, <2 x i1> undef, i32 undef)
- %vp_v2i32_v2i64 = call <2 x i64> @llvm.vp.zext.v2i32.v2i64(<2 x i32> undef, <2 x i1> undef, i32 undef)
- %vp_v2i1_v2i8 = call <2 x i8> @llvm.vp.zext.v2i1.v2i8(<2 x i1> undef, <2 x i1> undef, i32 undef)
- %vp_v2i1_v2i16 = call <2 x i16> @llvm.vp.zext.v2i1.v2i16(<2 x i1> undef, <2 x i1> undef, i32 undef)
- %vp_v2i1_v2i32 = call <2 x i32> @llvm.vp.zext.v2i1.v2i32(<2 x i1> undef, <2 x i1> undef, i32 undef)
- %vp_v2i1_v2i64 = call <2 x i64> @llvm.vp.zext.v2i1.v2i64(<2 x i1> undef, <2 x i1> undef, i32 undef)
+ %vp_v2i8_v2i16 = call <2 x i16> @llvm.vp.zext.v2i16.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef)
+ %vp_v2i8_v2i32 = call <2 x i32> @llvm.vp.zext.v2i32.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef)
+ %vp_v2i8_v2i64 = call <2 x i64> @llvm.vp.zext.v2i64.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef)
+ %vp_v2i16_v2i32 = call <2 x i32> @llvm.vp.zext.v2i32.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef)
+ %vp_v2i16_v2i64 = call <2 x i64> @llvm.vp.zext.v2i64.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef)
+ %vp_v2i32_v2i64 = call <2 x i64> @llvm.vp.zext.v2i64.v2i32(<2 x i32> undef, <2 x i1> undef, i32 undef)
+ %vp_v2i1_v2i8 = call <2 x i8> @llvm.vp.zext.v2i8.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef)
+ %vp_v2i1_v2i16 = call <2 x i16> @llvm.vp.zext.v2i16.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef)
+ %vp_v2i1_v2i32 = call <2 x i32> @llvm.vp.zext.v2i32.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef)
+ %vp_v2i1_v2i64 = call <2 x i64> @llvm.vp.zext.v2i64.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef)
 %v4i8_v4i16 = zext <4 x i8> undef to <4 x i16>
 %v4i8_v4i32 = zext <4 x i8> undef to <4 x i32>
@@ -1684,16 +1684,16 @@ define void
@zext() { %v4i1_v4i32 = zext <4 x i1> undef to <4 x i32> %v4i1_v4i64 = zext <4 x i1> undef to <4 x i64> - %vp_v4i8_v4i16 = call <4 x i16> @llvm.vp.zext.v4i8.v4i16(<4 x i8> undef, <4 x i1> undef, i32 undef) - %vp_v4i8_v4i32 = call <4 x i32> @llvm.vp.zext.v4i8.v4i32(<4 x i8> undef, <4 x i1> undef, i32 undef) - %vp_v4i8_v4i64 = call <4 x i64> @llvm.vp.zext.v4i8.v4i64(<4 x i8> undef, <4 x i1> undef, i32 undef) - %vp_v4i16_v4i32 = call <4 x i32> @llvm.vp.zext.v4i16.v4i32(<4 x i16> undef, <4 x i1> undef, i32 undef) - %vp_v4i16_v4i64 = call <4 x i64> @llvm.vp.zext.v4i16.v4i64(<4 x i16> undef, <4 x i1> undef, i32 undef) - %vp_v4i32_v4i64 = call <4 x i64> @llvm.vp.zext.v4i32.v4i64(<4 x i32> undef, <4 x i1> undef, i32 undef) - %vp_v4i1_v4i8 = call <4 x i8> @llvm.vp.zext.v4i1.v4i8(<4 x i1> undef, <4 x i1> undef, i32 undef) - %vp_v4i1_v4i16 = call <4 x i16> @llvm.vp.zext.v4i1.v4i16(<4 x i1> undef, <4 x i1> undef, i32 undef) - %vp_v4i1_v4i32 = call <4 x i32> @llvm.vp.zext.v4i1.v4i32(<4 x i1> undef, <4 x i1> undef, i32 undef) - %vp_v4i1_v4i64 = call <4 x i64> @llvm.vp.zext.v4i1.v4i64(<4 x i1> undef, <4 x i1> undef, i32 undef) + %vp_v4i8_v4i16 = call <4 x i16> @llvm.vp.zext.v4i16.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef) + %vp_v4i8_v4i32 = call <4 x i32> @llvm.vp.zext.v4i32.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef) + %vp_v4i8_v4i64 = call <4 x i64> @llvm.vp.zext.v4i64.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef) + %vp_v4i16_v4i32 = call <4 x i32> @llvm.vp.zext.v4i32.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef) + %vp_v4i16_v4i64 = call <4 x i64> @llvm.vp.zext.v4i64.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef) + %vp_v4i32_v4i64 = call <4 x i64> @llvm.vp.zext.v4i64.v4i32(<4 x i32> undef, <4 x i1> undef, i32 undef) + %vp_v4i1_v4i8 = call <4 x i8> @llvm.vp.zext.v4i8.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef) + %vp_v4i1_v4i16 = call <4 x i16> @llvm.vp.zext.v4i16.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef) + %vp_v4i1_v4i32 = call <4 x i32> @llvm.vp.zext.v4i32.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef) + %vp_v4i1_v4i64 = call <4 x i64> @llvm.vp.zext.v4i64.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef) %v8i8_v8i16 = zext <8 x i8> undef to <8 x i16> %v8i8_v8i32 = zext <8 x i8> undef to <8 x i32> @@ -1706,16 +1706,16 @@ define void @zext() { %v8i1_v8i32 = zext <8 x i1> undef to <8 x i32> %v8i1_v8i64 = zext <8 x i1> undef to <8 x i64> - %vp_v8i8_v8i16 = call <8 x i16> @llvm.vp.zext.v8i8.v8i16(<8 x i8> undef, <8 x i1> undef, i32 undef) - %vp_v8i8_v8i32 = call <8 x i32> @llvm.vp.zext.v8i8.v8i32(<8 x i8> undef, <8 x i1> undef, i32 undef) - %vp_v8i8_v8i64 = call <8 x i64> @llvm.vp.zext.v8i8.v8i64(<8 x i8> undef, <8 x i1> undef, i32 undef) - %vp_v8i16_v8i32 = call <8 x i32> @llvm.vp.zext.v8i16.v8i32(<8 x i16> undef, <8 x i1> undef, i32 undef) - %vp_v8i16_v8i64 = call <8 x i64> @llvm.vp.zext.v8i16.v8i64(<8 x i16> undef, <8 x i1> undef, i32 undef) - %vp_v8i32_v8i64 = call <8 x i64> @llvm.vp.zext.v8i32.v8i64(<8 x i32> undef, <8 x i1> undef, i32 undef) - %vp_v8i1_v8i8 = call <8 x i8> @llvm.vp.zext.v8i1.v8i8(<8 x i1> undef, <8 x i1> undef, i32 undef) - %vp_v8i1_v8i16 = call <8 x i16> @llvm.vp.zext.v8i1.v8i16(<8 x i1> undef, <8 x i1> undef, i32 undef) - %vp_v8i1_v8i32 = call <8 x i32> @llvm.vp.zext.v8i1.v8i32(<8 x i1> undef, <8 x i1> undef, i32 undef) - %vp_v8i1_v8i64 = call <8 x i64> @llvm.vp.zext.v8i1.v8i64(<8 x i1> undef, <8 x i1> undef, i32 undef) + %vp_v8i8_v8i16 = call <8 x i16> @llvm.vp.zext.v8i16.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef) + %vp_v8i8_v8i32 = call <8 x i32> 
@llvm.vp.zext.v8i32.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef) + %vp_v8i8_v8i64 = call <8 x i64> @llvm.vp.zext.v8i64.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef) + %vp_v8i16_v8i32 = call <8 x i32> @llvm.vp.zext.v8i32.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef) + %vp_v8i16_v8i64 = call <8 x i64> @llvm.vp.zext.v8i64.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef) + %vp_v8i32_v8i64 = call <8 x i64> @llvm.vp.zext.v8i64.v8i32(<8 x i32> undef, <8 x i1> undef, i32 undef) + %vp_v8i1_v8i8 = call <8 x i8> @llvm.vp.zext.v8i8.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef) + %vp_v8i1_v8i16 = call <8 x i16> @llvm.vp.zext.v8i16.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef) + %vp_v8i1_v8i32 = call <8 x i32> @llvm.vp.zext.v8i32.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef) + %vp_v8i1_v8i64 = call <8 x i64> @llvm.vp.zext.v8i64.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef) %v16i8_v16i16 = zext <16 x i8> undef to <16 x i16> %v16i8_v16i32 = zext <16 x i8> undef to <16 x i32> @@ -1728,16 +1728,16 @@ define void @zext() { %v16i1_v16i32 = zext <16 x i1> undef to <16 x i32> %v16i1_v16i64 = zext <16 x i1> undef to <16 x i64> - %vp_v16i8_v16i16 = call <16 x i16> @llvm.vp.zext.v16i8.v16i16(<16 x i8> undef, <16 x i1> undef, i32 undef) - %vp_v16i8_v16i32 = call <16 x i32> @llvm.vp.zext.v16i8.v16i32(<16 x i8> undef, <16 x i1> undef, i32 undef) - %vp_v16i8_v16i64 = call <16 x i64> @llvm.vp.zext.v16i8.v16i64(<16 x i8> undef, <16 x i1> undef, i32 undef) - %vp_v16i16_v16i32 = call <16 x i32> @llvm.vp.zext.v16i16.v16i32(<16 x i16> undef, <16 x i1> undef, i32 undef) - %vp_v16i16_v16i64 = call <16 x i64> @llvm.vp.zext.v16i16.v16i64(<16 x i16> undef, <16 x i1> undef, i32 undef) - %vp_v16i32_v16i64 = call <16 x i64> @llvm.vp.zext.v16i32.v16i64(<16 x i32> undef, <16 x i1> undef, i32 undef) - %vp_v16i1_v16i8 = call <16 x i8> @llvm.vp.zext.v16i1.v16i8(<16 x i1> undef, <16 x i1> undef, i32 undef) - %vp_v16i1_v16i16 = call <16 x i16> @llvm.vp.zext.v16i1.v16i16(<16 x i1> undef, <16 x i1> undef, i32 undef) - %vp_v16i1_v16i32 = call <16 x i32> @llvm.vp.zext.v16i1.v16i32(<16 x i1> undef, <16 x i1> undef, i32 undef) - %vp_v16i1_v16i64 = call <16 x i64> @llvm.vp.zext.v16i1.v16i64(<16 x i1> undef, <16 x i1> undef, i32 undef) + %vp_v16i8_v16i16 = call <16 x i16> @llvm.vp.zext.v16i16.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef) + %vp_v16i8_v16i32 = call <16 x i32> @llvm.vp.zext.v16i32.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef) + %vp_v16i8_v16i64 = call <16 x i64> @llvm.vp.zext.v16i64.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef) + %vp_v16i16_v16i32 = call <16 x i32> @llvm.vp.zext.v16i32.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef) + %vp_v16i16_v16i64 = call <16 x i64> @llvm.vp.zext.v16i64.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef) + %vp_v16i32_v16i64 = call <16 x i64> @llvm.vp.zext.v16i64.v16i32(<16 x i32> undef, <16 x i1> undef, i32 undef) + %vp_v16i1_v16i8 = call <16 x i8> @llvm.vp.zext.v16i8.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef) + %vp_v16i1_v16i16 = call <16 x i16> @llvm.vp.zext.v16i16.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef) + %vp_v16i1_v16i32 = call <16 x i32> @llvm.vp.zext.v16i32.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef) + %vp_v16i1_v16i64 = call <16 x i64> @llvm.vp.zext.v16i64.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef) %v32i8_v32i16 = zext <32 x i8> undef to <32 x i16> %v32i8_v32i32 = zext <32 x i8> undef to <32 x i32> @@ -1750,16 +1750,16 @@ define void @zext() { %v32i1_v32i32 = zext <32 x i1> undef to <32 x i32> %v32i1_v32i64 = zext 
<32 x i1> undef to <32 x i64> - %vp_v32i8_v32i16 = call <32 x i16> @llvm.vp.zext.v32i8.v32i16(<32 x i8> undef, <32 x i1> undef, i32 undef) - %vp_v32i8_v32i32 = call <32 x i32> @llvm.vp.zext.v32i8.v32i32(<32 x i8> undef, <32 x i1> undef, i32 undef) - %vp_v32i8_v32i64 = call <32 x i64> @llvm.vp.zext.v32i8.v32i64(<32 x i8> undef, <32 x i1> undef, i32 undef) - %vp_v32i16_v32i32 = call <32 x i32> @llvm.vp.zext.v32i16.v32i32(<32 x i16> undef, <32 x i1> undef, i32 undef) - %vp_v32i16_v32i64 = call <32 x i64> @llvm.vp.zext.v32i16.v32i64(<32 x i16> undef, <32 x i1> undef, i32 undef) - %vp_v32i32_v32i64 = call <32 x i64> @llvm.vp.zext.v32i32.v32i64(<32 x i32> undef, <32 x i1> undef, i32 undef) - %vp_v32i1_v32i8 = call <32 x i8> @llvm.vp.zext.v32i1.v32i8(<32 x i1> undef, <32 x i1> undef, i32 undef) - %vp_v32i1_v32i16 = call <32 x i16> @llvm.vp.zext.v32i1.v32i16(<32 x i1> undef, <32 x i1> undef, i32 undef) - %vp_v32i1_v32i32 = call <32 x i32> @llvm.vp.zext.v32i1.v32i32(<32 x i1> undef, <32 x i1> undef, i32 undef) - %vp_v32i1_v32i64 = call <32 x i64> @llvm.vp.zext.v32i1.v32i64(<32 x i1> undef, <32 x i1> undef, i32 undef) + %vp_v32i8_v32i16 = call <32 x i16> @llvm.vp.zext.v32i16.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef) + %vp_v32i8_v32i32 = call <32 x i32> @llvm.vp.zext.v32i32.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef) + %vp_v32i8_v32i64 = call <32 x i64> @llvm.vp.zext.v32i64.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef) + %vp_v32i16_v32i32 = call <32 x i32> @llvm.vp.zext.v32i32.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef) + %vp_v32i16_v32i64 = call <32 x i64> @llvm.vp.zext.v32i64.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef) + %vp_v32i32_v32i64 = call <32 x i64> @llvm.vp.zext.v32i64.v32i32(<32 x i32> undef, <32 x i1> undef, i32 undef) + %vp_v32i1_v32i8 = call <32 x i8> @llvm.vp.zext.v32i8.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef) + %vp_v32i1_v32i16 = call <32 x i16> @llvm.vp.zext.v32i16.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef) + %vp_v32i1_v32i32 = call <32 x i32> @llvm.vp.zext.v32i32.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef) + %vp_v32i1_v32i64 = call <32 x i64> @llvm.vp.zext.v32i64.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef) %v64i8_v64i16 = zext <64 x i8> undef to <64 x i16> %v64i8_v64i32 = zext <64 x i8> undef to <64 x i32> @@ -1772,16 +1772,16 @@ define void @zext() { %v64i1_v64i32 = zext <64 x i1> undef to <64 x i32> %v64i1_v64i64 = zext <64 x i1> undef to <64 x i64> - %vp_v64i8_v64i16 = call <64 x i16> @llvm.vp.zext.v64i8.v64i16(<64 x i8> undef, <64 x i1> undef, i32 undef) - %vp_v64i8_v64i32 = call <64 x i32> @llvm.vp.zext.v64i8.v64i32(<64 x i8> undef, <64 x i1> undef, i32 undef) - %vp_v64i8_v64i64 = call <64 x i64> @llvm.vp.zext.v64i8.v64i64(<64 x i8> undef, <64 x i1> undef, i32 undef) - %vp_v64i16_v64i32 = call <64 x i32> @llvm.vp.zext.v64i16.v64i32(<64 x i16> undef, <64 x i1> undef, i32 undef) - %vp_v64i16_v64i64 = call <64 x i64> @llvm.vp.zext.v64i16.v64i64(<64 x i16> undef, <64 x i1> undef, i32 undef) - %vp_v64i32_v64i64 = call <64 x i64> @llvm.vp.zext.v64i32.v64i64(<64 x i32> undef, <64 x i1> undef, i32 undef) - %vp_v64i1_v64i8 = call <64 x i8> @llvm.vp.zext.v64i1.v64i8(<64 x i1> undef, <64 x i1> undef, i32 undef) - %vp_v64i1_v64i16 = call <64 x i16> @llvm.vp.zext.v64i1.v64i16(<64 x i1> undef, <64 x i1> undef, i32 undef) - %vp_v64i1_v64i32 = call <64 x i32> @llvm.vp.zext.v64i1.v64i32(<64 x i1> undef, <64 x i1> undef, i32 undef) - %vp_v64i1_v64i64 = call <64 x i64> @llvm.vp.zext.v64i1.v64i64(<64 x i1> undef, <64 x i1> 
undef, i32 undef) + %vp_v64i8_v64i16 = call <64 x i16> @llvm.vp.zext.v64i16.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef) + %vp_v64i8_v64i32 = call <64 x i32> @llvm.vp.zext.v64i32.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef) + %vp_v64i8_v64i64 = call <64 x i64> @llvm.vp.zext.v64i64.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef) + %vp_v64i16_v64i32 = call <64 x i32> @llvm.vp.zext.v64i32.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef) + %vp_v64i16_v64i64 = call <64 x i64> @llvm.vp.zext.v64i64.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef) + %vp_v64i32_v64i64 = call <64 x i64> @llvm.vp.zext.v64i64.v64i32(<64 x i32> undef, <64 x i1> undef, i32 undef) + %vp_v64i1_v64i8 = call <64 x i8> @llvm.vp.zext.v64i8.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef) + %vp_v64i1_v64i16 = call <64 x i16> @llvm.vp.zext.v64i16.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef) + %vp_v64i1_v64i32 = call <64 x i32> @llvm.vp.zext.v64i32.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef) + %vp_v64i1_v64i64 = call <64 x i64> @llvm.vp.zext.v64i64.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef) %v128i8_v128i16 = zext <128 x i8> undef to <128 x i16> %v128i8_v128i32 = zext <128 x i8> undef to <128 x i32> @@ -1794,16 +1794,16 @@ define void @zext() { %v128i1_v128i32 = zext <128 x i1> undef to <128 x i32> %v128i1_v128i64 = zext <128 x i1> undef to <128 x i64> - %vp_v128i8_v128i16 = call <128 x i16> @llvm.vp.zext.v128i8.v128i16(<128 x i8> undef, <128 x i1> undef, i32 undef) - %vp_v128i8_v128i32 = call <128 x i32> @llvm.vp.zext.v128i8.v128i32(<128 x i8> undef, <128 x i1> undef, i32 undef) - %vp_v128i8_v128i64 = call <128 x i64> @llvm.vp.zext.v128i8.v128i64(<128 x i8> undef, <128 x i1> undef, i32 undef) - %vp_v128i16_v128i32 = call <128 x i32> @llvm.vp.zext.v128i16.v128i32(<128 x i16> undef, <128 x i1> undef, i32 undef) - %vp_v128i16_v128i64 = call <128 x i64> @llvm.vp.zext.v128i16.v128i64(<128 x i16> undef, <128 x i1> undef, i32 undef) - %vp_v128i32_v128i64 = call <128 x i64> @llvm.vp.zext.v128i32.v128i64(<128 x i32> undef, <128 x i1> undef, i32 undef) - %vp_v128i1_v128i8 = call <128 x i8> @llvm.vp.zext.v128i1.v128i8(<128 x i1> undef, <128 x i1> undef, i32 undef) - %vp_v128i1_v128i16 = call <128 x i16> @llvm.vp.zext.v128i1.v128i16(<128 x i1> undef, <128 x i1> undef, i32 undef) - %vp_v128i1_v128i32 = call <128 x i32> @llvm.vp.zext.v128i1.v128i32(<128 x i1> undef, <128 x i1> undef, i32 undef) - %vp_v128i1_v128i64 = call <128 x i64> @llvm.vp.zext.v128i1.v128i64(<128 x i1> undef, <128 x i1> undef, i32 undef) + %vp_v128i8_v128i16 = call <128 x i16> @llvm.vp.zext.v128i16.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef) + %vp_v128i8_v128i32 = call <128 x i32> @llvm.vp.zext.v128i32.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef) + %vp_v128i8_v128i64 = call <128 x i64> @llvm.vp.zext.v128i64.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef) + %vp_v128i16_v128i32 = call <128 x i32> @llvm.vp.zext.v128i32.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef) + %vp_v128i16_v128i64 = call <128 x i64> @llvm.vp.zext.v128i64.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef) + %vp_v128i32_v128i64 = call <128 x i64> @llvm.vp.zext.v128i64.v128i32(<128 x i32> undef, <128 x i1> undef, i32 undef) + %vp_v128i1_v128i8 = call <128 x i8> @llvm.vp.zext.v128i8.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef) + %vp_v128i1_v128i16 = call <128 x i16> @llvm.vp.zext.v128i16.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef) + %vp_v128i1_v128i32 = call <128 x i32> @llvm.vp.zext.v128i32.v128i1(<128 
x i1> undef, <128 x i1> undef, i32 undef)
+ %vp_v128i1_v128i64 = call <128 x i64> @llvm.vp.zext.v128i64.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef)
 %v256i8_v256i16 = zext <256 x i8> undef to <256 x i16>
 %v256i8_v256i32 = zext <256 x i8> undef to <256 x i32>
@@ -1816,16 +1816,16 @@ define void @zext() {
 %v256i1_v256i32 = zext <256 x i1> undef to <256 x i32>
 %v256i1_v256i64 = zext <256 x i1> undef to <256 x i64>
- %vp_v256i8_v256i16 = call <256 x i16> @llvm.vp.zext.v256i8.v256i16(<256 x i8> undef, <256 x i1> undef, i32 undef)
- %vp_v256i8_v256i32 = call <256 x i32> @llvm.vp.zext.v256i8.v256i32(<256 x i8> undef, <256 x i1> undef, i32 undef)
- %vp_v256i8_v256i64 = call <256 x i64> @llvm.vp.zext.v256i8.v256i64(<256 x i8> undef, <256 x i1> undef, i32 undef)
- %vp_v256i16_v256i32 = call <256 x i32> @llvm.vp.zext.v256i16.v256i32(<256 x i16> undef, <256 x i1> undef, i32 undef)
- %vp_v256i16_v256i64 = call <256 x i64> @llvm.vp.zext.v256i16.v256i64(<256 x i16> undef, <256 x i1> undef, i32 undef)
- %vp_v256i32_v256i64 = call <256 x i64> @llvm.vp.zext.v256i32.v256i64(<256 x i32> undef, <256 x i1> undef, i32 undef)
- %vp_v256i1_v256i8 = call <256 x i8> @llvm.vp.zext.v256i1.v256i8(<256 x i1> undef, <256 x i1> undef, i32 undef)
- %vp_v256i1_v256i16 = call <256 x i16> @llvm.vp.zext.v256i1.v256i16(<256 x i1> undef, <256 x i1> undef, i32 undef)
- %vp_v256i1_v256i32 = call <256 x i32> @llvm.vp.zext.v256i1.v256i32(<256 x i1> undef, <256 x i1> undef, i32 undef)
- %vp_v256i1_v256i64 = call <256 x i64> @llvm.vp.zext.v256i1.v256i64(<256 x i1> undef, <256 x i1> undef, i32 undef)
+ %vp_v256i8_v256i16 = call <256 x i16> @llvm.vp.zext.v256i16.v256i8(<256 x i8> undef, <256 x i1> undef, i32 undef)
+ %vp_v256i8_v256i32 = call <256 x i32> @llvm.vp.zext.v256i32.v256i8(<256 x i8> undef, <256 x i1> undef, i32 undef)
+ %vp_v256i8_v256i64 = call <256 x i64> @llvm.vp.zext.v256i64.v256i8(<256 x i8> undef, <256 x i1> undef, i32 undef)
+ %vp_v256i16_v256i32 = call <256 x i32> @llvm.vp.zext.v256i32.v256i16(<256 x i16> undef, <256 x i1> undef, i32 undef)
+ %vp_v256i16_v256i64 = call <256 x i64> @llvm.vp.zext.v256i64.v256i16(<256 x i16> undef, <256 x i1> undef, i32 undef)
+ %vp_v256i32_v256i64 = call <256 x i64> @llvm.vp.zext.v256i64.v256i32(<256 x i32> undef, <256 x i1> undef, i32 undef)
+ %vp_v256i1_v256i8 = call <256 x i8> @llvm.vp.zext.v256i8.v256i1(<256 x i1> undef, <256 x i1> undef, i32 undef)
+ %vp_v256i1_v256i16 = call <256 x i16> @llvm.vp.zext.v256i16.v256i1(<256 x i1> undef, <256 x i1> undef, i32 undef)
+ %vp_v256i1_v256i32 = call <256 x i32> @llvm.vp.zext.v256i32.v256i1(<256 x i1> undef, <256 x i1> undef, i32 undef)
+ %vp_v256i1_v256i64 = call <256 x i64> @llvm.vp.zext.v256i64.v256i1(<256 x i1> undef, <256 x i1> undef, i32 undef)
 %nxv1i8_nxv1i16 = zext <vscale x 1 x i8> undef to <vscale x 1 x i16>
 %nxv1i8_nxv1i32 = zext <vscale x 1 x i8> undef to <vscale x 1 x i32>
@@ -1838,16 +1838,16 @@ define void @zext() {
 %nxv1i1_nxv1i32 = zext <vscale x 1 x i1> undef to <vscale x 1 x i32>
 %nxv1i1_nxv1i64 = zext <vscale x 1 x i1> undef to <vscale x 1 x i64>
- %vp_nxv1i8_nxv1i16 = call <vscale x 1 x i16> @llvm.vp.zext.nxv1i8.nxv1i16(<vscale x 1 x i8> undef, <vscale x 1 x i1> undef, i32 undef)
- %vp_nxv1i8_nxv1i32 = call <vscale x 1 x i32> @llvm.vp.zext.nxv1i8.nxv1i32(<vscale x 1 x i8> undef, <vscale x 1 x i1> undef, i32 undef)
- %vp_nxv1i8_nxv1i64 = call <vscale x 1 x i64> @llvm.vp.zext.nxv1i8.nxv1i64(<vscale x 1 x i8> undef, <vscale x 1 x i1> undef, i32 undef)
- %vp_nxv1i16_nxv1i32 = call <vscale x 1 x i32> @llvm.vp.zext.nxv1i16.nxv1i32(<vscale x 1 x i16> undef, <vscale x 1 x i1> undef, i32 undef)
- %vp_nxv1i16_nxv1i64 = call <vscale x 1 x i64> @llvm.vp.zext.nxv1i16.nxv1i64(<vscale x 1 x i16> undef, <vscale x 1 x i1> undef, i32 undef)
- %vp_nxv1i32_nxv1i64 = call <vscale x 1 x i64> @llvm.vp.zext.nxv1i32.nxv1i64(<vscale x 1 x i32> undef, <vscale x 1 x i1> undef, i32 undef)
- %vp_nxv1i1_nxv1i8 = call <vscale x 1 x i8> @llvm.vp.zext.nxv1i1.nxv1i8(<vscale x 1 x i1> undef, <vscale x 1 x i1> undef, i32 undef)
- %vp_nxv1i1_nxv1i16 = call <vscale x 1 x i16> @llvm.vp.zext.nxv1i1.nxv1i16(<vscale x 1 x i1> undef, <vscale x 1 x i1> undef, i32 undef)
- %vp_nxv1i1_nxv1i32 = call <vscale x 1 x i32> @llvm.vp.zext.nxv1i1.nxv1i32(<vscale x 1 x i1> undef, <vscale x 1 x i1> undef, i32 undef)
- %vp_nxv1i1_nxv1i64 = call <vscale x 1 x i64> @llvm.vp.zext.nxv1i1.nxv1i64(<vscale x 1 x i1> undef, <vscale x 1 x i1> undef, i32 undef)
+ %vp_nxv1i8_nxv1i16 = call <vscale x 1 x i16> @llvm.vp.zext.nxv1i16.nxv1i8(<vscale x 1 x i8> undef, <vscale x 1 x i1> undef, i32 undef)
+ %vp_nxv1i8_nxv1i32 = call <vscale x 1 x i32> @llvm.vp.zext.nxv1i32.nxv1i8(<vscale x 1 x i8> undef, <vscale x 1 x i1> undef, i32 undef)
+ %vp_nxv1i8_nxv1i64 = call <vscale x 1 x i64> @llvm.vp.zext.nxv1i64.nxv1i8(<vscale x 1 x i8> undef, <vscale x 1 x i1> undef, i32 undef)
+ %vp_nxv1i16_nxv1i32 = call <vscale x 1 x i32> @llvm.vp.zext.nxv1i32.nxv1i16(<vscale x 1 x i16> undef, <vscale x 1 x i1> undef, i32 undef)
+ %vp_nxv1i16_nxv1i64 = call <vscale x 1 x i64> @llvm.vp.zext.nxv1i64.nxv1i16(<vscale x 1 x i16> undef, <vscale x 1 x i1> undef, i32 undef)
+ %vp_nxv1i32_nxv1i64 = call <vscale x 1 x i64> @llvm.vp.zext.nxv1i64.nxv1i32(<vscale x 1 x i32> undef, <vscale x 1 x i1> undef, i32 undef)
+ %vp_nxv1i1_nxv1i8 = call <vscale x 1 x i8> @llvm.vp.zext.nxv1i8.nxv1i1(<vscale x 1 x i1> undef, <vscale x 1 x i1> undef, i32 undef)
+ %vp_nxv1i1_nxv1i16 = call <vscale x 1 x i16> @llvm.vp.zext.nxv1i16.nxv1i1(<vscale x 1 x i1> undef, <vscale x 1 x i1> undef, i32 undef)
+ %vp_nxv1i1_nxv1i32 = call <vscale x 1 x i32> @llvm.vp.zext.nxv1i32.nxv1i1(<vscale x 1 x i1> undef, <vscale x 1 x i1> undef, i32 undef)
+ %vp_nxv1i1_nxv1i64 = call <vscale x 1 x i64> @llvm.vp.zext.nxv1i64.nxv1i1(<vscale x 1 x i1> undef, <vscale x 1 x i1> undef, i32 undef)
 %nxv2i8_nxv2i16 = zext <vscale x 2 x i8> undef to <vscale x 2 x i16>
 %nxv2i8_nxv2i32 = zext <vscale x 2 x i8> undef to <vscale x 2 x i32>
@@ -1860,16 +1860,16 @@ define void @zext() {
 %nxv2i1_nxv2i32 = zext <vscale x 2 x i1> undef to <vscale x 2 x i32>
 %nxv2i1_nxv2i64 = zext <vscale x 2 x i1> undef to <vscale x 2 x i64>
- %vp_nxv2i8_nxv2i16 = call <vscale x 2 x i16> @llvm.vp.zext.nxv2i8.nxv2i16(<vscale x 2 x i8> undef, <vscale x 2 x i1> undef, i32 undef)
- %vp_nxv2i8_nxv2i32 = call <vscale x 2 x i32> @llvm.vp.zext.nxv2i8.nxv2i32(<vscale x 2 x i8> undef, <vscale x 2 x i1> undef, i32 undef)
- %vp_nxv2i8_nxv2i64 = call <vscale x 2 x i64> @llvm.vp.zext.nxv2i8.nxv2i64(<vscale x 2 x i8> undef, <vscale x 2 x i1> undef, i32 undef)
- %vp_nxv2i16_nxv2i32 = call <vscale x 2 x i32> @llvm.vp.zext.nxv2i16.nxv2i32(<vscale x 2 x i16> undef, <vscale x 2 x i1> undef, i32 undef)
- %vp_nxv2i16_nxv2i64 = call <vscale x 2 x i64> @llvm.vp.zext.nxv2i16.nxv2i64(<vscale x 2 x i16> undef, <vscale x 2 x i1> undef, i32 undef)
- %vp_nxv2i32_nxv2i64 = call <vscale x 2 x i64> @llvm.vp.zext.nxv2i32.nxv2i64(<vscale x 2 x i32> undef, <vscale x 2 x i1> undef, i32 undef)
- %vp_nxv2i1_nxv2i8 = call <vscale x 2 x i8> @llvm.vp.zext.nxv2i1.nxv2i8(<vscale x 2 x i1> undef, <vscale x 2 x i1> undef, i32 undef)
- %vp_nxv2i1_nxv2i16 = call <vscale x 2 x i16> @llvm.vp.zext.nxv2i1.nxv2i16(<vscale x 2 x i1> undef, <vscale x 2 x i1> undef, i32 undef)
- %vp_nxv2i1_nxv2i32 = call <vscale x 2 x i32> @llvm.vp.zext.nxv2i1.nxv2i32(<vscale x 2 x i1> undef, <vscale x 2 x i1> undef, i32 undef)
- %vp_nxv2i1_nxv2i64 = call <vscale x 2 x i64> @llvm.vp.zext.nxv2i1.nxv2i64(<vscale x 2 x i1> undef, <vscale x 2 x i1> undef, i32 undef)
+ %vp_nxv2i8_nxv2i16 = call <vscale x 2 x i16> @llvm.vp.zext.nxv2i16.nxv2i8(<vscale x 2 x i8> undef, <vscale x 2 x i1> undef, i32 undef)
+ %vp_nxv2i8_nxv2i32 = call <vscale x 2 x i32> @llvm.vp.zext.nxv2i32.nxv2i8(<vscale x 2 x i8> undef, <vscale x 2 x i1> undef, i32 undef)
+ %vp_nxv2i8_nxv2i64 = call <vscale x 2 x i64> @llvm.vp.zext.nxv2i64.nxv2i8(<vscale x 2 x i8> undef, <vscale x 2 x i1> undef, i32 undef)
+ %vp_nxv2i16_nxv2i32 = call <vscale x 2 x i32> @llvm.vp.zext.nxv2i32.nxv2i16(<vscale x 2 x i16> undef, <vscale x 2 x i1> undef, i32 undef)
+ %vp_nxv2i16_nxv2i64 = call <vscale x 2 x i64> @llvm.vp.zext.nxv2i64.nxv2i16(<vscale x 2 x i16> undef, <vscale x 2 x i1> undef, i32 undef)
+ %vp_nxv2i32_nxv2i64 = call <vscale x 2 x i64> @llvm.vp.zext.nxv2i64.nxv2i32(<vscale x 2 x i32> undef, <vscale x 2 x i1> undef, i32 undef)
+ %vp_nxv2i1_nxv2i8 = call <vscale x 2 x i8> @llvm.vp.zext.nxv2i8.nxv2i1(<vscale x 2 x i1> undef, <vscale x 2 x i1> undef, i32 undef)
+ %vp_nxv2i1_nxv2i16 = call <vscale x 2 x i16> @llvm.vp.zext.nxv2i16.nxv2i1(<vscale x 2 x i1> undef, <vscale x 2 x i1> undef, i32 undef)
+ %vp_nxv2i1_nxv2i32 = call <vscale x 2 x i32> @llvm.vp.zext.nxv2i32.nxv2i1(<vscale x 2 x i1> undef, <vscale x 2 x i1> undef, i32 undef)
+ %vp_nxv2i1_nxv2i64 = call <vscale x 2 x i64> @llvm.vp.zext.nxv2i64.nxv2i1(<vscale x 2 x i1> undef, <vscale x 2 x i1> undef, i32 undef)
 %nxv4i8_nxv4i16 = zext <vscale x 4 x i8> undef to <vscale x 4 x i16>
 %nxv4i8_nxv4i32 = zext <vscale x 4 x i8> undef to <vscale x 4 x i32>
@@ -1882,16 +1882,16 @@ define void @zext() {
 %nxv4i1_nxv4i32 = zext <vscale x 4 x i1> undef to <vscale x 4 x i32>
 %nxv4i1_nxv4i64 = zext <vscale x 4 x i1> undef to <vscale x 4 x i64>
- %vp_nxv4i8_nxv4i16 = call <vscale x 4 x i16> @llvm.vp.zext.nxv4i8.nxv4i16(<vscale x 4 x i8> undef, <vscale x 4 x i1> undef, i32 undef)
- %vp_nxv4i8_nxv4i32 = call <vscale x 4 x i32> @llvm.vp.zext.nxv4i8.nxv4i32(<vscale x 4 x i8> undef, <vscale x 4 x i1> undef, i32 undef)
- %vp_nxv4i8_nxv4i64 = call <vscale x 4 x i64> @llvm.vp.zext.nxv4i8.nxv4i64(<vscale x 4 x i8> undef, <vscale x 4 x i1> undef, i32 undef)
- %vp_nxv4i16_nxv4i32 = call <vscale x 4 x i32> @llvm.vp.zext.nxv4i16.nxv4i32(<vscale x 4 x i16> undef, <vscale x 4 x i1> undef, i32 undef)
- %vp_nxv4i16_nxv4i64 = call <vscale x 4 x i64> @llvm.vp.zext.nxv4i16.nxv4i64(<vscale x 4 x i16> undef, <vscale x 4 x i1> undef, i32 undef)
- %vp_nxv4i32_nxv4i64 = call <vscale x 4 x i64> @llvm.vp.zext.nxv4i32.nxv4i64(<vscale x 4 x i32> undef, <vscale x 4 x i1> undef, i32 undef)
- %vp_nxv4i1_nxv4i8 = call <vscale x 4 x i8> @llvm.vp.zext.nxv4i1.nxv4i8(<vscale x 4 x i1> undef, <vscale x 4 x i1> undef, i32 undef)
- %vp_nxv4i1_nxv4i16 = call <vscale x 4 x i16> @llvm.vp.zext.nxv4i1.nxv4i16(<vscale x 4 x i1> undef, <vscale x 4 x i1> undef, i32 undef)
- %vp_nxv4i1_nxv4i32 = call <vscale x 4 x i32> @llvm.vp.zext.nxv4i1.nxv4i32(<vscale x 4 x i1> undef, <vscale x 4 x i1> undef, i32 undef)
- %vp_nxv4i1_nxv4i64 = call <vscale x 4 x i64> @llvm.vp.zext.nxv4i1.nxv4i64(<vscale x 4 x i1> undef, <vscale x 4 x i1> undef, i32 undef)
+ %vp_nxv4i8_nxv4i16 = call <vscale x 4 x i16> @llvm.vp.zext.nxv4i16.nxv4i8(<vscale x 4 x i8> undef, <vscale x 4 x i1> undef, i32 undef)
+ %vp_nxv4i8_nxv4i32 = call <vscale x 4 x i32> @llvm.vp.zext.nxv4i32.nxv4i8(<vscale x 4 x i8> undef, <vscale x 4 x i1> undef, i32 undef)
+ %vp_nxv4i8_nxv4i64 = call <vscale x 4 x i64> @llvm.vp.zext.nxv4i64.nxv4i8(<vscale x 4 x i8> undef, <vscale x 4 x i1> undef, i32 undef)
+ %vp_nxv4i16_nxv4i32 = call <vscale x 4 x i32> @llvm.vp.zext.nxv4i32.nxv4i16(<vscale x 4 x i16> undef, <vscale x 4 x i1> undef, i32 undef)
+ %vp_nxv4i16_nxv4i64 = call <vscale x 4 x i64> @llvm.vp.zext.nxv4i64.nxv4i16(<vscale x 4 x i16> undef, <vscale x 4 x i1> undef, i32 undef)
+ %vp_nxv4i32_nxv4i64 = call <vscale x 4 x i64> @llvm.vp.zext.nxv4i64.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i1> undef, i32 undef)
+ %vp_nxv4i1_nxv4i8 = call <vscale x 4 x i8> @llvm.vp.zext.nxv4i8.nxv4i1(<vscale x 4 x i1> undef, <vscale x 4 x i1> undef, i32 undef)
+ %vp_nxv4i1_nxv4i16 = call <vscale x 4 x i16> @llvm.vp.zext.nxv4i16.nxv4i1(<vscale x 4 x i1> undef, <vscale x 4 x i1> undef, i32 undef)
+ %vp_nxv4i1_nxv4i32 = call <vscale x 4 x i32> @llvm.vp.zext.nxv4i32.nxv4i1(<vscale x 4 x i1> undef, <vscale x 4 x i1> undef, i32 undef)
+ %vp_nxv4i1_nxv4i64 = call <vscale x 4 x i64> @llvm.vp.zext.nxv4i64.nxv4i1(<vscale x 4 x i1> undef, <vscale x 4 x i1> undef, i32 undef)
 %nxv8i8_nxv8i16 = zext <vscale x 8 x i8> undef to <vscale x 8 x i16>
 %nxv8i8_nxv8i32 = zext <vscale x 8 x i8> undef to <vscale x 8 x i32>
@@ -1904,16 +1904,16 @@ define void @zext() {
 %nxv8i1_nxv8i32 = zext <vscale x 8 x i1> undef to <vscale x 8 x i32>
 %nxv8i1_nxv8i64 = zext <vscale x 8 x i1> undef to <vscale x 8 x i64>
- %vp_nxv8i8_nxv8i16 = call <vscale x 8 x i16> @llvm.vp.zext.nxv8i8.nxv8i16(<vscale x 8 x i8> undef, <vscale x 8 x i1> undef, i32 undef)
- %vp_nxv8i8_nxv8i32 = call <vscale x 8 x i32> @llvm.vp.zext.nxv8i8.nxv8i32(<vscale x 8 x i8> undef, <vscale x 8 x i1> undef, i32 undef)
- %vp_nxv8i8_nxv8i64 = call <vscale x 8 x i64> @llvm.vp.zext.nxv8i8.nxv8i64(<vscale x 8 x i8> undef, <vscale x 8 x i1> undef, i32 undef)
- %vp_nxv8i16_nxv8i32 = call <vscale x 8 x i32> @llvm.vp.zext.nxv8i16.nxv8i32(<vscale x 8 x i16> undef, <vscale x 8 x i1> undef, i32 undef)
- %vp_nxv8i16_nxv8i64 = call <vscale x 8 x i64> @llvm.vp.zext.nxv8i16.nxv8i64(<vscale x 8 x i16> undef, <vscale x 8 x i1> undef, i32 undef)
- %vp_nxv8i32_nxv8i64 = call <vscale x 8 x i64> @llvm.vp.zext.nxv8i32.nxv8i64(<vscale x 8 x i32> undef, <vscale x 8 x i1> undef, i32 undef)
- %vp_nxv8i1_nxv8i8 = call <vscale x 8 x i8> @llvm.vp.zext.nxv8i1.nxv8i8(<vscale x 8 x i1> undef, <vscale x 8 x i1> undef, i32 undef)
- %vp_nxv8i1_nxv8i16 = call <vscale x 8 x i16> @llvm.vp.zext.nxv8i1.nxv8i16(<vscale x 8 x i1> undef, <vscale x 8 x i1> undef, i32 undef)
- %vp_nxv8i1_nxv8i32 = call <vscale x 8 x i32> @llvm.vp.zext.nxv8i1.nxv8i32(<vscale x 8 x i1> undef, <vscale x 8 x i1> undef, i32 undef)
- %vp_nxv8i1_nxv8i64 = call <vscale x 8 x i64> @llvm.vp.zext.nxv8i1.nxv8i64(<vscale x 8 x i1> undef, <vscale x 8 x i1> undef, i32 undef)
+ %vp_nxv8i8_nxv8i16 = call <vscale x 8 x i16> @llvm.vp.zext.nxv8i16.nxv8i8(<vscale x 8 x i8> undef, <vscale x 8 x i1> undef, i32 undef)
+ %vp_nxv8i8_nxv8i32 = call <vscale x 8 x i32> @llvm.vp.zext.nxv8i32.nxv8i8(<vscale x 8 x i8> undef, <vscale x 8 x i1> undef, i32 undef)
+ %vp_nxv8i8_nxv8i64 = call <vscale x 8 x i64> @llvm.vp.zext.nxv8i64.nxv8i8(<vscale x 8 x i8> undef, <vscale x 8 x i1> undef, i32 undef)
+ %vp_nxv8i16_nxv8i32 = call <vscale x 8 x i32> @llvm.vp.zext.nxv8i32.nxv8i16(<vscale x 8 x i16> undef, <vscale x 8 x i1> undef, i32 undef)
+ %vp_nxv8i16_nxv8i64 = call <vscale x 8 x i64> @llvm.vp.zext.nxv8i64.nxv8i16(<vscale x 8 x i16> undef, <vscale x 8 x i1> undef, i32 undef)
+ %vp_nxv8i32_nxv8i64 = call <vscale x 8 x i64> @llvm.vp.zext.nxv8i64.nxv8i32(<vscale x 8 x i32> undef, <vscale x 8 x i1> undef, i32 undef)
+ %vp_nxv8i1_nxv8i8 = call <vscale x 8 x i8> @llvm.vp.zext.nxv8i8.nxv8i1(<vscale x 8 x i1> undef, <vscale x 8 x i1> undef, i32 undef)
+ %vp_nxv8i1_nxv8i16 = call <vscale x 8 x i16> @llvm.vp.zext.nxv8i16.nxv8i1(<vscale x 8 x i1> undef, <vscale x 8 x i1> undef, i32 undef)
+ %vp_nxv8i1_nxv8i32 = call <vscale x 8 x i32> @llvm.vp.zext.nxv8i32.nxv8i1(<vscale x 8 x i1> undef, <vscale x 8 x i1> undef, i32 undef)
+ %vp_nxv8i1_nxv8i64 = call <vscale x 8 x i64> @llvm.vp.zext.nxv8i64.nxv8i1(<vscale x 8 x i1> undef, <vscale x 8 x i1> undef, i32 undef)
 %nxv16i8_nxv16i16 = zext <vscale x 16 x i8> undef to <vscale x 16 x i16>
 %nxv16i8_nxv16i32 = zext <vscale x 16 x i8> undef to <vscale x 16 x i32>
@@ -1926,16 +1926,16 @@ define void @zext() {
 %nxv16i1_nxv16i32 = zext <vscale x 16 x i1> undef to <vscale x 16 x i32>
 %nxv16i1_nxv16i64 = zext <vscale x 16 x i1> undef to <vscale x 16 x i64>
- %vp_nxv16i8_nxv16i16 = call <vscale x 16 x i16> @llvm.vp.zext.nxv16i8.nxv16i16(<vscale x 16 x i8> undef, <vscale x 16 x i1> undef, i32 undef)
- %vp_nxv16i8_nxv16i32 = call <vscale x 16 x i32> @llvm.vp.zext.nxv16i8.nxv16i32(<vscale x 16 x i8> undef, <vscale x 16 x i1> undef, i32 undef)
- %vp_nxv16i8_nxv16i64 = call <vscale x 16 x i64> @llvm.vp.zext.nxv16i8.nxv16i64(<vscale x 16 x i8> undef, <vscale x 16 x i1> undef, i32 undef)
- %vp_nxv16i16_nxv16i32 = call <vscale x 16 x i32> @llvm.vp.zext.nxv16i16.nxv16i32(<vscale x 16 x i16> undef, <vscale x 16 x i1> undef, i32 undef)
- %vp_nxv16i16_nxv16i64 = call <vscale x 16 x i64> @llvm.vp.zext.nxv16i16.nxv16i64(<vscale x 16 x i16> undef, <vscale x 16 x i1> undef, i32 undef)
- %vp_nxv16i32_nxv16i64 = call <vscale x 16 x i64> @llvm.vp.zext.nxv16i32.nxv16i64(<vscale x 16 x i32> undef, <vscale x 16 x i1> undef, i32 undef)
- %vp_nxv16i1_nxv16i8 = call <vscale x 16 x i8> @llvm.vp.zext.nxv16i1.nxv16i8(<vscale x 16 x i1> undef, <vscale x 16 x i1> undef, i32 undef)
- %vp_nxv16i1_nxv16i16 = call <vscale x 16 x i16> @llvm.vp.zext.nxv16i1.nxv16i16(<vscale x 16 x i1> undef, <vscale x 16 x i1> undef, i32 undef)
- %vp_nxv16i1_nxv16i32 = call <vscale x 16 x i32> @llvm.vp.zext.nxv16i1.nxv16i32(<vscale x 16 x i1> undef, <vscale x 16 x i1> undef, i32 undef)
- %vp_nxv16i1_nxv16i64 = call <vscale x 16 x i64> @llvm.vp.zext.nxv16i1.nxv16i64(<vscale x 16 x i1> undef, <vscale x 16 x i1> undef, i32 undef)
+ %vp_nxv16i8_nxv16i16 = call <vscale x 16 x i16> @llvm.vp.zext.nxv16i16.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i1> undef, i32 undef)
+ %vp_nxv16i8_nxv16i32 = call <vscale x 16 x i32> @llvm.vp.zext.nxv16i32.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i1> undef, i32 undef)
+ %vp_nxv16i8_nxv16i64 = call <vscale x 16 x i64> @llvm.vp.zext.nxv16i64.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i1> undef, i32 undef)
+ %vp_nxv16i16_nxv16i32 = call <vscale x 16 x i32> @llvm.vp.zext.nxv16i32.nxv16i16(<vscale x 16 x i16> undef, <vscale x 16 x i1> undef, i32 undef)
+ %vp_nxv16i16_nxv16i64 = call <vscale x 16 x i64> @llvm.vp.zext.nxv16i64.nxv16i16(<vscale x 16 x i16> undef, <vscale x 16 x i1> undef, i32 undef)
+ %vp_nxv16i32_nxv16i64 = call <vscale x 16 x i64> @llvm.vp.zext.nxv16i64.nxv16i32(<vscale x 16 x i32> undef, <vscale x 16 x i1> undef, i32 undef)
+ %vp_nxv16i1_nxv16i8 = call <vscale x 16 x i8> @llvm.vp.zext.nxv16i8.nxv16i1(<vscale x 16 x i1> undef, <vscale x 16 x i1> undef, i32 undef)
+ %vp_nxv16i1_nxv16i16 = call <vscale x 16 x i16> @llvm.vp.zext.nxv16i16.nxv16i1(<vscale x 16 x i1> undef, <vscale x 16 x i1> undef, i32 undef)
+ %vp_nxv16i1_nxv16i32 = call <vscale x 16 x i32> @llvm.vp.zext.nxv16i32.nxv16i1(<vscale x 16 x i1> undef, <vscale x 16 x i1> undef, i32 undef)
+ %vp_nxv16i1_nxv16i64 = call <vscale x 16 x i64> @llvm.vp.zext.nxv16i64.nxv16i1(<vscale x 16 x i1> undef, <vscale x 16 x i1> undef, i32 undef)
 %nxv32i8_nxv32i16 = zext <vscale x 32 x i8> undef to <vscale x 32 x i16>
 %nxv32i8_nxv32i32 = zext <vscale x 32 x i8> undef to <vscale x 32 x i32>
@@ -1948,16 +1948,16 @@ define void @zext() {
 %nxv32i1_nxv32i32 = zext <vscale x 32 x i1> undef to <vscale x 32 x i32>
 %nxv32i1_nxv32i64 = zext <vscale x 32 x i1> undef to <vscale x 32 x i64>
- %vp_nxv32i8_nxv32i16 = call <vscale x 32 x i16> @llvm.vp.zext.nxv32i8.nxv32i16(<vscale x 32 x i8> undef, <vscale x 32 x i1> undef, i32 undef)
- %vp_nxv32i8_nxv32i32 = call <vscale x 32 x i32> @llvm.vp.zext.nxv32i8.nxv32i32(<vscale x 32 x i8> undef, <vscale x 32 x i1> undef, i32 undef)
- %vp_nxv32i8_nxv32i64 = call <vscale x 32 x i64> @llvm.vp.zext.nxv32i8.nxv32i64(<vscale x 32 x i8> undef, <vscale x 32 x i1> undef, i32 undef)
- %vp_nxv32i16_nxv32i32 = call <vscale x 32 x i32> @llvm.vp.zext.nxv32i16.nxv32i32(<vscale x 32 x i16> undef, <vscale x 32 x i1> undef, i32 undef)
- %vp_nxv32i16_nxv32i64 = call <vscale x 32 x i64> @llvm.vp.zext.nxv32i16.nxv32i64(<vscale x 32 x i16> undef, <vscale x 32 x i1> undef, i32 undef)
- %vp_nxv32i32_nxv32i64 = call <vscale x 32 x i64> @llvm.vp.zext.nxv32i32.nxv32i64(<vscale x 32 x i32> undef, <vscale x 32 x i1> undef, i32 undef)
- %vp_nxv32i1_nxv32i8 = call <vscale x 32 x i8> @llvm.vp.zext.nxv32i1.nxv32i8(<vscale x 32 x i1> undef, <vscale x 32 x i1> undef, i32 undef)
- %vp_nxv32i1_nxv32i16 = call <vscale x 32 x i16> @llvm.vp.zext.nxv32i1.nxv32i16(<vscale x 32 x i1> undef, <vscale x 32 x i1> undef, i32 undef)
- %vp_nxv32i1_nxv32i32 = call <vscale x 32 x i32> @llvm.vp.zext.nxv32i1.nxv32i32(<vscale x 32 x i1> undef, <vscale x 32 x i1> undef, i32 undef)
- %vp_nxv32i1_nxv32i64 = call <vscale x 32 x i64> @llvm.vp.zext.nxv32i1.nxv32i64(<vscale x 32 x i1> undef, <vscale x 32 x i1> undef, i32 undef)
+ %vp_nxv32i8_nxv32i16 = call <vscale x 32 x i16> @llvm.vp.zext.nxv32i16.nxv32i8(<vscale x 32 x i8> undef, <vscale x 32 x i1> undef, i32 undef)
+ %vp_nxv32i8_nxv32i32 = call <vscale x 32 x i32> @llvm.vp.zext.nxv32i32.nxv32i8(<vscale x 32 x i8> undef, <vscale x 32 x i1> undef, i32 undef)
+ %vp_nxv32i8_nxv32i64 = call <vscale x 32 x i64> @llvm.vp.zext.nxv32i64.nxv32i8(<vscale x 32 x i8> undef, <vscale x 32 x i1> undef, i32 undef)
+ %vp_nxv32i16_nxv32i32 = call <vscale x 32 x i32> @llvm.vp.zext.nxv32i32.nxv32i16(<vscale x 32 x i16> undef, <vscale x 32 x i1> undef, i32 undef)
+ %vp_nxv32i16_nxv32i64 = call <vscale x 32 x i64> @llvm.vp.zext.nxv32i64.nxv32i16(<vscale x 32 x i16> undef, <vscale x 32 x i1> undef, i32 undef)
+ %vp_nxv32i32_nxv32i64 = call <vscale x 32 x i64> @llvm.vp.zext.nxv32i64.nxv32i32(<vscale x 32 x i32> undef, <vscale x 32 x i1> undef, i32 undef)
+ %vp_nxv32i1_nxv32i8 = call <vscale x 32 x i8> @llvm.vp.zext.nxv32i8.nxv32i1(<vscale x 32 x i1> undef, <vscale x 32 x i1> undef, i32 undef)
+ %vp_nxv32i1_nxv32i16 = call <vscale x 32 x i16> @llvm.vp.zext.nxv32i16.nxv32i1(<vscale x 32 x i1> undef, <vscale x 32 x i1> undef, i32 undef)
+ %vp_nxv32i1_nxv32i32 = call <vscale x 32 x i32> @llvm.vp.zext.nxv32i32.nxv32i1(<vscale x 32 x i1> undef, <vscale x 32 x i1> undef, i32 undef)
+ %vp_nxv32i1_nxv32i64 = call <vscale x 32 x i64> @llvm.vp.zext.nxv32i64.nxv32i1(<vscale x 32 x i1> undef, <vscale x 32 x i1> undef, i32 undef)
 %nxv64i8_nxv64i16 = zext <vscale x 64 x i8> undef to <vscale x 64 x i16>
 %nxv64i8_nxv64i32 = zext <vscale x 64 x i8> undef to <vscale x 64 x i32>
@@ -1970,16 +1970,16 @@ define void @zext() {
 %nxv64i1_nxv64i32 = zext <vscale x 64 x i1> undef to <vscale x 64 x i32>
 %nxv64i1_nxv64i64 = zext <vscale x 64 x i1> undef to <vscale x 64 x i64>
- %vp_nxv64i8_nxv64i16 = call
@llvm.vp.zext.nxv64i8.nxv64i16( undef, undef, i32 undef) - %vp_nxv64i8_nxv64i32 = call @llvm.vp.zext.nxv64i8.nxv64i32( undef, undef, i32 undef) - %vp_nxv64i8_nxv64i64 = call @llvm.vp.zext.nxv64i8.nxv64i64( undef, undef, i32 undef) - %vp_nxv64i16_nxv64i32 = call @llvm.vp.zext.nxv64i16.nxv64i32( undef, undef, i32 undef) - %vp_nxv64i16_nxv64i64 = call @llvm.vp.zext.nxv64i16.nxv64i64( undef, undef, i32 undef) - %vp_nxv64i32_nxv64i64 = call @llvm.vp.zext.nxv64i32.nxv64i64( undef, undef, i32 undef) - %vp_nxv64i1_nxv64i8 = call @llvm.vp.zext.nxv64i1.nxv64i8( undef, undef, i32 undef) - %vp_nxv64i1_nxv64i16 = call @llvm.vp.zext.nxv64i1.nxv64i16( undef, undef, i32 undef) - %vp_nxv64i1_nxv64i32 = call @llvm.vp.zext.nxv64i1.nxv64i32( undef, undef, i32 undef) - %vp_nxv64i1_nxv64i64 = call @llvm.vp.zext.nxv64i1.nxv64i64( undef, undef, i32 undef) + %vp_nxv64i8_nxv64i16 = call @llvm.vp.zext.nxv64i16.nxv64i8( undef, undef, i32 undef) + %vp_nxv64i8_nxv64i32 = call @llvm.vp.zext.nxv64i32.nxv64i8( undef, undef, i32 undef) + %vp_nxv64i8_nxv64i64 = call @llvm.vp.zext.nxv64i64.nxv64i8( undef, undef, i32 undef) + %vp_nxv64i16_nxv64i32 = call @llvm.vp.zext.nxv64i32.nxv64i16( undef, undef, i32 undef) + %vp_nxv64i16_nxv64i64 = call @llvm.vp.zext.nxv64i64.nxv64i16( undef, undef, i32 undef) + %vp_nxv64i32_nxv64i64 = call @llvm.vp.zext.nxv64i64.nxv64i32( undef, undef, i32 undef) + %vp_nxv64i1_nxv64i8 = call @llvm.vp.zext.nxv64i8.nxv64i1( undef, undef, i32 undef) + %vp_nxv64i1_nxv64i16 = call @llvm.vp.zext.nxv64i16.nxv64i1( undef, undef, i32 undef) + %vp_nxv64i1_nxv64i32 = call @llvm.vp.zext.nxv64i32.nxv64i1( undef, undef, i32 undef) + %vp_nxv64i1_nxv64i64 = call @llvm.vp.zext.nxv64i64.nxv64i1( undef, undef, i32 undef) %nxv128i8_nxv128i16 = zext undef to %nxv128i8_nxv128i32 = zext undef to @@ -1992,16 +1992,16 @@ define void @zext() { %nxv128i1_nxv128i32 = zext undef to %nxv128i1_nxv128i128 = zext undef to - %vp_nxv128i8_nxv128i16 = call @llvm.vp.zext.nxv128i8.nxv128i16( undef, undef, i32 undef) - %vp_nxv128i8_nxv128i32 = call @llvm.vp.zext.nxv128i8.nxv128i32( undef, undef, i32 undef) - %vp_nxv128i8_nxv128i128 = call @llvm.vp.zext.nxv128i8.nxv128i128( undef, undef, i32 undef) - %vp_nxv128i16_nxv128i32 = call @llvm.vp.zext.nxv128i16.nxv128i32( undef, undef, i32 undef) - %vp_nxv128i16_nxv128i128 = call @llvm.vp.zext.nxv128i16.nxv128i128( undef, undef, i32 undef) - %vp_nxv128i32_nxv128i128 = call @llvm.vp.zext.nxv128i32.nxv128i128( undef, undef, i32 undef) - %vp_nxv128i1_nxv128i8 = call @llvm.vp.zext.nxv128i1.nxv128i8( undef, undef, i32 undef) - %vp_nxv128i1_nxv128i16 = call @llvm.vp.zext.nxv128i1.nxv128i16( undef, undef, i32 undef) - %vp_nxv128i1_nxv128i32 = call @llvm.vp.zext.nxv128i1.nxv128i32( undef, undef, i32 undef) - %vp_nxv128i1_nxv128i128 = call @llvm.vp.zext.nxv128i1.nxv128i128( undef, undef, i32 undef) + %vp_nxv128i8_nxv128i16 = call @llvm.vp.zext.nxv128i16.nxv128i8( undef, undef, i32 undef) + %vp_nxv128i8_nxv128i32 = call @llvm.vp.zext.nxv128i32.nxv128i8( undef, undef, i32 undef) + %vp_nxv128i8_nxv128i128 = call @llvm.vp.zext.nxv128i128.nxv128i8( undef, undef, i32 undef) + %vp_nxv128i16_nxv128i32 = call @llvm.vp.zext.nxv128i32.nxv128i16( undef, undef, i32 undef) + %vp_nxv128i16_nxv128i128 = call @llvm.vp.zext.nxv128i128.nxv128i16( undef, undef, i32 undef) + %vp_nxv128i32_nxv128i128 = call @llvm.vp.zext.nxv128i128.nxv128i32( undef, undef, i32 undef) + %vp_nxv128i1_nxv128i8 = call @llvm.vp.zext.nxv128i8.nxv128i1( undef, undef, i32 undef) + %vp_nxv128i1_nxv128i16 = call @llvm.vp.zext.nxv128i16.nxv128i1( undef, 
undef, i32 undef) + %vp_nxv128i1_nxv128i32 = call @llvm.vp.zext.nxv128i32.nxv128i1( undef, undef, i32 undef) + %vp_nxv128i1_nxv128i128 = call @llvm.vp.zext.nxv128i128.nxv128i1( undef, undef, i32 undef) ret void } @@ -2640,11 +2640,11 @@ define void @trunc() { %v2i4_v2i2 = trunc <2 x i4> undef to <2 x i2> %v2i6_v2i4 = trunc <2 x i6> undef to <2 x i4> - %vp_v2i16_v2i2 = call <2 x i2> @llvm.vp.trunc.v2i16.v2i2(<2 x i16> undef, <2 x i1> undef, i32 undef) - %vp_v2i16_v2i4 = call <2 x i4> @llvm.vp.trunc.v2i16.v2i4(<2 x i16> undef, <2 x i1> undef, i32 undef) - %vp_v2i16_v2i6 = call <2 x i6> @llvm.vp.trunc.v2i16.v2i6(<2 x i16> undef, <2 x i1> undef, i32 undef) - %vp_v2i4_v2i2 = call <2 x i2> @llvm.vp.trunc.v2i4.v2i2(<2 x i4> undef, <2 x i1> undef, i32 undef) - %vp_v2i6_v2i4 = call <2 x i4> @llvm.vp.trunc.v2i6.v2i4(<2 x i6> undef, <2 x i1> undef, i32 undef) + %vp_v2i16_v2i2 = call <2 x i2> @llvm.vp.trunc.v2i2.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef) + %vp_v2i16_v2i4 = call <2 x i4> @llvm.vp.trunc.v2i4.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef) + %vp_v2i16_v2i6 = call <2 x i6> @llvm.vp.trunc.v2i6.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef) + %vp_v2i4_v2i2 = call <2 x i2> @llvm.vp.trunc.v2i2.v2i4(<2 x i4> undef, <2 x i1> undef, i32 undef) + %vp_v2i6_v2i4 = call <2 x i4> @llvm.vp.trunc.v2i4.v2i6(<2 x i6> undef, <2 x i1> undef, i32 undef) %v2i16_v2i8 = trunc <2 x i16> undef to <2 x i8> %v2i32_v2i8 = trunc <2 x i32> undef to <2 x i8> @@ -2657,16 +2657,16 @@ define void @trunc() { %v2i32_v2i1 = trunc <2 x i32> undef to <2 x i1> %v2i64_v2i1 = trunc <2 x i64> undef to <2 x i1> - %vp_v2i16_v2i8 = call <2 x i8> @llvm.vp.trunc.v2i16.v2i8(<2 x i16> undef, <2 x i1> undef, i32 undef) - %vp_v2i32_v2i8 = call <2 x i8> @llvm.vp.trunc.v2i32.v2i8(<2 x i32> undef, <2 x i1> undef, i32 undef) - %vp_v2i64_v2i8 = call <2 x i8> @llvm.vp.trunc.v2i64.v2i8(<2 x i64> undef, <2 x i1> undef, i32 undef) - %vp_v2i32_v2i16 = call <2 x i16> @llvm.vp.trunc.v2i32.v2i16(<2 x i32> undef, <2 x i1> undef, i32 undef) - %vp_v2i64_v2i16 = call <2 x i16> @llvm.vp.trunc.v2i64.v2i16(<2 x i64> undef, <2 x i1> undef, i32 undef) - %vp_v2i64_v2i32 = call <2 x i32> @llvm.vp.trunc.v2i64.v2i32(<2 x i64> undef, <2 x i1> undef, i32 undef) - %vp_v2i8_v2i1 = call <2 x i1> @llvm.vp.trunc.v2i8.v2i1(<2 x i8> undef, <2 x i1> undef, i32 undef) - %vp_v2i16_v2i1 = call <2 x i1> @llvm.vp.trunc.v2i16.v2i1(<2 x i16> undef, <2 x i1> undef, i32 undef) - %vp_v2i32_v2i1 = call <2 x i1> @llvm.vp.trunc.v2i32.v2i1(<2 x i32> undef, <2 x i1> undef, i32 undef) - %vp_v2i64_v2i1 = call <2 x i1> @llvm.vp.trunc.v2i64.v2i1(<2 x i64> undef, <2 x i1> undef, i32 undef) + %vp_v2i16_v2i8 = call <2 x i8> @llvm.vp.trunc.v2i8.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef) + %vp_v2i32_v2i8 = call <2 x i8> @llvm.vp.trunc.v2i8.v2i32(<2 x i32> undef, <2 x i1> undef, i32 undef) + %vp_v2i64_v2i8 = call <2 x i8> @llvm.vp.trunc.v2i8.v2i64(<2 x i64> undef, <2 x i1> undef, i32 undef) + %vp_v2i32_v2i16 = call <2 x i16> @llvm.vp.trunc.v2i16.v2i32(<2 x i32> undef, <2 x i1> undef, i32 undef) + %vp_v2i64_v2i16 = call <2 x i16> @llvm.vp.trunc.v2i16.v2i64(<2 x i64> undef, <2 x i1> undef, i32 undef) + %vp_v2i64_v2i32 = call <2 x i32> @llvm.vp.trunc.v2i32.v2i64(<2 x i64> undef, <2 x i1> undef, i32 undef) + %vp_v2i8_v2i1 = call <2 x i1> @llvm.vp.trunc.v2i1.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef) + %vp_v2i16_v2i1 = call <2 x i1> @llvm.vp.trunc.v2i1.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef) + %vp_v2i32_v2i1 = call <2 x i1> @llvm.vp.trunc.v2i1.v2i32(<2 x i32> undef, <2 
x i1> undef, i32 undef) + %vp_v2i64_v2i1 = call <2 x i1> @llvm.vp.trunc.v2i1.v2i64(<2 x i64> undef, <2 x i1> undef, i32 undef) %v4i16_v4i8 = trunc <4 x i16> undef to <4 x i8> %v4i32_v4i8 = trunc <4 x i32> undef to <4 x i8> @@ -2679,16 +2679,16 @@ define void @trunc() { %v4i32_v4i1 = trunc <4 x i32> undef to <4 x i1> %v4i64_v4i1 = trunc <4 x i64> undef to <4 x i1> - %vp_v4i16_v4i8 = call <4 x i8> @llvm.vp.trunc.v4i16.v4i8(<4 x i16> undef, <4 x i1> undef, i32 undef) - %vp_v4i32_v4i8 = call <4 x i8> @llvm.vp.trunc.v4i32.v4i8(<4 x i32> undef, <4 x i1> undef, i32 undef) - %vp_v4i64_v4i8 = call <4 x i8> @llvm.vp.trunc.v4i64.v4i8(<4 x i64> undef, <4 x i1> undef, i32 undef) - %vp_v4i32_v4i16 = call <4 x i16> @llvm.vp.trunc.v4i32.v4i16(<4 x i32> undef, <4 x i1> undef, i32 undef) - %vp_v4i64_v4i16 = call <4 x i16> @llvm.vp.trunc.v4i64.v4i16(<4 x i64> undef, <4 x i1> undef, i32 undef) - %vp_v4i64_v4i32 = call <4 x i32> @llvm.vp.trunc.v4i64.v4i32(<4 x i64> undef, <4 x i1> undef, i32 undef) - %vp_v4i8_v4i1 = call <4 x i1> @llvm.vp.trunc.v4i8.v4i1(<4 x i8> undef, <4 x i1> undef, i32 undef) - %vp_v4i16_v4i1 = call <4 x i1> @llvm.vp.trunc.v4i16.v4i1(<4 x i16> undef, <4 x i1> undef, i32 undef) - %vp_v4i32_v4i1 = call <4 x i1> @llvm.vp.trunc.v4i32.v4i1(<4 x i32> undef, <4 x i1> undef, i32 undef) - %vp_v4i64_v4i1 = call <4 x i1> @llvm.vp.trunc.v4i64.v4i1(<4 x i64> undef, <4 x i1> undef, i32 undef) + %vp_v4i16_v4i8 = call <4 x i8> @llvm.vp.trunc.v4i8.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef) + %vp_v4i32_v4i8 = call <4 x i8> @llvm.vp.trunc.v4i8.v4i32(<4 x i32> undef, <4 x i1> undef, i32 undef) + %vp_v4i64_v4i8 = call <4 x i8> @llvm.vp.trunc.v4i8.v4i64(<4 x i64> undef, <4 x i1> undef, i32 undef) + %vp_v4i32_v4i16 = call <4 x i16> @llvm.vp.trunc.v4i16.v4i32(<4 x i32> undef, <4 x i1> undef, i32 undef) + %vp_v4i64_v4i16 = call <4 x i16> @llvm.vp.trunc.v4i16.v4i64(<4 x i64> undef, <4 x i1> undef, i32 undef) + %vp_v4i64_v4i32 = call <4 x i32> @llvm.vp.trunc.v4i32.v4i64(<4 x i64> undef, <4 x i1> undef, i32 undef) + %vp_v4i8_v4i1 = call <4 x i1> @llvm.vp.trunc.v4i1.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef) + %vp_v4i16_v4i1 = call <4 x i1> @llvm.vp.trunc.v4i1.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef) + %vp_v4i32_v4i1 = call <4 x i1> @llvm.vp.trunc.v4i1.v4i32(<4 x i32> undef, <4 x i1> undef, i32 undef) + %vp_v4i64_v4i1 = call <4 x i1> @llvm.vp.trunc.v4i1.v4i64(<4 x i64> undef, <4 x i1> undef, i32 undef) %v8i16_v8i8 = trunc <8 x i16> undef to <8 x i8> %v8i32_v8i8 = trunc <8 x i32> undef to <8 x i8> @@ -2701,16 +2701,16 @@ define void @trunc() { %v8i32_v8i1 = trunc <8 x i32> undef to <8 x i1> %v8i64_v8i1 = trunc <8 x i64> undef to <8 x i1> - %vp_v8i16_v8i8 = call <8 x i8> @llvm.vp.trunc.v8i16.v8i8(<8 x i16> undef, <8 x i1> undef, i32 undef) - %vp_v8i32_v8i8 = call <8 x i8> @llvm.vp.trunc.v8i32.v8i8(<8 x i32> undef, <8 x i1> undef, i32 undef) - %vp_v8i64_v8i8 = call <8 x i8> @llvm.vp.trunc.v8i64.v8i8(<8 x i64> undef, <8 x i1> undef, i32 undef) - %vp_v8i32_v8i16 = call <8 x i16> @llvm.vp.trunc.v8i32.v8i16(<8 x i32> undef, <8 x i1> undef, i32 undef) - %vp_v8i64_v8i16 = call <8 x i16> @llvm.vp.trunc.v8i64.v8i16(<8 x i64> undef, <8 x i1> undef, i32 undef) - %vp_v8i64_v8i32 = call <8 x i32> @llvm.vp.trunc.v8i64.v8i32(<8 x i64> undef, <8 x i1> undef, i32 undef) - %vp_v8i8_v8i1 = call <8 x i1> @llvm.vp.trunc.v8i8.v8i1(<8 x i8> undef, <8 x i1> undef, i32 undef) - %vp_v8i16_v8i1 = call <8 x i1> @llvm.vp.trunc.v8i16.v8i1(<8 x i16> undef, <8 x i1> undef, i32 undef) - %vp_v8i32_v8i1 = call <8 x i1> 
@llvm.vp.trunc.v8i32.v8i1(<8 x i32> undef, <8 x i1> undef, i32 undef) - %vp_v8i64_v8i1 = call <8 x i1> @llvm.vp.trunc.v8i64.v8i1(<8 x i64> undef, <8 x i1> undef, i32 undef) + %vp_v8i16_v8i8 = call <8 x i8> @llvm.vp.trunc.v8i8.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef) + %vp_v8i32_v8i8 = call <8 x i8> @llvm.vp.trunc.v8i8.v8i32(<8 x i32> undef, <8 x i1> undef, i32 undef) + %vp_v8i64_v8i8 = call <8 x i8> @llvm.vp.trunc.v8i8.v8i64(<8 x i64> undef, <8 x i1> undef, i32 undef) + %vp_v8i32_v8i16 = call <8 x i16> @llvm.vp.trunc.v8i16.v8i32(<8 x i32> undef, <8 x i1> undef, i32 undef) + %vp_v8i64_v8i16 = call <8 x i16> @llvm.vp.trunc.v8i16.v8i64(<8 x i64> undef, <8 x i1> undef, i32 undef) + %vp_v8i64_v8i32 = call <8 x i32> @llvm.vp.trunc.v8i32.v8i64(<8 x i64> undef, <8 x i1> undef, i32 undef) + %vp_v8i8_v8i1 = call <8 x i1> @llvm.vp.trunc.v8i1.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef) + %vp_v8i16_v8i1 = call <8 x i1> @llvm.vp.trunc.v8i1.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef) + %vp_v8i32_v8i1 = call <8 x i1> @llvm.vp.trunc.v8i1.v8i32(<8 x i32> undef, <8 x i1> undef, i32 undef) + %vp_v8i64_v8i1 = call <8 x i1> @llvm.vp.trunc.v8i1.v8i64(<8 x i64> undef, <8 x i1> undef, i32 undef) %v16i16_v16i8 = trunc <2 x i16> undef to <2 x i8> %v16i32_v16i8 = trunc <2 x i32> undef to <2 x i8> @@ -2723,16 +2723,16 @@ define void @trunc() { %v16i32_v16i1 = trunc <2 x i32> undef to <2 x i1> %v16i64_v16i1 = trunc <2 x i64> undef to <2 x i1> - %vp_v16i16_v16i8 = call <16 x i8> @llvm.vp.trunc.v16i16.v16i8(<16 x i16> undef, <16 x i1> undef, i32 undef) - %vp_v16i32_v16i8 = call <16 x i8> @llvm.vp.trunc.v16i32.v16i8(<16 x i32> undef, <16 x i1> undef, i32 undef) - %vp_v16i64_v16i8 = call <16 x i8> @llvm.vp.trunc.v16i64.v16i8(<16 x i64> undef, <16 x i1> undef, i32 undef) - %vp_v16i32_v16i16 = call <16 x i16> @llvm.vp.trunc.v16i32.v16i16(<16 x i32> undef, <16 x i1> undef, i32 undef) - %vp_v16i64_v16i16 = call <16 x i16> @llvm.vp.trunc.v16i64.v16i16(<16 x i64> undef, <16 x i1> undef, i32 undef) - %vp_v16i64_v16i32 = call <16 x i32> @llvm.vp.trunc.v16i64.v16i32(<16 x i64> undef, <16 x i1> undef, i32 undef) - %vp_v16i8_v16i1 = call <16 x i1> @llvm.vp.trunc.v16i8.v16i1(<16 x i8> undef, <16 x i1> undef, i32 undef) - %vp_v16i16_v16i1 = call <16 x i1> @llvm.vp.trunc.v16i16.v16i1(<16 x i16> undef, <16 x i1> undef, i32 undef) - %vp_v16i32_v16i1 = call <16 x i1> @llvm.vp.trunc.v16i32.v16i1(<16 x i32> undef, <16 x i1> undef, i32 undef) - %vp_v16i64_v16i1 = call <16 x i1> @llvm.vp.trunc.v16i64.v16i1(<16 x i64> undef, <16 x i1> undef, i32 undef) + %vp_v16i16_v16i8 = call <16 x i8> @llvm.vp.trunc.v16i8.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef) + %vp_v16i32_v16i8 = call <16 x i8> @llvm.vp.trunc.v16i8.v16i32(<16 x i32> undef, <16 x i1> undef, i32 undef) + %vp_v16i64_v16i8 = call <16 x i8> @llvm.vp.trunc.v16i8.v16i64(<16 x i64> undef, <16 x i1> undef, i32 undef) + %vp_v16i32_v16i16 = call <16 x i16> @llvm.vp.trunc.v16i16.v16i32(<16 x i32> undef, <16 x i1> undef, i32 undef) + %vp_v16i64_v16i16 = call <16 x i16> @llvm.vp.trunc.v16i16.v16i64(<16 x i64> undef, <16 x i1> undef, i32 undef) + %vp_v16i64_v16i32 = call <16 x i32> @llvm.vp.trunc.v16i32.v16i64(<16 x i64> undef, <16 x i1> undef, i32 undef) + %vp_v16i8_v16i1 = call <16 x i1> @llvm.vp.trunc.v16i1.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef) + %vp_v16i16_v16i1 = call <16 x i1> @llvm.vp.trunc.v16i1.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef) + %vp_v16i32_v16i1 = call <16 x i1> @llvm.vp.trunc.v16i1.v16i32(<16 x i32> undef, <16 x i1> undef, i32 
undef) + %vp_v16i64_v16i1 = call <16 x i1> @llvm.vp.trunc.v16i1.v16i64(<16 x i64> undef, <16 x i1> undef, i32 undef) %v32i16_v32i8 = trunc <16 x i16> undef to <16 x i8> %v32i32_v32i8 = trunc <16 x i32> undef to <16 x i8> @@ -2745,16 +2745,16 @@ define void @trunc() { %v32i32_v32i1 = trunc <16 x i32> undef to <16 x i1> %v32i64_v32i1 = trunc <16 x i64> undef to <16 x i1> - %vp_v32i16_v32i8 = call <32 x i8> @llvm.vp.trunc.v32i16.v32i8(<32 x i16> undef, <32 x i1> undef, i32 undef) - %vp_v32i32_v32i8 = call <32 x i8> @llvm.vp.trunc.v32i32.v32i8(<32 x i32> undef, <32 x i1> undef, i32 undef) - %vp_v32i64_v32i8 = call <32 x i8> @llvm.vp.trunc.v32i64.v32i8(<32 x i64> undef, <32 x i1> undef, i32 undef) - %vp_v32i32_v32i16 = call <32 x i16> @llvm.vp.trunc.v32i32.v32i16(<32 x i32> undef, <32 x i1> undef, i32 undef) - %vp_v32i64_v32i16 = call <32 x i16> @llvm.vp.trunc.v32i64.v32i16(<32 x i64> undef, <32 x i1> undef, i32 undef) - %vp_v32i64_v32i32 = call <32 x i32> @llvm.vp.trunc.v32i64.v32i32(<32 x i64> undef, <32 x i1> undef, i32 undef) - %vp_v32i8_v32i1 = call <32 x i1> @llvm.vp.trunc.v32i8.v32i1(<32 x i8> undef, <32 x i1> undef, i32 undef) - %vp_v32i16_v32i1 = call <32 x i1> @llvm.vp.trunc.v32i16.v32i1(<32 x i16> undef, <32 x i1> undef, i32 undef) - %vp_v32i32_v32i1 = call <32 x i1> @llvm.vp.trunc.v32i32.v32i1(<32 x i32> undef, <32 x i1> undef, i32 undef) - %vp_v32i64_v32i1 = call <32 x i1> @llvm.vp.trunc.v32i64.v32i1(<32 x i64> undef, <32 x i1> undef, i32 undef) + %vp_v32i16_v32i8 = call <32 x i8> @llvm.vp.trunc.v32i8.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef) + %vp_v32i32_v32i8 = call <32 x i8> @llvm.vp.trunc.v32i8.v32i32(<32 x i32> undef, <32 x i1> undef, i32 undef) + %vp_v32i64_v32i8 = call <32 x i8> @llvm.vp.trunc.v32i8.v32i64(<32 x i64> undef, <32 x i1> undef, i32 undef) + %vp_v32i32_v32i16 = call <32 x i16> @llvm.vp.trunc.v32i16.v32i32(<32 x i32> undef, <32 x i1> undef, i32 undef) + %vp_v32i64_v32i16 = call <32 x i16> @llvm.vp.trunc.v32i16.v32i64(<32 x i64> undef, <32 x i1> undef, i32 undef) + %vp_v32i64_v32i32 = call <32 x i32> @llvm.vp.trunc.v32i32.v32i64(<32 x i64> undef, <32 x i1> undef, i32 undef) + %vp_v32i8_v32i1 = call <32 x i1> @llvm.vp.trunc.v32i1.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef) + %vp_v32i16_v32i1 = call <32 x i1> @llvm.vp.trunc.v32i1.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef) + %vp_v32i32_v32i1 = call <32 x i1> @llvm.vp.trunc.v32i1.v32i32(<32 x i32> undef, <32 x i1> undef, i32 undef) + %vp_v32i64_v32i1 = call <32 x i1> @llvm.vp.trunc.v32i1.v32i64(<32 x i64> undef, <32 x i1> undef, i32 undef) %v64i16_v64i8 = trunc <64 x i16> undef to <64 x i8> %v64i32_v64i8 = trunc <64 x i32> undef to <64 x i8> @@ -2767,16 +2767,16 @@ define void @trunc() { %v64i32_v64i1 = trunc <64 x i32> undef to <64 x i1> %v64i64_v64i1 = trunc <64 x i64> undef to <64 x i1> - %vp_v64i16_v64i8 = call <64 x i8> @llvm.vp.trunc.v64i16.v64i8(<64 x i16> undef, <64 x i1> undef, i32 undef) - %vp_v64i32_v64i8 = call <64 x i8> @llvm.vp.trunc.v64i32.v64i8(<64 x i32> undef, <64 x i1> undef, i32 undef) - %vp_v64i64_v64i8 = call <64 x i8> @llvm.vp.trunc.v64i64.v64i8(<64 x i64> undef, <64 x i1> undef, i32 undef) - %vp_v64i32_v64i16 = call <64 x i16> @llvm.vp.trunc.v64i32.v64i16(<64 x i32> undef, <64 x i1> undef, i32 undef) - %vp_v64i64_v64i16 = call <64 x i16> @llvm.vp.trunc.v64i64.v64i16(<64 x i64> undef, <64 x i1> undef, i32 undef) - %vp_v64i64_v64i32 = call <64 x i32> @llvm.vp.trunc.v64i64.v64i32(<64 x i64> undef, <64 x i1> undef, i32 undef) - %vp_v64i8_v64i1 = call <64 x i1> 
@llvm.vp.trunc.v64i8.v64i1(<64 x i8> undef, <64 x i1> undef, i32 undef) - %vp_v64i16_v64i1 = call <64 x i1> @llvm.vp.trunc.v64i16.v64i1(<64 x i16> undef, <64 x i1> undef, i32 undef) - %vp_v64i32_v64i1 = call <64 x i1> @llvm.vp.trunc.v64i32.v64i1(<64 x i32> undef, <64 x i1> undef, i32 undef) - %vp_v64i64_v64i1 = call <64 x i1> @llvm.vp.trunc.v64i64.v64i1(<64 x i64> undef, <64 x i1> undef, i32 undef) + %vp_v64i16_v64i8 = call <64 x i8> @llvm.vp.trunc.v64i8.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef) + %vp_v64i32_v64i8 = call <64 x i8> @llvm.vp.trunc.v64i8.v64i32(<64 x i32> undef, <64 x i1> undef, i32 undef) + %vp_v64i64_v64i8 = call <64 x i8> @llvm.vp.trunc.v64i8.v64i64(<64 x i64> undef, <64 x i1> undef, i32 undef) + %vp_v64i32_v64i16 = call <64 x i16> @llvm.vp.trunc.v64i16.v64i32(<64 x i32> undef, <64 x i1> undef, i32 undef) + %vp_v64i64_v64i16 = call <64 x i16> @llvm.vp.trunc.v64i16.v64i64(<64 x i64> undef, <64 x i1> undef, i32 undef) + %vp_v64i64_v64i32 = call <64 x i32> @llvm.vp.trunc.v64i32.v64i64(<64 x i64> undef, <64 x i1> undef, i32 undef) + %vp_v64i8_v64i1 = call <64 x i1> @llvm.vp.trunc.v64i1.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef) + %vp_v64i16_v64i1 = call <64 x i1> @llvm.vp.trunc.v64i1.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef) + %vp_v64i32_v64i1 = call <64 x i1> @llvm.vp.trunc.v64i1.v64i32(<64 x i32> undef, <64 x i1> undef, i32 undef) + %vp_v64i64_v64i1 = call <64 x i1> @llvm.vp.trunc.v64i1.v64i64(<64 x i64> undef, <64 x i1> undef, i32 undef) %v128i16_v128i8 = trunc <128 x i16> undef to <128 x i8> %v128i32_v128i8 = trunc <128 x i32> undef to <128 x i8> @@ -2789,16 +2789,16 @@ define void @trunc() { %v128i32_v128i1 = trunc <128 x i32> undef to <128 x i1> %v128i64_v128i1 = trunc <128 x i64> undef to <128 x i1> - %vp_v128i16_v128i8 = call <128 x i8> @llvm.vp.trunc.v128i16.v128i8(<128 x i16> undef, <128 x i1> undef, i32 undef) - %vp_v128i32_v128i8 = call <128 x i8> @llvm.vp.trunc.v128i32.v128i8(<128 x i32> undef, <128 x i1> undef, i32 undef) - %vp_v128i64_v128i8 = call <128 x i8> @llvm.vp.trunc.v128i64.v128i8(<128 x i64> undef, <128 x i1> undef, i32 undef) - %vp_v128i32_v128i16 = call <128 x i16> @llvm.vp.trunc.v128i32.v128i16(<128 x i32> undef, <128 x i1> undef, i32 undef) - %vp_v128i64_v128i16 = call <128 x i16> @llvm.vp.trunc.v128i64.v128i16(<128 x i64> undef, <128 x i1> undef, i32 undef) - %vp_v128i64_v128i32 = call <128 x i32> @llvm.vp.trunc.v128i64.v128i32(<128 x i64> undef, <128 x i1> undef, i32 undef) - %vp_v128i8_v128i1 = call <128 x i1> @llvm.vp.trunc.v128i8.v128i1(<128 x i8> undef, <128 x i1> undef, i32 undef) - %vp_v128i16_v128i1 = call <128 x i1> @llvm.vp.trunc.v128i16.v128i1(<128 x i16> undef, <128 x i1> undef, i32 undef) - %vp_v128i32_v128i1 = call <128 x i1> @llvm.vp.trunc.v128i32.v128i1(<128 x i32> undef, <128 x i1> undef, i32 undef) - %vp_v128i64_v128i1 = call <128 x i1> @llvm.vp.trunc.v128i64.v128i1(<128 x i64> undef, <128 x i1> undef, i32 undef) + %vp_v128i16_v128i8 = call <128 x i8> @llvm.vp.trunc.v128i8.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef) + %vp_v128i32_v128i8 = call <128 x i8> @llvm.vp.trunc.v128i8.v128i32(<128 x i32> undef, <128 x i1> undef, i32 undef) + %vp_v128i64_v128i8 = call <128 x i8> @llvm.vp.trunc.v128i8.v128i64(<128 x i64> undef, <128 x i1> undef, i32 undef) + %vp_v128i32_v128i16 = call <128 x i16> @llvm.vp.trunc.v128i16.v128i32(<128 x i32> undef, <128 x i1> undef, i32 undef) + %vp_v128i64_v128i16 = call <128 x i16> @llvm.vp.trunc.v128i16.v128i64(<128 x i64> undef, <128 x i1> undef, i32 undef) + 
%vp_v128i64_v128i32 = call <128 x i32> @llvm.vp.trunc.v128i32.v128i64(<128 x i64> undef, <128 x i1> undef, i32 undef) + %vp_v128i8_v128i1 = call <128 x i1> @llvm.vp.trunc.v128i1.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef) + %vp_v128i16_v128i1 = call <128 x i1> @llvm.vp.trunc.v128i1.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef) + %vp_v128i32_v128i1 = call <128 x i1> @llvm.vp.trunc.v128i1.v128i32(<128 x i32> undef, <128 x i1> undef, i32 undef) + %vp_v128i64_v128i1 = call <128 x i1> @llvm.vp.trunc.v128i1.v128i64(<128 x i64> undef, <128 x i1> undef, i32 undef) %v256i16_v256i8 = trunc <256 x i16> undef to <256 x i8> %v256i32_v256i8 = trunc <256 x i32> undef to <256 x i8> @@ -2811,16 +2811,16 @@ define void @trunc() { %v256i32_v256i1 = trunc <256 x i32> undef to <256 x i1> %v256i64_v256i1 = trunc <256 x i64> undef to <256 x i1> - %vp_v256i16_v256i8 = call <256 x i8> @llvm.vp.trunc.v256i16.v256i8(<256 x i16> undef, <256 x i1> undef, i32 undef) - %vp_v256i32_v256i8 = call <256 x i8> @llvm.vp.trunc.v256i32.v256i8(<256 x i32> undef, <256 x i1> undef, i32 undef) - %vp_v256i64_v256i8 = call <256 x i8> @llvm.vp.trunc.v256i64.v256i8(<256 x i64> undef, <256 x i1> undef, i32 undef) - %vp_v256i32_v256i16 = call <256 x i16> @llvm.vp.trunc.v256i32.v256i16(<256 x i32> undef, <256 x i1> undef, i32 undef) - %vp_v256i64_v256i16 = call <256 x i16> @llvm.vp.trunc.v256i64.v256i16(<256 x i64> undef, <256 x i1> undef, i32 undef) - %vp_v256i64_v256i32 = call <256 x i32> @llvm.vp.trunc.v256i64.v256i32(<256 x i64> undef, <256 x i1> undef, i32 undef) - %vp_v256i8_v256i1 = call <256 x i1> @llvm.vp.trunc.v256i8.v256i1(<256 x i8> undef, <256 x i1> undef, i32 undef) - %vp_v256i16_v256i1 = call <256 x i1> @llvm.vp.trunc.v256i16.v256i1(<256 x i16> undef, <256 x i1> undef, i32 undef) - %vp_v256i32_v256i1 = call <256 x i1> @llvm.vp.trunc.v256i32.v256i1(<256 x i32> undef, <256 x i1> undef, i32 undef) - %vp_v256i64_v256i1 = call <256 x i1> @llvm.vp.trunc.v256i64.v256i1(<256 x i64> undef, <256 x i1> undef, i32 undef) + %vp_v256i16_v256i8 = call <256 x i8> @llvm.vp.trunc.v256i8.v256i16(<256 x i16> undef, <256 x i1> undef, i32 undef) + %vp_v256i32_v256i8 = call <256 x i8> @llvm.vp.trunc.v256i8.v256i32(<256 x i32> undef, <256 x i1> undef, i32 undef) + %vp_v256i64_v256i8 = call <256 x i8> @llvm.vp.trunc.v256i8.v256i64(<256 x i64> undef, <256 x i1> undef, i32 undef) + %vp_v256i32_v256i16 = call <256 x i16> @llvm.vp.trunc.v256i16.v256i32(<256 x i32> undef, <256 x i1> undef, i32 undef) + %vp_v256i64_v256i16 = call <256 x i16> @llvm.vp.trunc.v256i16.v256i64(<256 x i64> undef, <256 x i1> undef, i32 undef) + %vp_v256i64_v256i32 = call <256 x i32> @llvm.vp.trunc.v256i32.v256i64(<256 x i64> undef, <256 x i1> undef, i32 undef) + %vp_v256i8_v256i1 = call <256 x i1> @llvm.vp.trunc.v256i1.v256i8(<256 x i8> undef, <256 x i1> undef, i32 undef) + %vp_v256i16_v256i1 = call <256 x i1> @llvm.vp.trunc.v256i1.v256i16(<256 x i16> undef, <256 x i1> undef, i32 undef) + %vp_v256i32_v256i1 = call <256 x i1> @llvm.vp.trunc.v256i1.v256i32(<256 x i32> undef, <256 x i1> undef, i32 undef) + %vp_v256i64_v256i1 = call <256 x i1> @llvm.vp.trunc.v256i1.v256i64(<256 x i64> undef, <256 x i1> undef, i32 undef) %nxv1i16_nxv1i8 = trunc undef to %nxv1i32_nxv1i8 = trunc undef to @@ -2833,16 +2833,16 @@ define void @trunc() { %nxv1i32_nxv1i1 = trunc undef to %nxv1i64_nxv1i1 = trunc undef to - %vp_nxv1i16_nxv1i8 = call @llvm.vp.trunc.nxv1i16.nxv1i8( undef, undef, i32 undef) - %vp_nxv1i32_nxv1i8 = call @llvm.vp.trunc.nxv1i32.nxv1i8( undef, undef, i32 undef) - 
%vp_nxv1i64_nxv1i8 = call <vscale x 1 x i8> @llvm.vp.trunc.nxv1i64.nxv1i8(<vscale x 1 x i64> undef, <vscale x 1 x i1> undef, i32 undef) - %vp_nxv1i32_nxv1i16 = call <vscale x 1 x i16> @llvm.vp.trunc.nxv1i32.nxv1i16(<vscale x 1 x i32> undef, <vscale x 1 x i1> undef, i32 undef) - %vp_nxv1i64_nxv1i16 = call <vscale x 1 x i16> @llvm.vp.trunc.nxv1i64.nxv1i16(<vscale x 1 x i64> undef, <vscale x 1 x i1> undef, i32 undef) - %vp_nxv1i64_nxv1i32 = call <vscale x 1 x i32> @llvm.vp.trunc.nxv1i64.nxv1i32(<vscale x 1 x i64> undef, <vscale x 1 x i1> undef, i32 undef) - %vp_nxv1i8_nxv1i1 = call <vscale x 1 x i1> @llvm.vp.trunc.nxv1i8.nxv1i1(<vscale x 1 x i8> undef, <vscale x 1 x i1> undef, i32 undef) - %vp_nxv1i16_nxv1i1 = call <vscale x 1 x i1> @llvm.vp.trunc.nxv1i16.nxv1i1(<vscale x 1 x i16> undef, <vscale x 1 x i1> undef, i32 undef) - %vp_nxv1i32_nxv1i1 = call <vscale x 1 x i1> @llvm.vp.trunc.nxv1i32.nxv1i1(<vscale x 1 x i32> undef, <vscale x 1 x i1> undef, i32 undef) - %vp_nxv1i64_nxv1i1 = call <vscale x 1 x i1> @llvm.vp.trunc.nxv1i64.nxv1i1(<vscale x 1 x i64> undef, <vscale x 1 x i1> undef, i32 undef) + %vp_nxv1i16_nxv1i8 = call <vscale x 1 x i8> @llvm.vp.trunc.nxv1i8.nxv1i16(<vscale x 1 x i16> undef, <vscale x 1 x i1> undef, i32 undef) + %vp_nxv1i32_nxv1i8 = call <vscale x 1 x i8> @llvm.vp.trunc.nxv1i8.nxv1i32(<vscale x 1 x i32> undef, <vscale x 1 x i1> undef, i32 undef) + %vp_nxv1i64_nxv1i8 = call <vscale x 1 x i8> @llvm.vp.trunc.nxv1i8.nxv1i64(<vscale x 1 x i64> undef, <vscale x 1 x i1> undef, i32 undef) + %vp_nxv1i32_nxv1i16 = call <vscale x 1 x i16> @llvm.vp.trunc.nxv1i16.nxv1i32(<vscale x 1 x i32> undef, <vscale x 1 x i1> undef, i32 undef) + %vp_nxv1i64_nxv1i16 = call <vscale x 1 x i16> @llvm.vp.trunc.nxv1i16.nxv1i64(<vscale x 1 x i64> undef, <vscale x 1 x i1> undef, i32 undef) + %vp_nxv1i64_nxv1i32 = call <vscale x 1 x i32> @llvm.vp.trunc.nxv1i32.nxv1i64(<vscale x 1 x i64> undef, <vscale x 1 x i1> undef, i32 undef) + %vp_nxv1i8_nxv1i1 = call <vscale x 1 x i1> @llvm.vp.trunc.nxv1i1.nxv1i8(<vscale x 1 x i8> undef, <vscale x 1 x i1> undef, i32 undef) + %vp_nxv1i16_nxv1i1 = call <vscale x 1 x i1> @llvm.vp.trunc.nxv1i1.nxv1i16(<vscale x 1 x i16> undef, <vscale x 1 x i1> undef, i32 undef) + %vp_nxv1i32_nxv1i1 = call <vscale x 1 x i1> @llvm.vp.trunc.nxv1i1.nxv1i32(<vscale x 1 x i32> undef, <vscale x 1 x i1> undef, i32 undef) + %vp_nxv1i64_nxv1i1 = call <vscale x 1 x i1> @llvm.vp.trunc.nxv1i1.nxv1i64(<vscale x 1 x i64> undef, <vscale x 1 x i1> undef, i32 undef) %nxv2i16_nxv2i8 = trunc <vscale x 2 x i16> undef to <vscale x 2 x i8> %nxv2i32_nxv2i8 = trunc <vscale x 2 x i32> undef to <vscale x 2 x i8> @@ -2855,16 +2855,16 @@ define void @trunc() { %nxv2i32_nxv2i1 = trunc <vscale x 2 x i32> undef to <vscale x 2 x i1> %nxv2i64_nxv2i1 = trunc <vscale x 2 x i64> undef to <vscale x 2 x i1> - %vp_nxv2i16_nxv2i8 = call <vscale x 2 x i8> @llvm.vp.trunc.nxv2i16.nxv2i8(<vscale x 2 x i16> undef, <vscale x 2 x i1> undef, i32 undef) - %vp_nxv2i32_nxv2i8 = call <vscale x 2 x i8> @llvm.vp.trunc.nxv2i32.nxv2i8(<vscale x 2 x i32> undef, <vscale x 2 x i1> undef, i32 undef) - %vp_nxv2i64_nxv2i8 = call <vscale x 2 x i8> @llvm.vp.trunc.nxv2i64.nxv2i8(<vscale x 2 x i64> undef, <vscale x 2 x i1> undef, i32 undef) - %vp_nxv2i32_nxv2i16 = call <vscale x 2 x i16> @llvm.vp.trunc.nxv2i32.nxv2i16(<vscale x 2 x i32> undef, <vscale x 2 x i1> undef, i32 undef) - %vp_nxv2i64_nxv2i16 = call <vscale x 2 x i16> @llvm.vp.trunc.nxv2i64.nxv2i16(<vscale x 2 x i64> undef, <vscale x 2 x i1> undef, i32 undef) - %vp_nxv2i64_nxv2i32 = call <vscale x 2 x i32> @llvm.vp.trunc.nxv2i64.nxv2i32(<vscale x 2 x i64> undef, <vscale x 2 x i1> undef, i32 undef) - %vp_nxv2i8_nxv2i1 = call <vscale x 2 x i1> @llvm.vp.trunc.nxv2i8.nxv2i1(<vscale x 2 x i8> undef, <vscale x 2 x i1> undef, i32 undef) - %vp_nxv2i16_nxv2i1 = call <vscale x 2 x i1> @llvm.vp.trunc.nxv2i16.nxv2i1(<vscale x 2 x i16> undef, <vscale x 2 x i1> undef, i32 undef) - %vp_nxv2i32_nxv2i1 = call <vscale x 2 x i1> @llvm.vp.trunc.nxv2i32.nxv2i1(<vscale x 2 x i32> undef, <vscale x 2 x i1> undef, i32 undef) - %vp_nxv2i64_nxv2i1 = call <vscale x 2 x i1> @llvm.vp.trunc.nxv2i64.nxv2i1(<vscale x 2 x i64> undef, <vscale x 2 x i1> undef, i32 undef) + %vp_nxv2i16_nxv2i8 = call <vscale x 2 x i8> @llvm.vp.trunc.nxv2i8.nxv2i16(<vscale x 2 x i16> undef, <vscale x 2 x i1> undef, i32 undef) + %vp_nxv2i32_nxv2i8 = call <vscale x 2 x i8> @llvm.vp.trunc.nxv2i8.nxv2i32(<vscale x 2 x i32> undef, <vscale x 2 x i1> undef, i32 undef) + %vp_nxv2i64_nxv2i8 = call <vscale x 2 x i8> @llvm.vp.trunc.nxv2i8.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> undef, i32 undef) + %vp_nxv2i32_nxv2i16 = call <vscale x 2 x i16> @llvm.vp.trunc.nxv2i16.nxv2i32(<vscale x 2 x i32> undef, <vscale x 2 x i1> undef, i32 undef) + %vp_nxv2i64_nxv2i16 = call <vscale x 2 x i16> @llvm.vp.trunc.nxv2i16.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> undef, i32 undef) + %vp_nxv2i64_nxv2i32 = call <vscale x 2 x i32> @llvm.vp.trunc.nxv2i32.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> undef, i32 undef) + %vp_nxv2i8_nxv2i1 = call <vscale x 2 x i1> @llvm.vp.trunc.nxv2i1.nxv2i8(<vscale x 2 x i8> undef, <vscale x 2 x i1> undef, i32 undef) + %vp_nxv2i16_nxv2i1 = call <vscale x 2 x i1> @llvm.vp.trunc.nxv2i1.nxv2i16(<vscale x 2 x i16> undef, <vscale x 2 x i1> undef, i32 undef) + %vp_nxv2i32_nxv2i1 = call <vscale x 2 x i1> @llvm.vp.trunc.nxv2i1.nxv2i32(<vscale x 2 x i32> undef, <vscale x 2 x i1> undef, i32 undef) + %vp_nxv2i64_nxv2i1 = call <vscale x 2 x i1> @llvm.vp.trunc.nxv2i1.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> undef, i32 undef) %nxv4i16_nxv4i8 = trunc <vscale x 4 x i16> undef to <vscale x 4 x i8> %nxv4i32_nxv4i8 = trunc <vscale x 4 x i32> undef to <vscale x 4 x i8> @@ -2877,16 +2877,16 @@ define void @trunc() { %nxv4i32_nxv4i1 = trunc <vscale x 4 x i32> undef to <vscale x 4 x i1> %nxv4i64_nxv4i1 = trunc <vscale x 4 x i64>
undef to - %vp_nxv4i16_nxv4i8 = call @llvm.vp.trunc.nxv4i16.nxv4i8( undef, undef, i32 undef) - %vp_nxv4i32_nxv4i8 = call @llvm.vp.trunc.nxv4i32.nxv4i8( undef, undef, i32 undef) - %vp_nxv4i64_nxv4i8 = call @llvm.vp.trunc.nxv4i64.nxv4i8( undef, undef, i32 undef) - %vp_nxv4i32_nxv4i16 = call @llvm.vp.trunc.nxv4i32.nxv4i16( undef, undef, i32 undef) - %vp_nxv4i64_nxv4i16 = call @llvm.vp.trunc.nxv4i64.nxv4i16( undef, undef, i32 undef) - %vp_nxv4i64_nxv4i32 = call @llvm.vp.trunc.nxv4i64.nxv4i32( undef, undef, i32 undef) - %vp_nxv4i8_nxv4i1 = call @llvm.vp.trunc.nxv4i8.nxv4i1( undef, undef, i32 undef) - %vp_nxv4i16_nxv4i1 = call @llvm.vp.trunc.nxv4i16.nxv4i1( undef, undef, i32 undef) - %vp_nxv4i32_nxv4i1 = call @llvm.vp.trunc.nxv4i32.nxv4i1( undef, undef, i32 undef) - %vp_nxv4i64_nxv4i1 = call @llvm.vp.trunc.nxv4i64.nxv4i1( undef, undef, i32 undef) + %vp_nxv4i16_nxv4i8 = call @llvm.vp.trunc.nxv4i8.nxv4i16( undef, undef, i32 undef) + %vp_nxv4i32_nxv4i8 = call @llvm.vp.trunc.nxv4i8.nxv4i32( undef, undef, i32 undef) + %vp_nxv4i64_nxv4i8 = call @llvm.vp.trunc.nxv4i8.nxv4i64( undef, undef, i32 undef) + %vp_nxv4i32_nxv4i16 = call @llvm.vp.trunc.nxv4i16.nxv4i32( undef, undef, i32 undef) + %vp_nxv4i64_nxv4i16 = call @llvm.vp.trunc.nxv4i16.nxv4i64( undef, undef, i32 undef) + %vp_nxv4i64_nxv4i32 = call @llvm.vp.trunc.nxv4i32.nxv4i64( undef, undef, i32 undef) + %vp_nxv4i8_nxv4i1 = call @llvm.vp.trunc.nxv4i1.nxv4i8( undef, undef, i32 undef) + %vp_nxv4i16_nxv4i1 = call @llvm.vp.trunc.nxv4i1.nxv4i16( undef, undef, i32 undef) + %vp_nxv4i32_nxv4i1 = call @llvm.vp.trunc.nxv4i1.nxv4i32( undef, undef, i32 undef) + %vp_nxv4i64_nxv4i1 = call @llvm.vp.trunc.nxv4i1.nxv4i64( undef, undef, i32 undef) %nxv8i16_nxv8i8 = trunc undef to %nxv8i32_nxv8i8 = trunc undef to @@ -2899,16 +2899,16 @@ define void @trunc() { %nxv8i32_nxv8i1 = trunc undef to %nxv8i64_nxv8i1 = trunc undef to - %vp_nxv8i16_nxv8i8 = call @llvm.vp.trunc.nxv8i16.nxv8i8( undef, undef, i32 undef) - %vp_nxv8i32_nxv8i8 = call @llvm.vp.trunc.nxv8i32.nxv8i8( undef, undef, i32 undef) - %vp_nxv8i64_nxv8i8 = call @llvm.vp.trunc.nxv8i64.nxv8i8( undef, undef, i32 undef) - %vp_nxv8i32_nxv8i16 = call @llvm.vp.trunc.nxv8i32.nxv8i16( undef, undef, i32 undef) - %vp_nxv8i64_nxv8i16 = call @llvm.vp.trunc.nxv8i64.nxv8i16( undef, undef, i32 undef) - %vp_nxv8i64_nxv8i32 = call @llvm.vp.trunc.nxv8i64.nxv8i32( undef, undef, i32 undef) - %vp_nxv8i8_nxv8i1 = call @llvm.vp.trunc.nxv8i8.nxv8i1( undef, undef, i32 undef) - %vp_nxv8i16_nxv8i1 = call @llvm.vp.trunc.nxv8i16.nxv8i1( undef, undef, i32 undef) - %vp_nxv8i32_nxv8i1 = call @llvm.vp.trunc.nxv8i32.nxv8i1( undef, undef, i32 undef) - %vp_nxv8i64_nxv8i1 = call @llvm.vp.trunc.nxv8i64.nxv8i1( undef, undef, i32 undef) + %vp_nxv8i16_nxv8i8 = call @llvm.vp.trunc.nxv8i8.nxv8i16( undef, undef, i32 undef) + %vp_nxv8i32_nxv8i8 = call @llvm.vp.trunc.nxv8i8.nxv8i32( undef, undef, i32 undef) + %vp_nxv8i64_nxv8i8 = call @llvm.vp.trunc.nxv8i8.nxv8i64( undef, undef, i32 undef) + %vp_nxv8i32_nxv8i16 = call @llvm.vp.trunc.nxv8i16.nxv8i32( undef, undef, i32 undef) + %vp_nxv8i64_nxv8i16 = call @llvm.vp.trunc.nxv8i16.nxv8i64( undef, undef, i32 undef) + %vp_nxv8i64_nxv8i32 = call @llvm.vp.trunc.nxv8i32.nxv8i64( undef, undef, i32 undef) + %vp_nxv8i8_nxv8i1 = call @llvm.vp.trunc.nxv8i1.nxv8i8( undef, undef, i32 undef) + %vp_nxv8i16_nxv8i1 = call @llvm.vp.trunc.nxv8i1.nxv8i16( undef, undef, i32 undef) + %vp_nxv8i32_nxv8i1 = call @llvm.vp.trunc.nxv8i1.nxv8i32( undef, undef, i32 undef) + %vp_nxv8i64_nxv8i1 = call @llvm.vp.trunc.nxv8i1.nxv8i64( undef, undef, 
i32 undef) %nxv16i16_nxv16i8 = trunc undef to %nxv16i32_nxv16i8 = trunc undef to @@ -2921,16 +2921,16 @@ define void @trunc() { %nxv16i32_nxv16i1 = trunc undef to %nxv16i64_nxv16i1 = trunc undef to - %vp_nxv16i16_nxv16i8 = call @llvm.vp.trunc.nxv16i16.nxv16i8( undef, undef, i32 undef) - %vp_nxv16i32_nxv16i8 = call @llvm.vp.trunc.nxv16i32.nxv16i8( undef, undef, i32 undef) - %vp_nxv16i64_nxv16i8 = call @llvm.vp.trunc.nxv16i64.nxv16i8( undef, undef, i32 undef) - %vp_nxv16i32_nxv16i16 = call @llvm.vp.trunc.nxv16i32.nxv16i16( undef, undef, i32 undef) - %vp_nxv16i64_nxv16i16 = call @llvm.vp.trunc.nxv16i64.nxv16i16( undef, undef, i32 undef) - %vp_nxv16i64_nxv16i32 = call @llvm.vp.trunc.nxv16i64.nxv16i32( undef, undef, i32 undef) - %vp_nxv16i8_nxv16i1 = call @llvm.vp.trunc.nxv16i8.nxv16i1( undef, undef, i32 undef) - %vp_nxv16i16_nxv16i1 = call @llvm.vp.trunc.nxv16i16.nxv16i1( undef, undef, i32 undef) - %vp_nxv16i32_nxv16i1 = call @llvm.vp.trunc.nxv16i32.nxv16i1( undef, undef, i32 undef) - %vp_nxv16i64_nxv16i1 = call @llvm.vp.trunc.nxv16i64.nxv16i1( undef, undef, i32 undef) + %vp_nxv16i16_nxv16i8 = call @llvm.vp.trunc.nxv16i8.nxv16i16( undef, undef, i32 undef) + %vp_nxv16i32_nxv16i8 = call @llvm.vp.trunc.nxv16i8.nxv16i32( undef, undef, i32 undef) + %vp_nxv16i64_nxv16i8 = call @llvm.vp.trunc.nxv16i8.nxv16i64( undef, undef, i32 undef) + %vp_nxv16i32_nxv16i16 = call @llvm.vp.trunc.nxv16i16.nxv16i32( undef, undef, i32 undef) + %vp_nxv16i64_nxv16i16 = call @llvm.vp.trunc.nxv16i16.nxv16i64( undef, undef, i32 undef) + %vp_nxv16i64_nxv16i32 = call @llvm.vp.trunc.nxv16i32.nxv16i64( undef, undef, i32 undef) + %vp_nxv16i8_nxv16i1 = call @llvm.vp.trunc.nxv16i1.nxv16i8( undef, undef, i32 undef) + %vp_nxv16i16_nxv16i1 = call @llvm.vp.trunc.nxv16i1.nxv16i16( undef, undef, i32 undef) + %vp_nxv16i32_nxv16i1 = call @llvm.vp.trunc.nxv16i1.nxv16i32( undef, undef, i32 undef) + %vp_nxv16i64_nxv16i1 = call @llvm.vp.trunc.nxv16i1.nxv16i64( undef, undef, i32 undef) %nxv32i16_nxv32i8 = trunc undef to %nxv32i32_nxv32i8 = trunc undef to @@ -2943,16 +2943,16 @@ define void @trunc() { %nxv32i32_nxv32i1 = trunc undef to %nxv32i64_nxv32i1 = trunc undef to - %vp_nxv32i16_nxv32i8 = call @llvm.vp.trunc.nxv32i16.nxv32i8( undef, undef, i32 undef) - %vp_nxv32i32_nxv32i8 = call @llvm.vp.trunc.nxv32i32.nxv32i8( undef, undef, i32 undef) - %vp_nxv32i64_nxv32i8 = call @llvm.vp.trunc.nxv32i64.nxv32i8( undef, undef, i32 undef) - %vp_nxv32i32_nxv32i16 = call @llvm.vp.trunc.nxv32i32.nxv32i16( undef, undef, i32 undef) - %vp_nxv32i64_nxv32i16 = call @llvm.vp.trunc.nxv32i64.nxv32i16( undef, undef, i32 undef) - %vp_nxv32i64_nxv32i32 = call @llvm.vp.trunc.nxv32i64.nxv32i32( undef, undef, i32 undef) - %vp_nxv32i8_nxv32i1 = call @llvm.vp.trunc.nxv32i8.nxv32i1( undef, undef, i32 undef) - %vp_nxv32i16_nxv32i1 = call @llvm.vp.trunc.nxv32i16.nxv32i1( undef, undef, i32 undef) - %vp_nxv32i32_nxv32i1 = call @llvm.vp.trunc.nxv32i32.nxv32i1( undef, undef, i32 undef) - %vp_nxv32i64_nxv32i1 = call @llvm.vp.trunc.nxv32i64.nxv32i1( undef, undef, i32 undef) + %vp_nxv32i16_nxv32i8 = call @llvm.vp.trunc.nxv32i8.nxv32i16( undef, undef, i32 undef) + %vp_nxv32i32_nxv32i8 = call @llvm.vp.trunc.nxv32i8.nxv32i32( undef, undef, i32 undef) + %vp_nxv32i64_nxv32i8 = call @llvm.vp.trunc.nxv32i8.nxv32i64( undef, undef, i32 undef) + %vp_nxv32i32_nxv32i16 = call @llvm.vp.trunc.nxv32i16.nxv32i32( undef, undef, i32 undef) + %vp_nxv32i64_nxv32i16 = call @llvm.vp.trunc.nxv32i16.nxv32i64( undef, undef, i32 undef) + %vp_nxv32i64_nxv32i32 = call @llvm.vp.trunc.nxv32i32.nxv32i64( undef, 
undef, i32 undef) + %vp_nxv32i8_nxv32i1 = call @llvm.vp.trunc.nxv32i1.nxv32i8( undef, undef, i32 undef) + %vp_nxv32i16_nxv32i1 = call @llvm.vp.trunc.nxv32i1.nxv32i16( undef, undef, i32 undef) + %vp_nxv32i32_nxv32i1 = call @llvm.vp.trunc.nxv32i1.nxv32i32( undef, undef, i32 undef) + %vp_nxv32i64_nxv32i1 = call @llvm.vp.trunc.nxv32i1.nxv32i64( undef, undef, i32 undef) %nxv64i16_nxv64i8 = trunc undef to %nxv64i32_nxv64i8 = trunc undef to @@ -2965,16 +2965,16 @@ define void @trunc() { %nxv64i32_nxv64i1 = trunc undef to %nxv64i64_nxv64i1 = trunc undef to - %vp_nxv64i16_nxv64i8 = call @llvm.vp.trunc.nxv64i16.nxv64i8( undef, undef, i32 undef) - %vp_nxv64i32_nxv64i8 = call @llvm.vp.trunc.nxv64i32.nxv64i8( undef, undef, i32 undef) - %vp_nxv64i64_nxv64i8 = call @llvm.vp.trunc.nxv64i64.nxv64i8( undef, undef, i32 undef) - %vp_nxv64i32_nxv64i16 = call @llvm.vp.trunc.nxv64i32.nxv64i16( undef, undef, i32 undef) - %vp_nxv64i64_nxv64i16 = call @llvm.vp.trunc.nxv64i64.nxv64i16( undef, undef, i32 undef) - %vp_nxv64i64_nxv64i32 = call @llvm.vp.trunc.nxv64i64.nxv64i32( undef, undef, i32 undef) - %vp_nxv64i8_nxv64i1 = call @llvm.vp.trunc.nxv64i8.nxv64i1( undef, undef, i32 undef) - %vp_nxv64i16_nxv64i1 = call @llvm.vp.trunc.nxv64i16.nxv64i1( undef, undef, i32 undef) - %vp_nxv64i32_nxv64i1 = call @llvm.vp.trunc.nxv64i32.nxv64i1( undef, undef, i32 undef) - %vp_nxv64i64_nxv64i1 = call @llvm.vp.trunc.nxv64i64.nxv64i1( undef, undef, i32 undef) + %vp_nxv64i16_nxv64i8 = call @llvm.vp.trunc.nxv64i8.nxv64i16( undef, undef, i32 undef) + %vp_nxv64i32_nxv64i8 = call @llvm.vp.trunc.nxv64i8.nxv64i32( undef, undef, i32 undef) + %vp_nxv64i64_nxv64i8 = call @llvm.vp.trunc.nxv64i8.nxv64i64( undef, undef, i32 undef) + %vp_nxv64i32_nxv64i16 = call @llvm.vp.trunc.nxv64i16.nxv64i32( undef, undef, i32 undef) + %vp_nxv64i64_nxv64i16 = call @llvm.vp.trunc.nxv64i16.nxv64i64( undef, undef, i32 undef) + %vp_nxv64i64_nxv64i32 = call @llvm.vp.trunc.nxv64i32.nxv64i64( undef, undef, i32 undef) + %vp_nxv64i8_nxv64i1 = call @llvm.vp.trunc.nxv64i1.nxv64i8( undef, undef, i32 undef) + %vp_nxv64i16_nxv64i1 = call @llvm.vp.trunc.nxv64i1.nxv64i16( undef, undef, i32 undef) + %vp_nxv64i32_nxv64i1 = call @llvm.vp.trunc.nxv64i1.nxv64i32( undef, undef, i32 undef) + %vp_nxv64i64_nxv64i1 = call @llvm.vp.trunc.nxv64i1.nxv64i64( undef, undef, i32 undef) ret void } @@ -3071,113 +3071,113 @@ define void @fpext() { %v2f16_v2f64 = fpext <2 x half> undef to <2 x double> %v2f32_v2f64 = fpext <2 x float> undef to <2 x double> - %vp_v2f16_v2f32 = call <2 x float> @llvm.vp.fpext.v2half.v2float(<2 x half> undef, <2 x i1> undef, i32 undef) - %vp_v2f16_v2f64 = call <2 x double> @llvm.vp.fpext.v2half.v2double(<2 x half> undef, <2 x i1> undef, i32 undef) - %vp_v2f32_v2f64 = call <2 x double> @llvm.vp.fpext.v2float.v2double(<2 x float> undef, <2 x i1> undef, i32 undef) + %vp_v2f16_v2f32 = call <2 x float> @llvm.vp.fpext.v2f32.v2f16(<2 x half> undef, <2 x i1> undef, i32 undef) + %vp_v2f16_v2f64 = call <2 x double> @llvm.vp.fpext.v2f64.v2f16(<2 x half> undef, <2 x i1> undef, i32 undef) + %vp_v2f32_v2f64 = call <2 x double> @llvm.vp.fpext.v2f64.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef) %v4f16_v4f32 = fpext <4 x half> undef to <4 x float> %v4f16_v4f64 = fpext <4 x half> undef to <4 x double> %v4f32_v4f64 = fpext <4 x float> undef to <4 x double> - %vp_v4f16_v4f32 = call <4 x float> @llvm.vp.fpext.v4half.v4float(<4 x half> undef, <4 x i1> undef, i32 undef) - %vp_v4f16_v4f64 = call <4 x double> @llvm.vp.fpext.v4half.v4double(<4 x half> undef, <4 x i1> undef, i32 undef) - 
%vp_v4f32_v4f64 = call <4 x double> @llvm.vp.fpext.v4float.v4double(<4 x float> undef, <4 x i1> undef, i32 undef) + %vp_v4f16_v4f32 = call <4 x float> @llvm.vp.fpext.v4f32.v4f16(<4 x half> undef, <4 x i1> undef, i32 undef) + %vp_v4f16_v4f64 = call <4 x double> @llvm.vp.fpext.v4f64.v4f16(<4 x half> undef, <4 x i1> undef, i32 undef) + %vp_v4f32_v4f64 = call <4 x double> @llvm.vp.fpext.v4f64.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef) %v8f16_v8f32 = fpext <8 x half> undef to <8 x float> %v8f16_v8f64 = fpext <8 x half> undef to <8 x double> %v8f32_v8f64 = fpext <8 x float> undef to <8 x double> - %vp_v8f16_v8f32 = call <8 x float> @llvm.vp.fpext.v8half.v8float(<8 x half> undef, <8 x i1> undef, i32 undef) - %vp_v8f16_v8f64 = call <8 x double> @llvm.vp.fpext.v8half.v8double(<8 x half> undef, <8 x i1> undef, i32 undef) - %vp_v8f32_v8f64 = call <8 x double> @llvm.vp.fpext.v8float.v8double(<8 x float> undef, <8 x i1> undef, i32 undef) + %vp_v8f16_v8f32 = call <8 x float> @llvm.vp.fpext.v8f32.v8f16(<8 x half> undef, <8 x i1> undef, i32 undef) + %vp_v8f16_v8f64 = call <8 x double> @llvm.vp.fpext.v8f64.v8f16(<8 x half> undef, <8 x i1> undef, i32 undef) + %vp_v8f32_v8f64 = call <8 x double> @llvm.vp.fpext.v8f64.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef) %v16f16_v16f32 = fpext <16 x half> undef to <16 x float> %v16f16_v16f64 = fpext <16 x half> undef to <16 x double> %v16f32_v16f64 = fpext <16 x float> undef to <16 x double> - %vp_v16f16_v16f32 = call <16 x float> @llvm.vp.fpext.v16half.v16float(<16 x half> undef, <16 x i1> undef, i32 undef) - %vp_v16f16_v16f64 = call <16 x double> @llvm.vp.fpext.v16half.v16double(<16 x half> undef, <16 x i1> undef, i32 undef) - %vp_v16f32_v16f64 = call <16 x double> @llvm.vp.fpext.v16float.v16double(<16 x float> undef, <16 x i1> undef, i32 undef) + %vp_v16f16_v16f32 = call <16 x float> @llvm.vp.fpext.v16f32.v16f16(<16 x half> undef, <16 x i1> undef, i32 undef) + %vp_v16f16_v16f64 = call <16 x double> @llvm.vp.fpext.v16f64.v16f16(<16 x half> undef, <16 x i1> undef, i32 undef) + %vp_v16f32_v16f64 = call <16 x double> @llvm.vp.fpext.v16f64.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef) %v32f16_v32f32 = fpext <32 x half> undef to <32 x float> %v32f16_v32f64 = fpext <32 x half> undef to <32 x double> %v32f32_v32f64 = fpext <32 x float> undef to <32 x double> - %vp_v32f16_v32f32 = call <32 x float> @llvm.vp.fpext.v32half.v32float(<32 x half> undef, <32 x i1> undef, i32 undef) - %vp_v32f16_v32f64 = call <32 x double> @llvm.vp.fpext.v32half.v32double(<32 x half> undef, <32 x i1> undef, i32 undef) - %vp_v32f32_v32f64 = call <32 x double> @llvm.vp.fpext.v32float.v32double(<32 x float> undef, <32 x i1> undef, i32 undef) + %vp_v32f16_v32f32 = call <32 x float> @llvm.vp.fpext.v32f32.v32f16(<32 x half> undef, <32 x i1> undef, i32 undef) + %vp_v32f16_v32f64 = call <32 x double> @llvm.vp.fpext.v32f64.v32f16(<32 x half> undef, <32 x i1> undef, i32 undef) + %vp_v32f32_v32f64 = call <32 x double> @llvm.vp.fpext.v32f64.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef) %v64f16_v64f32 = fpext <64 x half> undef to <64 x float> %v64f16_v64f64 = fpext <64 x half> undef to <64 x double> %v64f32_v64f64 = fpext <64 x float> undef to <64 x double> - %vp_v64f16_v64f32 = call <64 x float> @llvm.vp.fpext.v64half.v64float(<64 x half> undef, <64 x i1> undef, i32 undef) - %vp_v64f16_v64f64 = call <64 x double> @llvm.vp.fpext.v64half.v64double(<64 x half> undef, <64 x i1> undef, i32 undef) - %vp_v64f32_v64f64 = call <64 x double> @llvm.vp.fpext.v64float.v64double(<64 x 
float> undef, <64 x i1> undef, i32 undef) + %vp_v64f16_v64f32 = call <64 x float> @llvm.vp.fpext.v64f32.v64f16(<64 x half> undef, <64 x i1> undef, i32 undef) + %vp_v64f16_v64f64 = call <64 x double> @llvm.vp.fpext.v64f64.v64f16(<64 x half> undef, <64 x i1> undef, i32 undef) + %vp_v64f32_v64f64 = call <64 x double> @llvm.vp.fpext.v64f64.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef) %v128f16_v128f32 = fpext <128 x half> undef to <128 x float> %v128f16_v128f64 = fpext <128 x half> undef to <128 x double> %v128f32_v128f64 = fpext <128 x float> undef to <128 x double> - %vp_v128f16_v128f32 = call <128 x float> @llvm.vp.fpext.v128half.v128float(<128 x half> undef, <128 x i1> undef, i32 undef) - %vp_v128f16_v128f64 = call <128 x double> @llvm.vp.fpext.v128half.v128double(<128 x half> undef, <128 x i1> undef, i32 undef) - %vp_v128f32_v128f64 = call <128 x double> @llvm.vp.fpext.v128float.v128double(<128 x float> undef, <128 x i1> undef, i32 undef) + %vp_v128f16_v128f32 = call <128 x float> @llvm.vp.fpext.v128f32.v128f16(<128 x half> undef, <128 x i1> undef, i32 undef) + %vp_v128f16_v128f64 = call <128 x double> @llvm.vp.fpext.v128f64.v128f16(<128 x half> undef, <128 x i1> undef, i32 undef) + %vp_v128f32_v128f64 = call <128 x double> @llvm.vp.fpext.v128f64.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef) %nxv1f16_nxv1f32 = fpext undef to %nxv1f16_nxv1f64 = fpext undef to %nxv1f32_nxv1f64 = fpext undef to - %vp_nxv1f16_nxv1f32 = call @llvm.vp.fpext.nxv1half.nxv1float( undef, undef, i32 undef) - %vp_nxv1f16_nxv1f64 = call @llvm.vp.fpext.nxv1half.nxv1double( undef, undef, i32 undef) - %vp_nxv1f32_nxv1f64 = call @llvm.vp.fpext.nxv1float.nxv1double( undef, undef, i32 undef) + %vp_nxv1f16_nxv1f32 = call @llvm.vp.fpext.nxv1f32.nxv1f16( undef, undef, i32 undef) + %vp_nxv1f16_nxv1f64 = call @llvm.vp.fpext.nxv1f64.nxv1f16( undef, undef, i32 undef) + %vp_nxv1f32_nxv1f64 = call @llvm.vp.fpext.nxv1f64.nxv1f32( undef, undef, i32 undef) %nxv2f16_nxv2f32 = fpext undef to %nxv2f16_nxv2f64 = fpext undef to %nxv2f32_nxv2f64 = fpext undef to - %vp_nxv2f16_nxv2f32 = call @llvm.vp.fpext.nxv2half.nxv2float( undef, undef, i32 undef) - %vp_nxv2f16_nxv2f64 = call @llvm.vp.fpext.nxv2half.nxv2double( undef, undef, i32 undef) - %vp_nxv2f32_nxv2f64 = call @llvm.vp.fpext.nxv2float.nxv2double( undef, undef, i32 undef) + %vp_nxv2f16_nxv2f32 = call @llvm.vp.fpext.nxv2f32.nxv2f16( undef, undef, i32 undef) + %vp_nxv2f16_nxv2f64 = call @llvm.vp.fpext.nxv2f64.nxv2f16( undef, undef, i32 undef) + %vp_nxv2f32_nxv2f64 = call @llvm.vp.fpext.nxv2f64.nxv2f32( undef, undef, i32 undef) %nxv4f16_nxv4f32 = fpext undef to %nxv4f16_nxv4f64 = fpext undef to %nxv4f32_nxv4f64 = fpext undef to - %vp_nxv4f16_nxv4f32 = call @llvm.vp.fpext.nxv4half.nxv4float( undef, undef, i32 undef) - %vp_nxv4f16_nxv4f64 = call @llvm.vp.fpext.nxv4half.nxv4double( undef, undef, i32 undef) - %vp_nxv4f32_nxv4f64 = call @llvm.vp.fpext.nxv4float.nxv4double( undef, undef, i32 undef) + %vp_nxv4f16_nxv4f32 = call @llvm.vp.fpext.nxv4f32.nxv4f16( undef, undef, i32 undef) + %vp_nxv4f16_nxv4f64 = call @llvm.vp.fpext.nxv4f64.nxv4f16( undef, undef, i32 undef) + %vp_nxv4f32_nxv4f64 = call @llvm.vp.fpext.nxv4f64.nxv4f32( undef, undef, i32 undef) %nxv8f16_nxv8f32 = fpext undef to %nxv8f16_nxv8f64 = fpext undef to %nxv8f32_nxv8f64 = fpext undef to - %vp_nxv8f16_nxv8f32 = call @llvm.vp.fpext.nxv8half.nxv8float( undef, undef, i32 undef) - %vp_nxv8f16_nxv8f64 = call @llvm.vp.fpext.nxv8half.nxv8double( undef, undef, i32 undef) - %vp_nxv8f32_nxv8f64 = call 
<vscale x 8 x double> @llvm.vp.fpext.nxv8float.nxv8double(<vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef) + %vp_nxv8f16_nxv8f32 = call <vscale x 8 x float> @llvm.vp.fpext.nxv8f32.nxv8f16(<vscale x 8 x half> undef, <vscale x 8 x i1> undef, i32 undef) + %vp_nxv8f16_nxv8f64 = call <vscale x 8 x double> @llvm.vp.fpext.nxv8f64.nxv8f16(<vscale x 8 x half> undef, <vscale x 8 x i1> undef, i32 undef) + %vp_nxv8f32_nxv8f64 = call <vscale x 8 x double> @llvm.vp.fpext.nxv8f64.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef) %nxv16f16_nxv16f32 = fpext <vscale x 16 x half> undef to <vscale x 16 x float> %nxv16f16_nxv16f64 = fpext <vscale x 16 x half> undef to <vscale x 16 x double> %nxv16f32_nxv16f64 = fpext <vscale x 16 x float> undef to <vscale x 16 x double> - %vp_nxv16f16_nxv16f32 = call <vscale x 16 x float> @llvm.vp.fpext.nxv16half.nxv16float(<vscale x 16 x half> undef, <vscale x 16 x i1> undef, i32 undef) - %vp_nxv16f16_nxv16f64 = call <vscale x 16 x double> @llvm.vp.fpext.nxv16half.nxv16double(<vscale x 16 x half> undef, <vscale x 16 x i1> undef, i32 undef) - %vp_nxv16f32_nxv16f64 = call <vscale x 16 x double> @llvm.vp.fpext.nxv16float.nxv16double(<vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef) + %vp_nxv16f16_nxv16f32 = call <vscale x 16 x float> @llvm.vp.fpext.nxv16f32.nxv16f16(<vscale x 16 x half> undef, <vscale x 16 x i1> undef, i32 undef) + %vp_nxv16f16_nxv16f64 = call <vscale x 16 x double> @llvm.vp.fpext.nxv16f64.nxv16f16(<vscale x 16 x half> undef, <vscale x 16 x i1> undef, i32 undef) + %vp_nxv16f32_nxv16f64 = call <vscale x 16 x double> @llvm.vp.fpext.nxv16f64.nxv16f32(<vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef) %nxv32f16_nxv32f32 = fpext <vscale x 32 x half> undef to <vscale x 32 x float> %nxv32f16_nxv32f64 = fpext <vscale x 32 x half> undef to <vscale x 32 x double> %nxv32f32_nxv32f64 = fpext <vscale x 32 x float> undef to <vscale x 32 x double> - %vp_nxv32f16_nxv32f32 = call <vscale x 32 x float> @llvm.vp.fpext.nxv32half.nxv32float(<vscale x 32 x half> undef, <vscale x 32 x i1> undef, i32 undef) - %vp_nxv32f16_nxv32f64 = call <vscale x 32 x double> @llvm.vp.fpext.nxv32half.nxv32double(<vscale x 32 x half> undef, <vscale x 32 x i1> undef, i32 undef) - %vp_nxv32f32_nxv32f64 = call <vscale x 32 x double> @llvm.vp.fpext.nxv32float.nxv32double(<vscale x 32 x float> undef, <vscale x 32 x i1> undef, i32 undef) + %vp_nxv32f16_nxv32f32 = call <vscale x 32 x float> @llvm.vp.fpext.nxv32f32.nxv32f16(<vscale x 32 x half> undef, <vscale x 32 x i1> undef, i32 undef) + %vp_nxv32f16_nxv32f64 = call <vscale x 32 x double> @llvm.vp.fpext.nxv32f64.nxv32f16(<vscale x 32 x half> undef, <vscale x 32 x i1> undef, i32 undef) + %vp_nxv32f32_nxv32f64 = call <vscale x 32 x double> @llvm.vp.fpext.nxv32f64.nxv32f32(<vscale x 32 x float> undef, <vscale x 32 x i1> undef, i32 undef) %nxv64f16_nxv64f32 = fpext <vscale x 64 x half> undef to <vscale x 64 x float> %nxv64f16_nxv64f64 = fpext <vscale x 64 x half> undef to <vscale x 64 x double> %nxv64f32_nxv64f64 = fpext <vscale x 64 x float> undef to <vscale x 64 x double> - %vp_nxv64f16_nxv64f32 = call <vscale x 64 x float> @llvm.vp.fpext.nxv64half.nxv64float(<vscale x 64 x half> undef, <vscale x 64 x i1> undef, i32 undef) - %vp_nxv64f16_nxv64f64 = call <vscale x 64 x double> @llvm.vp.fpext.nxv64half.nxv64double(<vscale x 64 x half> undef, <vscale x 64 x i1> undef, i32 undef) - %vp_nxv64f32_nxv64f64 = call <vscale x 64 x double> @llvm.vp.fpext.nxv64float.nxv64double(<vscale x 64 x float> undef, <vscale x 64 x i1> undef, i32 undef) + %vp_nxv64f16_nxv64f32 = call <vscale x 64 x float> @llvm.vp.fpext.nxv64f32.nxv64f16(<vscale x 64 x half> undef, <vscale x 64 x i1> undef, i32 undef) + %vp_nxv64f16_nxv64f64 = call <vscale x 64 x double> @llvm.vp.fpext.nxv64f64.nxv64f16(<vscale x 64 x half> undef, <vscale x 64 x i1> undef, i32 undef) + %vp_nxv64f32_nxv64f64 = call <vscale x 64 x double> @llvm.vp.fpext.nxv64f64.nxv64f32(<vscale x 64 x float> undef, <vscale x 64 x i1> undef, i32 undef) ret void } @@ -3274,113 +3274,113 @@ define void @fptrunc() { %v2f64_v2f16 = fptrunc <2 x double> undef to <2 x half> %v2f64_v2f32 = fptrunc <2 x double> undef to <2 x float> - %vp_v2f32_v2f16 = call <2 x half> @llvm.vp.fptrunc.v2float.v2half(<2 x float> undef, <2 x i1> undef, i32 undef) - %vp_v2f64_v2f16 = call <2 x half> @llvm.vp.fptrunc.v2double.v2half(<2 x double> undef, <2 x i1> undef, i32 undef) - %vp_v2f64_v2f32 = call <2 x float> @llvm.vp.fptrunc.v2double.v2float(<2 x double> undef, <2 x i1> undef, i32 undef) + %vp_v2f32_v2f16 = call <2 x half> @llvm.vp.fptrunc.v2f16.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef) + %vp_v2f64_v2f16 = call <2 x half> @llvm.vp.fptrunc.v2f16.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef) + %vp_v2f64_v2f32 = call <2 x float> @llvm.vp.fptrunc.v2f32.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef) %v4f32_v4f16 = fptrunc <4 x float> undef to <4 x half> %v4f64_v4f16 = fptrunc <4 x double> undef to <4 x half> %v4f64_v4f32 = fptrunc <4 x double> undef to <4 x float> - %vp_v4f32_v4f16 = call <4 x half> @llvm.vp.fptrunc.v4float.v4half(<4 x float> undef, <4 x i1> undef, i32
undef) - %vp_v4f64_v4f32 = call <4 x float> @llvm.vp.fptrunc.v4double.v4float(<4 x double> undef, <4 x i1> undef, i32 undef) + %vp_v4f32_v4f16 = call <4 x half> @llvm.vp.fptrunc.v4f16.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef) + %vp_v4f64_v4f16 = call <4 x half> @llvm.vp.fptrunc.v4f16.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef) + %vp_v4f64_v4f32 = call <4 x float> @llvm.vp.fptrunc.v4f32.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef) %v8f32_v8f16 = fptrunc <8 x float> undef to <8 x half> %v8f64_v8f16 = fptrunc <8 x double> undef to <8 x half> %v8f64_v8f32 = fptrunc <8 x double> undef to <8 x float> - %vp_v8f32_v8f16 = call <8 x half> @llvm.vp.fptrunc.v8float.v8half(<8 x float> undef, <8 x i1> undef, i32 undef) - %vp_v8f64_v8f16 = call <8 x half> @llvm.vp.fptrunc.v8double.v8half(<8 x double> undef, <8 x i1> undef, i32 undef) - %vp_v8f64_v8f32 = call <8 x float> @llvm.vp.fptrunc.v8double.v8float(<8 x double> undef, <8 x i1> undef, i32 undef) + %vp_v8f32_v8f16 = call <8 x half> @llvm.vp.fptrunc.v8f16.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef) + %vp_v8f64_v8f16 = call <8 x half> @llvm.vp.fptrunc.v8f16.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef) + %vp_v8f64_v8f32 = call <8 x float> @llvm.vp.fptrunc.v8f32.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef) %v16f32_v16f16 = fptrunc <16 x float> undef to <16 x half> %v16f64_v16f16 = fptrunc <16 x double> undef to <16 x half> %v16f64_v16f32 = fptrunc <16 x double> undef to <16 x float> - %vp_v16f32_v16f16 = call <16 x half> @llvm.vp.fptrunc.v16float.v16half(<16 x float> undef, <16 x i1> undef, i32 undef) - %vp_v16f64_v16f16 = call <16 x half> @llvm.vp.fptrunc.v16double.v16half(<16 x double> undef, <16 x i1> undef, i32 undef) - %vp_v16f64_v16f32 = call <16 x float> @llvm.vp.fptrunc.v16double.v16float(<16 x double> undef, <16 x i1> undef, i32 undef) + %vp_v16f32_v16f16 = call <16 x half> @llvm.vp.fptrunc.v16f16.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef) + %vp_v16f64_v16f16 = call <16 x half> @llvm.vp.fptrunc.v16f16.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef) + %vp_v16f64_v16f32 = call <16 x float> @llvm.vp.fptrunc.v16f32.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef) %v32f32_v32f16 = fptrunc <32 x float> undef to <32 x half> %v32f64_v32f16 = fptrunc <32 x double> undef to <32 x half> %v32f64_v32f32 = fptrunc <32 x double> undef to <32 x float> - %vp_v32f32_v32f16 = call <32 x half> @llvm.vp.fptrunc.v32float.v32half(<32 x float> undef, <32 x i1> undef, i32 undef) - %vp_v32f64_v32f16 = call <32 x half> @llvm.vp.fptrunc.v32double.v32half(<32 x double> undef, <32 x i1> undef, i32 undef) - %vp_v32f64_v32f32 = call <32 x float> @llvm.vp.fptrunc.v32double.v32float(<32 x double> undef, <32 x i1> undef, i32 undef) + %vp_v32f32_v32f16 = call <32 x half> @llvm.vp.fptrunc.v32f16.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef) + %vp_v32f64_v32f16 = call <32 x half> @llvm.vp.fptrunc.v32f16.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef) + %vp_v32f64_v32f32 = call <32 x float> @llvm.vp.fptrunc.v32f32.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef) %v64f32_v64f16 = fptrunc <64 x float> undef to <64 x half> %v64f64_v64f16 = fptrunc <64 x double> undef to <64 x half> %v64f64_v64f32 = fptrunc <64 x double> undef to <64 x float> - %vp_v64f32_v64f16 = call <64 x half> @llvm.vp.fptrunc.v64float.v64half(<64 x float> undef, <64 x i1> undef, i32 undef) - %vp_v64f64_v64f16 = call <64 x half> @llvm.vp.fptrunc.v64double.v64half(<64 x double> undef, <64 x i1> undef, i32 undef) - 
%vp_v64f64_v64f32 = call <64 x float> @llvm.vp.fptrunc.v64double.v64float(<64 x double> undef, <64 x i1> undef, i32 undef) + %vp_v64f32_v64f16 = call <64 x half> @llvm.vp.fptrunc.v64f16.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef) + %vp_v64f64_v64f16 = call <64 x half> @llvm.vp.fptrunc.v64f16.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef) + %vp_v64f64_v64f32 = call <64 x float> @llvm.vp.fptrunc.v64f32.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef) %v128f32_v128f16 = fptrunc <128 x float> undef to <128 x half> %v128f64_v128f16 = fptrunc <128 x double> undef to <128 x half> %v128f64_v128f32 = fptrunc <128 x double> undef to <128 x float> - %vp_v128f32_v128f16 = call <128 x half> @llvm.vp.fptrunc.v128float.v128half(<128 x float> undef, <128 x i1> undef, i32 undef) - %vp_v128f64_v128f16 = call <128 x half> @llvm.vp.fptrunc.v128double.v128half(<128 x double> undef, <128 x i1> undef, i32 undef) - %vp_v128f64_v128f32 = call <128 x float> @llvm.vp.fptrunc.v128double.v128float(<128 x double> undef, <128 x i1> undef, i32 undef) + %vp_v128f32_v128f16 = call <128 x half> @llvm.vp.fptrunc.v128f16.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef) + %vp_v128f64_v128f16 = call <128 x half> @llvm.vp.fptrunc.v128f16.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef) + %vp_v128f64_v128f32 = call <128 x float> @llvm.vp.fptrunc.v128f32.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef) %nxv1f32_nxv1f16 = fptrunc undef to %nxv1f64_nxv1f16 = fptrunc undef to %nxv1f64_nxv1f32 = fptrunc undef to - %vp_nxv1f32_nxv1f16 = call @llvm.vp.fptrunc.nxv1float.nxv1half( undef, undef, i32 undef) - %vp_nxv1f64_nxv1f16 = call @llvm.vp.fptrunc.nxv1double.nxv1half( undef, undef, i32 undef) - %vp_nxv1f64_nxv1f32 = call @llvm.vp.fptrunc.nxv1double.nxv1float( undef, undef, i32 undef) + %vp_nxv1f32_nxv1f16 = call @llvm.vp.fptrunc.nxv1f16.nxv1f32( undef, undef, i32 undef) + %vp_nxv1f64_nxv1f16 = call @llvm.vp.fptrunc.nxv1f16.nxv1f64( undef, undef, i32 undef) + %vp_nxv1f64_nxv1f32 = call @llvm.vp.fptrunc.nxv1f32.nxv1f64( undef, undef, i32 undef) %nxv2f32_nxv1f16 = fptrunc undef to %nxv2f64_nxv1f16 = fptrunc undef to %nxv2f64_nxv1f32 = fptrunc undef to - %vp_nxv2f32_nxv2f16 = call @llvm.vp.fptrunc.nxv2float.nxv2half( undef, undef, i32 undef) - %vp_nxv2f64_nxv2f16 = call @llvm.vp.fptrunc.nxv2double.nxv2half( undef, undef, i32 undef) - %vp_nxv2f64_nxv2f32 = call @llvm.vp.fptrunc.nxv2double.nxv2float( undef, undef, i32 undef) + %vp_nxv2f32_nxv2f16 = call @llvm.vp.fptrunc.nxv2f16.nxv2f32( undef, undef, i32 undef) + %vp_nxv2f64_nxv2f16 = call @llvm.vp.fptrunc.nxv2f16.nxv2f64( undef, undef, i32 undef) + %vp_nxv2f64_nxv2f32 = call @llvm.vp.fptrunc.nxv2f32.nxv2f64( undef, undef, i32 undef) %nxv4f32_nxv4f16 = fptrunc undef to %nxv4f64_nxv4f16 = fptrunc undef to %nxv4f64_nxv4f32 = fptrunc undef to - %vp_nxv4f32_nxv4f16 = call @llvm.vp.fptrunc.nxv4float.nxv4half( undef, undef, i32 undef) - %vp_nxv4f64_nxv4f16 = call @llvm.vp.fptrunc.nxv4double.nxv4half( undef, undef, i32 undef) - %vp_nxv4f64_nxv4f32 = call @llvm.vp.fptrunc.nxv4double.nxv4float( undef, undef, i32 undef) + %vp_nxv4f32_nxv4f16 = call @llvm.vp.fptrunc.nxv4f16.nxv4f32( undef, undef, i32 undef) + %vp_nxv4f64_nxv4f16 = call @llvm.vp.fptrunc.nxv4f16.nxv4f64( undef, undef, i32 undef) + %vp_nxv4f64_nxv4f32 = call @llvm.vp.fptrunc.nxv4f32.nxv4f64( undef, undef, i32 undef) %nxv8f32_nxv8f16 = fptrunc undef to %nxv8f64_nxv8f16 = fptrunc undef to %nxv8f64_nxv8f32 = fptrunc undef to - %vp_nxv8f32_nxv8f16 = call 
@llvm.vp.fptrunc.nxv8float.nxv8half( undef, undef, i32 undef) - %vp_nxv8f64_nxv8f16 = call @llvm.vp.fptrunc.nxv8double.nxv8half( undef, undef, i32 undef) - %vp_nxv8f64_nxv8f32 = call @llvm.vp.fptrunc.nxv8double.nxv8float( undef, undef, i32 undef) + %vp_nxv8f32_nxv8f16 = call @llvm.vp.fptrunc.nxv8f16.nxv8f32( undef, undef, i32 undef) + %vp_nxv8f64_nxv8f16 = call @llvm.vp.fptrunc.nxv8f16.nxv8f64( undef, undef, i32 undef) + %vp_nxv8f64_nxv8f32 = call @llvm.vp.fptrunc.nxv8f32.nxv8f64( undef, undef, i32 undef) %nxv16f32_nxv16f16 = fptrunc undef to %nxv16f64_nxv16f16 = fptrunc undef to %nxv16f64_nxv16f32 = fptrunc undef to - %vp_nxv16f32_nxv16f16 = call @llvm.vp.fptrunc.nxv16float.nxv16half( undef, undef, i32 undef) - %vp_nxv16f64_nxv16f16 = call @llvm.vp.fptrunc.nxv16double.nxv16half( undef, undef, i32 undef) - %vp_nxv16f64_nxv16f32 = call @llvm.vp.fptrunc.nxv16double.nxv16float( undef, undef, i32 undef) + %vp_nxv16f32_nxv16f16 = call @llvm.vp.fptrunc.nxv16f16.nxv16f32( undef, undef, i32 undef) + %vp_nxv16f64_nxv16f16 = call @llvm.vp.fptrunc.nxv16f16.nxv16f64( undef, undef, i32 undef) + %vp_nxv16f64_nxv16f32 = call @llvm.vp.fptrunc.nxv16f32.nxv16f64( undef, undef, i32 undef) %nxv32f32_nxv32f16 = fptrunc undef to %nxv32f64_nxv32f16 = fptrunc undef to %nxv32f64_nxv32f32 = fptrunc undef to - %vp_nxv32f32_nxv32f16 = call @llvm.vp.fptrunc.nxv32float.nxv32half( undef, undef, i32 undef) - %vp_nxv32f64_nxv32f16 = call @llvm.vp.fptrunc.nxv32double.nxv32half( undef, undef, i32 undef) - %vp_nxv32f64_nxv32f32 = call @llvm.vp.fptrunc.nxv32double.nxv32float( undef, undef, i32 undef) + %vp_nxv32f32_nxv32f16 = call @llvm.vp.fptrunc.nxv32f16.nxv32f32( undef, undef, i32 undef) + %vp_nxv32f64_nxv32f16 = call @llvm.vp.fptrunc.nxv32f16.nxv32f64( undef, undef, i32 undef) + %vp_nxv32f64_nxv32f32 = call @llvm.vp.fptrunc.nxv32f32.nxv32f64( undef, undef, i32 undef) %nxv64f32_nxv64f16 = fptrunc undef to %nxv64f64_nxv64f16 = fptrunc undef to %nxv64f64_nxv64f32 = fptrunc undef to - %vp_nxv64f32_nxv64f16 = call @llvm.vp.fptrunc.nxv64float.nxv64half( undef, undef, i32 undef) - %vp_nxv64f64_nxv64f16 = call @llvm.vp.fptrunc.nxv64double.nxv64half( undef, undef, i32 undef) - %vp_nxv64f64_nxv64f32 = call @llvm.vp.fptrunc.nxv64double.nxv64float( undef, undef, i32 undef) + %vp_nxv64f32_nxv64f16 = call @llvm.vp.fptrunc.nxv64f16.nxv64f32( undef, undef, i32 undef) + %vp_nxv64f64_nxv64f16 = call @llvm.vp.fptrunc.nxv64f16.nxv64f64( undef, undef, i32 undef) + %vp_nxv64f64_nxv64f32 = call @llvm.vp.fptrunc.nxv64f32.nxv64f64( undef, undef, i32 undef) ret void } @@ -3963,16 +3963,16 @@ define void @fptosi() { %v2f32_v2i1 = fptosi <2 x float> undef to <2 x i1> %v2f64_v2i1 = fptosi <2 x double> undef to <2 x i1> - %vp_v2f32_v2i8 = call <2 x i8> @llvm.vp.fptosi.v2float.v2i8(<2 x float> undef, <2 x i1> undef, i32 undef) - %vp_v2f64_v2i8 = call <2 x i8> @llvm.vp.fptosi.v2double.v2i8(<2 x double> undef, <2 x i1> undef, i32 undef) - %vp_v2f32_v2i16 = call <2 x i16> @llvm.vp.fptosi.v2float.v2i16(<2 x float> undef, <2 x i1> undef, i32 undef) - %vp_v2f64_v2i16 = call <2 x i16> @llvm.vp.fptosi.v2double.v2i16(<2 x double> undef, <2 x i1> undef, i32 undef) - %vp_v2f32_v2i32 = call <2 x i32> @llvm.vp.fptosi.v2float.v2i32(<2 x float> undef, <2 x i1> undef, i32 undef) - %vp_v2f64_v2i32 = call <2 x i32> @llvm.vp.fptosi.v2double.v2i32(<2 x double> undef, <2 x i1> undef, i32 undef) - %vp_v2f32_v2i64 = call <2 x i64> @llvm.vp.fptosi.v2float.v2i64(<2 x float> undef, <2 x i1> undef, i32 undef) - %vp_v2f64_v2i64 = call <2 x i64> @llvm.vp.fptosi.v2double.v2i64(<2 x 
double> undef, <2 x i1> undef, i32 undef) - %vp_v2f32_v2i1 = call <2 x i1> @llvm.vp.fptosi.v2float.v2i1(<2 x float> undef, <2 x i1> undef, i32 undef) - %vp_v2f64_v2i1 = call <2 x i1> @llvm.vp.fptosi.v2double.v2i1(<2 x double> undef, <2 x i1> undef, i32 undef) + %vp_v2f32_v2i8 = call <2 x i8> @llvm.vp.fptosi.v2i8.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef) + %vp_v2f64_v2i8 = call <2 x i8> @llvm.vp.fptosi.v2i8.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef) + %vp_v2f32_v2i16 = call <2 x i16> @llvm.vp.fptosi.v2i16.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef) + %vp_v2f64_v2i16 = call <2 x i16> @llvm.vp.fptosi.v2i16.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef) + %vp_v2f32_v2i32 = call <2 x i32> @llvm.vp.fptosi.v2i32.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef) + %vp_v2f64_v2i32 = call <2 x i32> @llvm.vp.fptosi.v2i32.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef) + %vp_v2f32_v2i64 = call <2 x i64> @llvm.vp.fptosi.v2i64.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef) + %vp_v2f64_v2i64 = call <2 x i64> @llvm.vp.fptosi.v2i64.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef) + %vp_v2f32_v2i1 = call <2 x i1> @llvm.vp.fptosi.v2i1.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef) + %vp_v2f64_v2i1 = call <2 x i1> @llvm.vp.fptosi.v2i1.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef) %v4f32_v4i8 = fptosi <4 x float> undef to <4 x i8> %v4f64_v4i8 = fptosi <4 x double> undef to <4 x i8> @@ -3985,16 +3985,16 @@ define void @fptosi() { %v4f32_v4i1 = fptosi <4 x float> undef to <4 x i1> %v4f64_v4i1 = fptosi <4 x double> undef to <4 x i1> - %vp_v4f32_v4i8 = call <4 x i8> @llvm.vp.fptosi.v4float.v4i8(<4 x float> undef, <4 x i1> undef, i32 undef) - %vp_v4f64_v4i8 = call <4 x i8> @llvm.vp.fptosi.v4double.v4i8(<4 x double> undef, <4 x i1> undef, i32 undef) - %vp_v4f32_v4i16 = call <4 x i16> @llvm.vp.fptosi.v4float.v4i16(<4 x float> undef, <4 x i1> undef, i32 undef) - %vp_v4f64_v4i16 = call <4 x i16> @llvm.vp.fptosi.v4double.v4i16(<4 x double> undef, <4 x i1> undef, i32 undef) - %vp_v4f32_v4i32 = call <4 x i32> @llvm.vp.fptosi.v4float.v4i32(<4 x float> undef, <4 x i1> undef, i32 undef) - %vp_v4f64_v4i32 = call <4 x i32> @llvm.vp.fptosi.v4double.v4i32(<4 x double> undef, <4 x i1> undef, i32 undef) - %vp_v4f32_v4i64 = call <4 x i64> @llvm.vp.fptosi.v4float.v4i64(<4 x float> undef, <4 x i1> undef, i32 undef) - %vp_v4f64_v4i64 = call <4 x i64> @llvm.vp.fptosi.v4double.v4i64(<4 x double> undef, <4 x i1> undef, i32 undef) - %vp_v4f32_v4i1 = call <4 x i1> @llvm.vp.fptosi.v4float.v4i1(<4 x float> undef, <4 x i1> undef, i32 undef) - %vp_v4f64_v4i1 = call <4 x i1> @llvm.vp.fptosi.v4double.v4i1(<4 x double> undef, <4 x i1> undef, i32 undef) + %vp_v4f32_v4i8 = call <4 x i8> @llvm.vp.fptosi.v4i8.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef) + %vp_v4f64_v4i8 = call <4 x i8> @llvm.vp.fptosi.v4i8.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef) + %vp_v4f32_v4i16 = call <4 x i16> @llvm.vp.fptosi.v4i16.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef) + %vp_v4f64_v4i16 = call <4 x i16> @llvm.vp.fptosi.v4i16.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef) + %vp_v4f32_v4i32 = call <4 x i32> @llvm.vp.fptosi.v4i32.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef) + %vp_v4f64_v4i32 = call <4 x i32> @llvm.vp.fptosi.v4i32.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef) + %vp_v4f32_v4i64 = call <4 x i64> @llvm.vp.fptosi.v4i64.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef) + %vp_v4f64_v4i64 = call <4 x i64> @llvm.vp.fptosi.v4i64.v4f64(<4 x double> undef, <4 x 
i1> undef, i32 undef) + %vp_v4f32_v4i1 = call <4 x i1> @llvm.vp.fptosi.v4i1.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef) + %vp_v4f64_v4i1 = call <4 x i1> @llvm.vp.fptosi.v4i1.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef) %v8f32_v8i8 = fptosi <8 x float> undef to <8 x i8> %v8f64_v8i8 = fptosi <8 x double> undef to <8 x i8> @@ -4007,16 +4007,16 @@ define void @fptosi() { %v8f32_v8i1 = fptosi <8 x float> undef to <8 x i1> %v8f64_v8i1 = fptosi <8 x double> undef to <8 x i1> - %vp_v8f32_v8i8 = call <8 x i8> @llvm.vp.fptosi.v8float.v8i8(<8 x float> undef, <8 x i1> undef, i32 undef) - %vp_v8f64_v8i8 = call <8 x i8> @llvm.vp.fptosi.v8double.v8i8(<8 x double> undef, <8 x i1> undef, i32 undef) - %vp_v8f32_v8i16 = call <8 x i16> @llvm.vp.fptosi.v8float.v8i16(<8 x float> undef, <8 x i1> undef, i32 undef) - %vp_v8f64_v8i16 = call <8 x i16> @llvm.vp.fptosi.v8double.v8i16(<8 x double> undef, <8 x i1> undef, i32 undef) - %vp_v8f32_v8i32 = call <8 x i32> @llvm.vp.fptosi.v8float.v8i32(<8 x float> undef, <8 x i1> undef, i32 undef) - %vp_v8f64_v8i32 = call <8 x i32> @llvm.vp.fptosi.v8double.v8i32(<8 x double> undef, <8 x i1> undef, i32 undef) - %vp_v8f32_v8i64 = call <8 x i64> @llvm.vp.fptosi.v8float.v8i64(<8 x float> undef, <8 x i1> undef, i32 undef) - %vp_v8f64_v8i64 = call <8 x i64> @llvm.vp.fptosi.v8double.v8i64(<8 x double> undef, <8 x i1> undef, i32 undef) - %vp_v8f32_v8i1 = call <8 x i1> @llvm.vp.fptosi.v8float.v8i1(<8 x float> undef, <8 x i1> undef, i32 undef) - %vp_v8f64_v8i1 = call <8 x i1> @llvm.vp.fptosi.v8double.v8i1(<8 x double> undef, <8 x i1> undef, i32 undef) + %vp_v8f32_v8i8 = call <8 x i8> @llvm.vp.fptosi.v8i8.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef) + %vp_v8f64_v8i8 = call <8 x i8> @llvm.vp.fptosi.v8i8.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef) + %vp_v8f32_v8i16 = call <8 x i16> @llvm.vp.fptosi.v8i16.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef) + %vp_v8f64_v8i16 = call <8 x i16> @llvm.vp.fptosi.v8i16.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef) + %vp_v8f32_v8i32 = call <8 x i32> @llvm.vp.fptosi.v8i32.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef) + %vp_v8f64_v8i32 = call <8 x i32> @llvm.vp.fptosi.v8i32.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef) + %vp_v8f32_v8i64 = call <8 x i64> @llvm.vp.fptosi.v8i64.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef) + %vp_v8f64_v8i64 = call <8 x i64> @llvm.vp.fptosi.v8i64.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef) + %vp_v8f32_v8i1 = call <8 x i1> @llvm.vp.fptosi.v8i1.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef) + %vp_v8f64_v8i1 = call <8 x i1> @llvm.vp.fptosi.v8i1.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef) %v16f32_v16i8 = fptosi <16 x float> undef to <16 x i8> %v16f64_v16i8 = fptosi <16 x double> undef to <16 x i8> @@ -4029,16 +4029,16 @@ define void @fptosi() { %v16f32_v16i1 = fptosi <16 x float> undef to <16 x i1> %v16f64_v16i1 = fptosi <16 x double> undef to <16 x i1> - %vp_v16f32_v16i8 = call <16 x i8> @llvm.vp.fptosi.v16float.v16i8(<16 x float> undef, <16 x i1> undef, i32 undef) - %vp_v16f64_v16i8 = call <16 x i8> @llvm.vp.fptosi.v16double.v16i8(<16 x double> undef, <16 x i1> undef, i32 undef) - %vp_v16f32_v16i16 = call <16 x i16> @llvm.vp.fptosi.v16float.v16i16(<16 x float> undef, <16 x i1> undef, i32 undef) - %vp_v16f64_v16i16 = call <16 x i16> @llvm.vp.fptosi.v16double.v16i16(<16 x double> undef, <16 x i1> undef, i32 undef) - %vp_v16f32_v16i32 = call <16 x i32> @llvm.vp.fptosi.v16float.v16i32(<16 x float> undef, <16 x i1> undef, i32 undef) - 
%vp_v16f64_v16i32 = call <16 x i32> @llvm.vp.fptosi.v16double.v16i32(<16 x double> undef, <16 x i1> undef, i32 undef) - %vp_v16f32_v16i64 = call <16 x i64> @llvm.vp.fptosi.v16float.v16i64(<16 x float> undef, <16 x i1> undef, i32 undef) - %vp_v16f64_v16i64 = call <16 x i64> @llvm.vp.fptosi.v16double.v16i64(<16 x double> undef, <16 x i1> undef, i32 undef) - %vp_v16f32_v16i1 = call <16 x i1> @llvm.vp.fptosi.v16float.v16i1(<16 x float> undef, <16 x i1> undef, i32 undef) - %vp_v16f64_v16i1 = call <16 x i1> @llvm.vp.fptosi.v16double.v16i1(<16 x double> undef, <16 x i1> undef, i32 undef) + %vp_v16f32_v16i8 = call <16 x i8> @llvm.vp.fptosi.v16i8.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef) + %vp_v16f64_v16i8 = call <16 x i8> @llvm.vp.fptosi.v16i8.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef) + %vp_v16f32_v16i16 = call <16 x i16> @llvm.vp.fptosi.v16i16.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef) + %vp_v16f64_v16i16 = call <16 x i16> @llvm.vp.fptosi.v16i16.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef) + %vp_v16f32_v16i32 = call <16 x i32> @llvm.vp.fptosi.v16i32.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef) + %vp_v16f64_v16i32 = call <16 x i32> @llvm.vp.fptosi.v16i32.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef) + %vp_v16f32_v16i64 = call <16 x i64> @llvm.vp.fptosi.v16i64.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef) + %vp_v16f64_v16i64 = call <16 x i64> @llvm.vp.fptosi.v16i64.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef) + %vp_v16f32_v16i1 = call <16 x i1> @llvm.vp.fptosi.v16i1.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef) + %vp_v16f64_v16i1 = call <16 x i1> @llvm.vp.fptosi.v16i1.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef) %v32f32_v32i8 = fptosi <32 x float> undef to <32 x i8> %v32f64_v32i8 = fptosi <32 x double> undef to <32 x i8> @@ -4051,16 +4051,16 @@ define void @fptosi() { %v32f32_v32i1 = fptosi <32 x float> undef to <32 x i1> %v32f64_v32i1 = fptosi <32 x double> undef to <32 x i1> - %vp_v32f32_v32i8 = call <32 x i8> @llvm.vp.fptosi.v32float.v32i8(<32 x float> undef, <32 x i1> undef, i32 undef) - %vp_v32f64_v32i8 = call <32 x i8> @llvm.vp.fptosi.v32double.v32i8(<32 x double> undef, <32 x i1> undef, i32 undef) - %vp_v32f32_v32i16 = call <32 x i16> @llvm.vp.fptosi.v32float.v32i16(<32 x float> undef, <32 x i1> undef, i32 undef) - %vp_v32f64_v32i16 = call <32 x i16> @llvm.vp.fptosi.v32double.v32i16(<32 x double> undef, <32 x i1> undef, i32 undef) - %vp_v32f32_v32i32 = call <32 x i32> @llvm.vp.fptosi.v32float.v32i32(<32 x float> undef, <32 x i1> undef, i32 undef) - %vp_v32f64_v32i32 = call <32 x i32> @llvm.vp.fptosi.v32double.v32i32(<32 x double> undef, <32 x i1> undef, i32 undef) - %vp_v32f32_v32i64 = call <32 x i64> @llvm.vp.fptosi.v32float.v32i64(<32 x float> undef, <32 x i1> undef, i32 undef) - %vp_v32f64_v32i64 = call <32 x i64> @llvm.vp.fptosi.v32double.v32i64(<32 x double> undef, <32 x i1> undef, i32 undef) - %vp_v32f32_v32i1 = call <32 x i1> @llvm.vp.fptosi.v32float.v32i1(<32 x float> undef, <32 x i1> undef, i32 undef) - %vp_v32f64_v32i1 = call <32 x i1> @llvm.vp.fptosi.v32double.v32i1(<32 x double> undef, <32 x i1> undef, i32 undef) + %vp_v32f32_v32i8 = call <32 x i8> @llvm.vp.fptosi.v32i8.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef) + %vp_v32f64_v32i8 = call <32 x i8> @llvm.vp.fptosi.v32i8.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef) + %vp_v32f32_v32i16 = call <32 x i16> @llvm.vp.fptosi.v32i16.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef) + %vp_v32f64_v32i16 = 
call <32 x i16> @llvm.vp.fptosi.v32i16.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef) + %vp_v32f32_v32i32 = call <32 x i32> @llvm.vp.fptosi.v32i32.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef) + %vp_v32f64_v32i32 = call <32 x i32> @llvm.vp.fptosi.v32i32.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef) + %vp_v32f32_v32i64 = call <32 x i64> @llvm.vp.fptosi.v32i64.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef) + %vp_v32f64_v32i64 = call <32 x i64> @llvm.vp.fptosi.v32i64.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef) + %vp_v32f32_v32i1 = call <32 x i1> @llvm.vp.fptosi.v32i1.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef) + %vp_v32f64_v32i1 = call <32 x i1> @llvm.vp.fptosi.v32i1.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef) %v64f32_v64i8 = fptosi <64 x float> undef to <64 x i8> %v64f64_v64i8 = fptosi <64 x double> undef to <64 x i8> @@ -4073,16 +4073,16 @@ define void @fptosi() { %v64f32_v64i1 = fptosi <64 x float> undef to <64 x i1> %v64f64_v64i1 = fptosi <64 x double> undef to <64 x i1> - %vp_v64f32_v64i8 = call <64 x i8> @llvm.vp.fptosi.v64float.v64i8(<64 x float> undef, <64 x i1> undef, i32 undef) - %vp_v64f64_v64i8 = call <64 x i8> @llvm.vp.fptosi.v64double.v64i8(<64 x double> undef, <64 x i1> undef, i32 undef) - %vp_v64f32_v64i16 = call <64 x i16> @llvm.vp.fptosi.v64float.v64i16(<64 x float> undef, <64 x i1> undef, i32 undef) - %vp_v64f64_v64i16 = call <64 x i16> @llvm.vp.fptosi.v64double.v64i16(<64 x double> undef, <64 x i1> undef, i32 undef) - %vp_v64f32_v64i32 = call <64 x i32> @llvm.vp.fptosi.v64float.v64i32(<64 x float> undef, <64 x i1> undef, i32 undef) - %vp_v64f64_v64i32 = call <64 x i32> @llvm.vp.fptosi.v64double.v64i32(<64 x double> undef, <64 x i1> undef, i32 undef) - %vp_v64f32_v64i64 = call <64 x i64> @llvm.vp.fptosi.v64float.v64i64(<64 x float> undef, <64 x i1> undef, i32 undef) - %vp_v64f64_v64i64 = call <64 x i64> @llvm.vp.fptosi.v64double.v64i64(<64 x double> undef, <64 x i1> undef, i32 undef) - %vp_v64f32_v64i1 = call <64 x i1> @llvm.vp.fptosi.v64float.v64i1(<64 x float> undef, <64 x i1> undef, i32 undef) - %vp_v64f64_v64i1 = call <64 x i1> @llvm.vp.fptosi.v64double.v64i1(<64 x double> undef, <64 x i1> undef, i32 undef) + %vp_v64f32_v64i8 = call <64 x i8> @llvm.vp.fptosi.v64i8.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef) + %vp_v64f64_v64i8 = call <64 x i8> @llvm.vp.fptosi.v64i8.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef) + %vp_v64f32_v64i16 = call <64 x i16> @llvm.vp.fptosi.v64i16.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef) + %vp_v64f64_v64i16 = call <64 x i16> @llvm.vp.fptosi.v64i16.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef) + %vp_v64f32_v64i32 = call <64 x i32> @llvm.vp.fptosi.v64i32.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef) + %vp_v64f64_v64i32 = call <64 x i32> @llvm.vp.fptosi.v64i32.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef) + %vp_v64f32_v64i64 = call <64 x i64> @llvm.vp.fptosi.v64i64.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef) + %vp_v64f64_v64i64 = call <64 x i64> @llvm.vp.fptosi.v64i64.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef) + %vp_v64f32_v64i1 = call <64 x i1> @llvm.vp.fptosi.v64i1.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef) + %vp_v64f64_v64i1 = call <64 x i1> @llvm.vp.fptosi.v64i1.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef) %v128f32_v128i8 = fptosi <128 x float> undef to <128 x i8> %v128f64_v128i8 = fptosi <128 x double> undef to <128 x i8> @@ -4095,16 +4095,16 @@ define void @fptosi() 
{
   %v128f32_v128i1 = fptosi <128 x float> undef to <128 x i1>
   %v128f64_v128i1 = fptosi <128 x double> undef to <128 x i1>
-  %vp_v128f32_v128i8 = call <128 x i8> @llvm.vp.fptosi.v128float.v128i8(<128 x float> undef, <128 x i1> undef, i32 undef)
-  %vp_v128f64_v128i8 = call <128 x i8> @llvm.vp.fptosi.v128double.v128i8(<128 x double> undef, <128 x i1> undef, i32 undef)
-  %vp_v128f32_v128i16 = call <128 x i16> @llvm.vp.fptosi.v128float.v128i16(<128 x float> undef, <128 x i1> undef, i32 undef)
-  %vp_v128f64_v128i16 = call <128 x i16> @llvm.vp.fptosi.v128double.v128i16(<128 x double> undef, <128 x i1> undef, i32 undef)
-  %vp_v128f32_v128i32 = call <128 x i32> @llvm.vp.fptosi.v128float.v128i32(<128 x float> undef, <128 x i1> undef, i32 undef)
-  %vp_v128f64_v128i32 = call <128 x i32> @llvm.vp.fptosi.v128double.v128i32(<128 x double> undef, <128 x i1> undef, i32 undef)
-  %vp_v128f32_v128i64 = call <128 x i64> @llvm.vp.fptosi.v128float.v128i64(<128 x float> undef, <128 x i1> undef, i32 undef)
-  %vp_v128f64_v128i64 = call <128 x i64> @llvm.vp.fptosi.v128double.v128i64(<128 x double> undef, <128 x i1> undef, i32 undef)
-  %vp_v128f32_v128i1 = call <128 x i1> @llvm.vp.fptosi.v128float.v128i1(<128 x float> undef, <128 x i1> undef, i32 undef)
-  %vp_v128f64_v128i1 = call <128 x i1> @llvm.vp.fptosi.v128double.v128i1(<128 x double> undef, <128 x i1> undef, i32 undef)
+  %vp_v128f32_v128i8 = call <128 x i8> @llvm.vp.fptosi.v128i8.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef)
+  %vp_v128f64_v128i8 = call <128 x i8> @llvm.vp.fptosi.v128i8.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef)
+  %vp_v128f32_v128i16 = call <128 x i16> @llvm.vp.fptosi.v128i16.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef)
+  %vp_v128f64_v128i16 = call <128 x i16> @llvm.vp.fptosi.v128i16.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef)
+  %vp_v128f32_v128i32 = call <128 x i32> @llvm.vp.fptosi.v128i32.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef)
+  %vp_v128f64_v128i32 = call <128 x i32> @llvm.vp.fptosi.v128i32.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef)
+  %vp_v128f32_v128i64 = call <128 x i64> @llvm.vp.fptosi.v128i64.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef)
+  %vp_v128f64_v128i64 = call <128 x i64> @llvm.vp.fptosi.v128i64.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef)
+  %vp_v128f32_v128i1 = call <128 x i1> @llvm.vp.fptosi.v128i1.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef)
+  %vp_v128f64_v128i1 = call <128 x i1> @llvm.vp.fptosi.v128i1.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef)
   %nxv1f32_nxv1i8 = fptosi <vscale x 1 x float> undef to <vscale x 1 x i8>
   %nxv1f64_nxv1i8 = fptosi <vscale x 1 x double> undef to <vscale x 1 x i8>
@@ -4117,16 +4117,16 @@ define void @fptosi() {
   %nxv1f32_nxv1i1 = fptosi <vscale x 1 x float> undef to <vscale x 1 x i1>
   %nxv1f64_nxv1i1 = fptosi <vscale x 1 x double> undef to <vscale x 1 x i1>
-  %vp_nxv1f32_nxv1i8 = call <vscale x 1 x i8> @llvm.vp.fptosi.nxv1float.nxv1i8(<vscale x 1 x float> undef, <vscale x 1 x i1> undef, i32 undef)
-  %vp_nxv1f64_nxv1i8 = call <vscale x 1 x i8> @llvm.vp.fptosi.nxv1double.nxv1i8(<vscale x 1 x double> undef, <vscale x 1 x i1> undef, i32 undef)
-  %vp_nxv1f32_nxv1i16 = call <vscale x 1 x i16> @llvm.vp.fptosi.nxv1float.nxv1i16(<vscale x 1 x float> undef, <vscale x 1 x i1> undef, i32 undef)
-  %vp_nxv1f64_nxv1i16 = call <vscale x 1 x i16> @llvm.vp.fptosi.nxv1double.nxv1i16(<vscale x 1 x double> undef, <vscale x 1 x i1> undef, i32 undef)
-  %vp_nxv1f32_nxv1i32 = call <vscale x 1 x i32> @llvm.vp.fptosi.nxv1float.nxv1i32(<vscale x 1 x float> undef, <vscale x 1 x i1> undef, i32 undef)
-  %vp_nxv1f64_nxv1i32 = call <vscale x 1 x i32> @llvm.vp.fptosi.nxv1double.nxv1i32(<vscale x 1 x double> undef, <vscale x 1 x i1> undef, i32 undef)
-  %vp_nxv1f32_nxv1i64 = call <vscale x 1 x i64> @llvm.vp.fptosi.nxv1float.nxv1i64(<vscale x 1 x float> undef, <vscale x 1 x i1> undef, i32 undef)
-  %vp_nxv1f64_nxv1i64 = call <vscale x 1 x i64> @llvm.vp.fptosi.nxv1double.nxv1i64(<vscale x 1 x double> undef, <vscale x 1 x i1> undef, i32 undef)
-  %vp_nxv1f32_nxv1i1 = call <vscale x 1 x i1> @llvm.vp.fptosi.nxv1float.nxv1i1(<vscale x 1 x float> undef, <vscale x 1 x i1> undef, i32 undef)
-  %vp_nxv1f64_nxv1i1 = call <vscale x 1 x i1> @llvm.vp.fptosi.nxv1double.nxv1i1(<vscale x 1 x double> undef, <vscale x 1 x i1> undef, i32 undef)
+  %vp_nxv1f32_nxv1i8 = call <vscale x 1 x i8> @llvm.vp.fptosi.nxv1i8.nxv1f32(<vscale x 1 x float> undef, <vscale x 1 x i1> undef, i32 undef)
+  %vp_nxv1f64_nxv1i8 = call <vscale x 1 x i8> @llvm.vp.fptosi.nxv1i8.nxv1f64(<vscale x 1 x double> undef, <vscale x 1 x i1> undef, i32 undef)
+  %vp_nxv1f32_nxv1i16 = call <vscale x 1 x i16> @llvm.vp.fptosi.nxv1i16.nxv1f32(<vscale x 1 x float> undef, <vscale x 1 x i1> undef, i32 undef)
+  %vp_nxv1f64_nxv1i16 = call <vscale x 1 x i16> @llvm.vp.fptosi.nxv1i16.nxv1f64(<vscale x 1 x double> undef, <vscale x 1 x i1> undef, i32 undef)
+  %vp_nxv1f32_nxv1i32 = call <vscale x 1 x i32> @llvm.vp.fptosi.nxv1i32.nxv1f32(<vscale x 1 x float> undef, <vscale x 1 x i1> undef, i32 undef)
+  %vp_nxv1f64_nxv1i32 = call <vscale x 1 x i32> @llvm.vp.fptosi.nxv1i32.nxv1f64(<vscale x 1 x double> undef, <vscale x 1 x i1> undef, i32 undef)
+  %vp_nxv1f32_nxv1i64 = call <vscale x 1 x i64> @llvm.vp.fptosi.nxv1i64.nxv1f32(<vscale x 1 x float> undef, <vscale x 1 x i1> undef, i32 undef)
+  %vp_nxv1f64_nxv1i64 = call <vscale x 1 x i64> @llvm.vp.fptosi.nxv1i64.nxv1f64(<vscale x 1 x double> undef, <vscale x 1 x i1> undef, i32 undef)
+  %vp_nxv1f32_nxv1i1 = call <vscale x 1 x i1> @llvm.vp.fptosi.nxv1i1.nxv1f32(<vscale x 1 x float> undef, <vscale x 1 x i1> undef, i32 undef)
+  %vp_nxv1f64_nxv1i1 = call <vscale x 1 x i1> @llvm.vp.fptosi.nxv1i1.nxv1f64(<vscale x 1 x double> undef, <vscale x 1 x i1> undef, i32 undef)
   %nxv2f32_nxv2i8 = fptosi <vscale x 2 x float> undef to <vscale x 2 x i8>
   %nxv2f64_nxv2i8 = fptosi <vscale x 2 x double> undef to <vscale x 2 x i8>
@@ -4139,16 +4139,16 @@ define void @fptosi() {
   %nxv2f32_nxv2i1 = fptosi <vscale x 2 x float> undef to <vscale x 2 x i1>
   %nxv2f64_nxv2i1 = fptosi <vscale x 2 x double> undef to <vscale x 2 x i1>
-  %vp_nxv2f32_nxv2i8 = call <vscale x 2 x i8> @llvm.vp.fptosi.nxv2float.nxv2i8(<vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
-  %vp_nxv2f64_nxv2i8 = call <vscale x 2 x i8> @llvm.vp.fptosi.nxv2double.nxv2i8(<vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
-  %vp_nxv2f32_nxv2i16 = call <vscale x 2 x i16> @llvm.vp.fptosi.nxv2float.nxv2i16(<vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
-  %vp_nxv2f64_nxv2i16 = call <vscale x 2 x i16> @llvm.vp.fptosi.nxv2double.nxv2i16(<vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
-  %vp_nxv2f32_nxv2i32 = call <vscale x 2 x i32> @llvm.vp.fptosi.nxv2float.nxv2i32(<vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
-  %vp_nxv2f64_nxv2i32 = call <vscale x 2 x i32> @llvm.vp.fptosi.nxv2double.nxv2i32(<vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
-  %vp_nxv2f32_nxv2i64 = call <vscale x 2 x i64> @llvm.vp.fptosi.nxv2float.nxv2i64(<vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
-  %vp_nxv2f64_nxv2i64 = call <vscale x 2 x i64> @llvm.vp.fptosi.nxv2double.nxv2i64(<vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
-  %vp_nxv2f32_nxv2i1 = call <vscale x 2 x i1> @llvm.vp.fptosi.nxv2float.nxv2i1(<vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
-  %vp_nxv2f64_nxv2i1 = call <vscale x 2 x i1> @llvm.vp.fptosi.nxv2double.nxv2i1(<vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
+  %vp_nxv2f32_nxv2i8 = call <vscale x 2 x i8> @llvm.vp.fptosi.nxv2i8.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
+  %vp_nxv2f64_nxv2i8 = call <vscale x 2 x i8> @llvm.vp.fptosi.nxv2i8.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
+  %vp_nxv2f32_nxv2i16 = call <vscale x 2 x i16> @llvm.vp.fptosi.nxv2i16.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
+  %vp_nxv2f64_nxv2i16 = call <vscale x 2 x i16> @llvm.vp.fptosi.nxv2i16.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
+  %vp_nxv2f32_nxv2i32 = call <vscale x 2 x i32> @llvm.vp.fptosi.nxv2i32.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
+  %vp_nxv2f64_nxv2i32 = call <vscale x 2 x i32> @llvm.vp.fptosi.nxv2i32.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
+  %vp_nxv2f32_nxv2i64 = call <vscale x 2 x i64> @llvm.vp.fptosi.nxv2i64.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
+  %vp_nxv2f64_nxv2i64 = call <vscale x 2 x i64> @llvm.vp.fptosi.nxv2i64.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
+  %vp_nxv2f32_nxv2i1 = call <vscale x 2 x i1> @llvm.vp.fptosi.nxv2i1.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
+  %vp_nxv2f64_nxv2i1 = call <vscale x 2 x i1> @llvm.vp.fptosi.nxv2i1.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
   %nxv4f32_nxv4i8 = fptosi <vscale x 4 x float> undef to <vscale x 4 x i8>
   %nxv4f64_nxv4i8 = fptosi <vscale x 4 x double> undef to <vscale x 4 x i8>
@@ -4161,16 +4161,16 @@ define void @fptosi() {
   %nxv4f32_nxv4i1 = fptosi <vscale x 4 x float> undef to <vscale x 4 x i1>
   %nxv4f64_nxv4i1 = fptosi <vscale x 4 x double> undef to <vscale x 4 x i1>
-  %vp_nxv4f32_nxv4i8 = call <vscale x 4 x i8> @llvm.vp.fptosi.nxv4float.nxv4i8(<vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
-  %vp_nxv4f64_nxv4i8 = call <vscale x 4 x i8> @llvm.vp.fptosi.nxv4double.nxv4i8(<vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
-  %vp_nxv4f32_nxv4i16 = call <vscale x 4 x i16> @llvm.vp.fptosi.nxv4float.nxv4i16(<vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
-  %vp_nxv4f64_nxv4i16 = call <vscale x 4 x i16> @llvm.vp.fptosi.nxv4double.nxv4i16(<vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
-  %vp_nxv4f32_nxv4i32 = call <vscale x 4 x i32> @llvm.vp.fptosi.nxv4float.nxv4i32(<vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
-  %vp_nxv4f64_nxv4i32 = call <vscale x 4 x i32> @llvm.vp.fptosi.nxv4double.nxv4i32(<vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
-  %vp_nxv4f32_nxv4i64 = call <vscale x 4 x i64> @llvm.vp.fptosi.nxv4float.nxv4i64(<vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
-  %vp_nxv4f64_nxv4i64 = call <vscale x 4 x i64> @llvm.vp.fptosi.nxv4double.nxv4i64(<vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
-  %vp_nxv4f32_nxv4i1 = call <vscale x 4 x i1> @llvm.vp.fptosi.nxv4float.nxv4i1(<vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
-  %vp_nxv4f64_nxv4i1 = call <vscale x 4 x i1> @llvm.vp.fptosi.nxv4double.nxv4i1(<vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
+  %vp_nxv4f32_nxv4i8 = call <vscale x 4 x i8> @llvm.vp.fptosi.nxv4i8.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
+  %vp_nxv4f64_nxv4i8 = call <vscale x 4 x i8> @llvm.vp.fptosi.nxv4i8.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
+  %vp_nxv4f32_nxv4i16 = call <vscale x 4 x i16> @llvm.vp.fptosi.nxv4i16.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
+  %vp_nxv4f64_nxv4i16 = call <vscale x 4 x i16> @llvm.vp.fptosi.nxv4i16.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
+  %vp_nxv4f32_nxv4i32 = call <vscale x 4 x i32> @llvm.vp.fptosi.nxv4i32.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
+  %vp_nxv4f64_nxv4i32 = call <vscale x 4 x i32> @llvm.vp.fptosi.nxv4i32.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
+  %vp_nxv4f32_nxv4i64 = call <vscale x 4 x i64> @llvm.vp.fptosi.nxv4i64.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
+  %vp_nxv4f64_nxv4i64 = call <vscale x 4 x i64> @llvm.vp.fptosi.nxv4i64.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
+  %vp_nxv4f32_nxv4i1 = call <vscale x 4 x i1> @llvm.vp.fptosi.nxv4i1.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
+  %vp_nxv4f64_nxv4i1 = call <vscale x 4 x i1> @llvm.vp.fptosi.nxv4i1.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
   %nxv8f32_nxv8i8 = fptosi <vscale x 8 x float> undef to <vscale x 8 x i8>
   %nxv8f64_nxv8i8 = fptosi <vscale x 8 x double> undef to <vscale x 8 x i8>
@@ -4183,16 +4183,16 @@ define void @fptosi() {
   %nxv8f32_nxv8i1 = fptosi <vscale x 8 x float> undef to <vscale x 8 x i1>
   %nxv8f64_nxv8i1 = fptosi <vscale x 8 x double> undef to <vscale x 8 x i1>
-  %vp_nxv8f32_nxv8i8 = call <vscale x 8 x i8> @llvm.vp.fptosi.nxv8float.nxv8i8(<vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
-  %vp_nxv8f64_nxv8i8 = call <vscale x 8 x i8> @llvm.vp.fptosi.nxv8double.nxv8i8(<vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
-  %vp_nxv8f32_nxv8i16 = call <vscale x 8 x i16> @llvm.vp.fptosi.nxv8float.nxv8i16(<vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
-  %vp_nxv8f64_nxv8i16 = call <vscale x 8 x i16> @llvm.vp.fptosi.nxv8double.nxv8i16(<vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
-  %vp_nxv8f32_nxv8i32 = call <vscale x 8 x i32> @llvm.vp.fptosi.nxv8float.nxv8i32(<vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
-  %vp_nxv8f64_nxv8i32 = call <vscale x 8 x i32> @llvm.vp.fptosi.nxv8double.nxv8i32(<vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
-  %vp_nxv8f32_nxv8i64 = call <vscale x 8 x i64> @llvm.vp.fptosi.nxv8float.nxv8i64(<vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
-  %vp_nxv8f64_nxv8i64 = call <vscale x 8 x i64> @llvm.vp.fptosi.nxv8double.nxv8i64(<vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
-  %vp_nxv8f32_nxv8i1 = call <vscale x 8 x i1> @llvm.vp.fptosi.nxv8float.nxv8i1(<vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
-  %vp_nxv8f64_nxv8i1 = call <vscale x 8 x i1> @llvm.vp.fptosi.nxv8double.nxv8i1(<vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
+  %vp_nxv8f32_nxv8i8 = call <vscale x 8 x i8> @llvm.vp.fptosi.nxv8i8.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
+  %vp_nxv8f64_nxv8i8 = call <vscale x 8 x i8> @llvm.vp.fptosi.nxv8i8.nxv8f64(<vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
+  %vp_nxv8f32_nxv8i16 = call <vscale x 8 x i16> @llvm.vp.fptosi.nxv8i16.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
+  %vp_nxv8f64_nxv8i16 = call <vscale x 8 x i16> @llvm.vp.fptosi.nxv8i16.nxv8f64(<vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
+  %vp_nxv8f32_nxv8i32 = call <vscale x 8 x i32> @llvm.vp.fptosi.nxv8i32.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
+  %vp_nxv8f64_nxv8i32 = call <vscale x 8 x i32> @llvm.vp.fptosi.nxv8i32.nxv8f64(<vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
+  %vp_nxv8f32_nxv8i64 = call <vscale x 8 x i64> @llvm.vp.fptosi.nxv8i64.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
+  %vp_nxv8f64_nxv8i64 = call <vscale x 8 x i64> @llvm.vp.fptosi.nxv8i64.nxv8f64(<vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
+  %vp_nxv8f32_nxv8i1 = call <vscale x 8 x i1> @llvm.vp.fptosi.nxv8i1.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
+  %vp_nxv8f64_nxv8i1 = call <vscale x 8 x i1> @llvm.vp.fptosi.nxv8i1.nxv8f64(<vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
   %nxv16f32_nxv16i8 = fptosi <vscale x 16 x float> undef to <vscale x 16 x i8>
   %nxv16f64_nxv16i8 = fptosi <vscale x 16 x double> undef to <vscale x 16 x i8>
@@ -4205,16 +4205,16 @@ define void @fptosi() {
   %nxv16f32_nxv16i1 = fptosi <vscale x 16 x float> undef to <vscale x 16 x i1>
   %nxv16f64_nxv16i1 = fptosi <vscale x 16 x double> undef to <vscale x 16 x i1>
-  %vp_nxv16f32_nxv16i8 = call <vscale x 16 x i8> @llvm.vp.fptosi.nxv16float.nxv16i8(<vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
-  %vp_nxv16f64_nxv16i8 = call <vscale x 16 x i8> @llvm.vp.fptosi.nxv16double.nxv16i8(<vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
-  %vp_nxv16f32_nxv16i16 = call <vscale x 16 x i16> @llvm.vp.fptosi.nxv16float.nxv16i16(<vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
-  %vp_nxv16f64_nxv16i16 = call <vscale x 16 x i16> @llvm.vp.fptosi.nxv16double.nxv16i16(<vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
-  %vp_nxv16f32_nxv16i32 = call <vscale x 16 x i32> @llvm.vp.fptosi.nxv16float.nxv16i32(<vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
-  %vp_nxv16f64_nxv16i32 = call <vscale x 16 x i32> @llvm.vp.fptosi.nxv16double.nxv16i32(<vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
-  %vp_nxv16f32_nxv16i64 = call <vscale x 16 x i64> @llvm.vp.fptosi.nxv16float.nxv16i64(<vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
-  %vp_nxv16f64_nxv16i64 = call <vscale x 16 x i64> @llvm.vp.fptosi.nxv16double.nxv16i64(<vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
-  %vp_nxv16f32_nxv16i1 = call <vscale x 16 x i1> @llvm.vp.fptosi.nxv16float.nxv16i1(<vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
-  %vp_nxv16f64_nxv16i1 = call <vscale x 16 x i1> @llvm.vp.fptosi.nxv16double.nxv16i1(<vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
+  %vp_nxv16f32_nxv16i8 = call <vscale x 16 x i8> @llvm.vp.fptosi.nxv16i8.nxv16f32(<vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
+  %vp_nxv16f64_nxv16i8 = call <vscale x 16 x i8> @llvm.vp.fptosi.nxv16i8.nxv16f64(<vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
+  %vp_nxv16f32_nxv16i16 = call <vscale x 16 x i16> @llvm.vp.fptosi.nxv16i16.nxv16f32(<vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
+  %vp_nxv16f64_nxv16i16 = call <vscale x 16 x i16> @llvm.vp.fptosi.nxv16i16.nxv16f64(<vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
+  %vp_nxv16f32_nxv16i32 = call <vscale x 16 x i32> @llvm.vp.fptosi.nxv16i32.nxv16f32(<vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
+  %vp_nxv16f64_nxv16i32 = call <vscale x 16 x i32> @llvm.vp.fptosi.nxv16i32.nxv16f64(<vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
+  %vp_nxv16f32_nxv16i64 = call <vscale x 16 x i64> @llvm.vp.fptosi.nxv16i64.nxv16f32(<vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
+  %vp_nxv16f64_nxv16i64 = call <vscale x 16 x i64> @llvm.vp.fptosi.nxv16i64.nxv16f64(<vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
+  %vp_nxv16f32_nxv16i1 = call <vscale x 16 x i1> @llvm.vp.fptosi.nxv16i1.nxv16f32(<vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
+  %vp_nxv16f64_nxv16i1 = call <vscale x 16 x i1> @llvm.vp.fptosi.nxv16i1.nxv16f64(<vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
   %nxv32f32_nxv32i8 = fptosi <vscale x 32 x float> undef to <vscale x 32 x i8>
   %nxv32f64_nxv32i8 = fptosi <vscale x 32 x double> undef to <vscale x 32 x i8>
@@ -4227,16 +4227,16 @@ define void @fptosi() {
   %nxv32f32_nxv32i1 = fptosi <vscale x 32 x float> undef to <vscale x 32 x i1>
   %nxv32f64_nxv32i1 = fptosi <vscale x 32 x double> undef to <vscale x 32 x i1>
-  %vp_nxv32f32_nxv32i8 = call <vscale x 32 x i8> @llvm.vp.fptosi.nxv32float.nxv32i8(<vscale x 32 x float> undef, <vscale x 32 x i1> undef, i32 undef)
-  %vp_nxv32f64_nxv32i8 = call <vscale x 32 x i8> @llvm.vp.fptosi.nxv32double.nxv32i8(<vscale x 32 x double> undef, <vscale x 32 x i1> undef, i32 undef)
-  %vp_nxv32f32_nxv32i16 = call <vscale x 32 x i16> @llvm.vp.fptosi.nxv32float.nxv32i16(<vscale x 32 x float> undef, <vscale x 32 x i1> undef, i32 undef)
-  %vp_nxv32f64_nxv32i16 = call <vscale x 32 x i16> @llvm.vp.fptosi.nxv32double.nxv32i16(<vscale x 32 x double> undef, <vscale x 32 x i1> undef, i32 undef)
-  %vp_nxv32f32_nxv32i32 = call <vscale x 32 x i32> @llvm.vp.fptosi.nxv32float.nxv32i32(<vscale x 32 x float> undef, <vscale x 32 x i1> undef, i32 undef)
-  %vp_nxv32f64_nxv32i32 = call <vscale x 32 x i32> @llvm.vp.fptosi.nxv32double.nxv32i32(<vscale x 32 x double> undef, <vscale x 32 x i1> undef, i32 undef)
-  %vp_nxv32f32_nxv32i64 = call <vscale x 32 x i64> @llvm.vp.fptosi.nxv32float.nxv32i64(<vscale x 32 x float> undef, <vscale x 32 x i1> undef, i32 undef)
-  %vp_nxv32f64_nxv32i64 = call <vscale x 32 x i64> @llvm.vp.fptosi.nxv32double.nxv32i64(<vscale x 32 x double> undef, <vscale x 32 x i1> undef, i32 undef)
-  %vp_nxv32f32_nxv32i1 = call <vscale x 32 x i1> @llvm.vp.fptosi.nxv32float.nxv32i1(<vscale x 32 x float> undef, <vscale x 32 x i1> undef, i32 undef)
-  %vp_nxv32f64_nxv32i1 = call <vscale x 32 x i1> @llvm.vp.fptosi.nxv32double.nxv32i1(<vscale x 32 x double> undef, <vscale x 32 x i1> undef, i32 undef)
+  %vp_nxv32f32_nxv32i8 = call <vscale x 32 x i8> @llvm.vp.fptosi.nxv32i8.nxv32f32(<vscale x 32 x float> undef, <vscale x 32 x i1> undef, i32 undef)
+  %vp_nxv32f64_nxv32i8 = call <vscale x 32 x i8> @llvm.vp.fptosi.nxv32i8.nxv32f64(<vscale x 32 x double> undef, <vscale x 32 x i1> undef, i32 undef)
+  %vp_nxv32f32_nxv32i16 = call <vscale x 32 x i16> @llvm.vp.fptosi.nxv32i16.nxv32f32(<vscale x 32 x float> undef, <vscale x 32 x i1> undef, i32 undef)
+  %vp_nxv32f64_nxv32i16 = call <vscale x 32 x i16> @llvm.vp.fptosi.nxv32i16.nxv32f64(<vscale x 32 x double> undef, <vscale x 32 x i1> undef, i32 undef)
+  %vp_nxv32f32_nxv32i32 = call <vscale x 32 x i32> @llvm.vp.fptosi.nxv32i32.nxv32f32(<vscale x 32 x float> undef, <vscale x 32 x i1> undef, i32 undef)
+  %vp_nxv32f64_nxv32i32 = call <vscale x 32 x i32> @llvm.vp.fptosi.nxv32i32.nxv32f64(<vscale x 32 x double> undef, <vscale x 32 x i1> undef, i32 undef)
+  %vp_nxv32f32_nxv32i64 = call <vscale x 32 x i64> @llvm.vp.fptosi.nxv32i64.nxv32f32(<vscale x 32 x float> undef, <vscale x 32 x i1> undef, i32 undef)
+  %vp_nxv32f64_nxv32i64 = call <vscale x 32 x i64> @llvm.vp.fptosi.nxv32i64.nxv32f64(<vscale x 32 x double> undef, <vscale x 32 x i1> undef, i32 undef)
+  %vp_nxv32f32_nxv32i1 = call <vscale x 32 x i1> @llvm.vp.fptosi.nxv32i1.nxv32f32(<vscale x 32 x float> undef, <vscale x 32 x i1> undef, i32 undef)
+  %vp_nxv32f64_nxv32i1 = call <vscale x 32 x i1> @llvm.vp.fptosi.nxv32i1.nxv32f64(<vscale x 32 x double> undef, <vscale x 32 x i1> undef, i32 undef)
   %nxv64f32_nxv64i8 = fptosi <vscale x 64 x float> undef to <vscale x 64 x i8>
   %nxv64f64_nxv64i8 = fptosi <vscale x 64 x double> undef to <vscale x 64 x i8>
@@ -4249,16 +4249,16 @@ define void @fptosi() {
   %nxv64f32_nxv64i1 = fptosi <vscale x 64 x float> undef to <vscale x 64 x i1>
   %nxv64f64_nxv64i1 = fptosi <vscale x 64 x double> undef to <vscale x 64 x i1>
-  %vp_nxv64f32_nxv64i8 = call <vscale x 64 x i8> @llvm.vp.fptosi.nxv64float.nxv64i8(<vscale x 64 x float> undef, <vscale x 64 x i1> undef, i32 undef)
-  %vp_nxv64f64_nxv64i8 = call <vscale x 64 x i8> @llvm.vp.fptosi.nxv64double.nxv64i8(<vscale x 64 x double> undef, <vscale x 64 x i1> undef, i32 undef)
-  %vp_nxv64f32_nxv64i16 = call <vscale x 64 x i16> @llvm.vp.fptosi.nxv64float.nxv64i16(<vscale x 64 x float> undef, <vscale x 64 x i1> undef, i32 undef)
-  %vp_nxv64f64_nxv64i16 = call <vscale x 64 x i16> @llvm.vp.fptosi.nxv64double.nxv64i16(<vscale x 64 x double> undef, <vscale x 64 x i1> undef, i32 undef)
-  %vp_nxv64f32_nxv64i32 = call <vscale x 64 x i32> @llvm.vp.fptosi.nxv64float.nxv64i32(<vscale x 64 x float> undef, <vscale x 64 x i1> undef, i32 undef)
-  %vp_nxv64f64_nxv64i32 = call <vscale x 64 x i32> @llvm.vp.fptosi.nxv64double.nxv64i32(<vscale x 64 x double> undef, <vscale x 64 x i1> undef, i32 undef)
-  %vp_nxv64f32_nxv64i64 = call <vscale x 64 x i64> @llvm.vp.fptosi.nxv64float.nxv64i64(<vscale x 64 x float> undef, <vscale x 64 x i1> undef, i32 undef)
-  %vp_nxv64f64_nxv64i64 = call <vscale x 64 x i64> @llvm.vp.fptosi.nxv64double.nxv64i64(<vscale x 64 x double> undef, <vscale x 64 x i1> undef, i32 undef)
-  %vp_nxv64f32_nxv64i1 = call <vscale x 64 x i1> @llvm.vp.fptosi.nxv64float.nxv64i1(<vscale x 64 x float> undef, <vscale x 64 x i1> undef, i32 undef)
-  %vp_nxv64f64_nxv64i1 = call <vscale x 64 x i1> @llvm.vp.fptosi.nxv64double.nxv64i1(<vscale x 64 x double> undef, <vscale x 64 x i1> undef, i32 undef)
+  %vp_nxv64f32_nxv64i8 = call <vscale x 64 x i8> @llvm.vp.fptosi.nxv64i8.nxv64f32(<vscale x 64 x float> undef, <vscale x 64 x i1> undef, i32 undef)
+  %vp_nxv64f64_nxv64i8 = call <vscale x 64 x i8> @llvm.vp.fptosi.nxv64i8.nxv64f64(<vscale x 64 x double> undef, <vscale x 64 x i1> undef, i32 undef)
+  %vp_nxv64f32_nxv64i16 = call <vscale x 64 x i16> @llvm.vp.fptosi.nxv64i16.nxv64f32(<vscale x 64 x float> undef, <vscale x 64 x i1> undef, i32 undef)
+  %vp_nxv64f64_nxv64i16 = call <vscale x 64 x i16> @llvm.vp.fptosi.nxv64i16.nxv64f64(<vscale x 64 x double> undef, <vscale x 64 x i1> undef, i32 undef)
+  %vp_nxv64f32_nxv64i32 = call <vscale x 64 x i32> @llvm.vp.fptosi.nxv64i32.nxv64f32(<vscale x 64 x float> undef, <vscale x 64 x i1> undef, i32 undef)
+  %vp_nxv64f64_nxv64i32 = call <vscale x 64 x i32> @llvm.vp.fptosi.nxv64i32.nxv64f64(<vscale x 64 x double> undef, <vscale x 64 x i1> undef, i32 undef)
+  %vp_nxv64f32_nxv64i64 = call <vscale x 64 x i64> @llvm.vp.fptosi.nxv64i64.nxv64f32(<vscale x 64 x float> undef, <vscale x 64 x i1> undef, i32 undef)
+  %vp_nxv64f64_nxv64i64 = call <vscale x 64 x i64> @llvm.vp.fptosi.nxv64i64.nxv64f64(<vscale x 64 x double> undef, <vscale x 64 x i1> undef, i32 undef)
+  %vp_nxv64f32_nxv64i1 = call <vscale x 64 x i1> @llvm.vp.fptosi.nxv64i1.nxv64f32(<vscale x 64 x float> undef, <vscale x 64 x i1> undef, i32 undef)
+  %vp_nxv64f64_nxv64i1 = call <vscale x 64 x i1> @llvm.vp.fptosi.nxv64i1.nxv64f64(<vscale x 64 x double> undef, <vscale x 64 x i1> undef, i32 undef)
   ret void
 }
@@ -4841,16 +4841,16 @@ define void @fptoui() {
   %v2f32_v2i1 = fptoui <2 x float> undef to <2 x i1>
   %v2f64_v2i1 = fptoui <2 x double> undef to <2 x i1>
-  %vp_v2f32_v2i8 = call <2 x i8> @llvm.vp.fptoui.v2float.v2i8(<2 x float> undef, <2 x i1> undef, i32 undef)
-  %vp_v2f64_v2i8 = call <2 x i8> @llvm.vp.fptoui.v2double.v2i8(<2 x double> undef, <2 x i1> undef, i32 undef)
-  %vp_v2f32_v2i16 = call <2 x i16> @llvm.vp.fptoui.v2float.v2i16(<2 x float> undef, <2 x i1> undef, i32 undef)
-  %vp_v2f64_v2i16 = call <2 x i16> @llvm.vp.fptoui.v2double.v2i16(<2 x double> undef, <2 x i1> undef, i32 undef)
-  %vp_v2f32_v2i32 = call <2 x i32> @llvm.vp.fptoui.v2float.v2i32(<2 x float> undef, <2 x i1> undef, i32 undef)
-  %vp_v2f64_v2i32 = call <2 x i32> @llvm.vp.fptoui.v2double.v2i32(<2 x double> undef, <2 x i1> undef, i32 undef)
-  %vp_v2f32_v2i64 = call <2 x i64> @llvm.vp.fptoui.v2float.v2i64(<2 x float> undef, <2 x i1> undef, i32 undef)
-  %vp_v2f64_v2i64 = call <2 x i64> @llvm.vp.fptoui.v2double.v2i64(<2 x double> undef, <2 x i1> undef, i32 undef)
-  %vp_v2f32_v2i1 = call <2 x i1> @llvm.vp.fptoui.v2float.v2i1(<2 x float> undef, <2 x i1> undef, i32 undef)
-  %vp_v2f64_v2i1 = call <2 x i1> @llvm.vp.fptoui.v2double.v2i1(<2 x double> undef, <2 x i1> undef, i32 undef)
+  %vp_v2f32_v2i8 = call <2 x i8> @llvm.vp.fptoui.v2i8.v2f32(<2 x float> undef, <2 x i1>
undef, i32 undef) + %vp_v2f64_v2i8 = call <2 x i8> @llvm.vp.fptoui.v2i8.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef) + %vp_v2f32_v2i16 = call <2 x i16> @llvm.vp.fptoui.v2i16.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef) + %vp_v2f64_v2i16 = call <2 x i16> @llvm.vp.fptoui.v2i16.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef) + %vp_v2f32_v2i32 = call <2 x i32> @llvm.vp.fptoui.v2i32.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef) + %vp_v2f64_v2i32 = call <2 x i32> @llvm.vp.fptoui.v2i32.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef) + %vp_v2f32_v2i64 = call <2 x i64> @llvm.vp.fptoui.v2i64.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef) + %vp_v2f64_v2i64 = call <2 x i64> @llvm.vp.fptoui.v2i64.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef) + %vp_v2f32_v2i1 = call <2 x i1> @llvm.vp.fptoui.v2i1.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef) + %vp_v2f64_v2i1 = call <2 x i1> @llvm.vp.fptoui.v2i1.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef) %v4f32_v4i8 = fptoui <4 x float> undef to <4 x i8> %v4f64_v4i8 = fptoui <4 x double> undef to <4 x i8> @@ -4863,16 +4863,16 @@ define void @fptoui() { %v4f32_v4i1 = fptoui <4 x float> undef to <4 x i1> %v4f64_v4i1 = fptoui <4 x double> undef to <4 x i1> - %vp_v4f32_v4i8 = call <4 x i8> @llvm.vp.fptoui.v4float.v4i8(<4 x float> undef, <4 x i1> undef, i32 undef) - %vp_v4f64_v4i8 = call <4 x i8> @llvm.vp.fptoui.v4double.v4i8(<4 x double> undef, <4 x i1> undef, i32 undef) - %vp_v4f32_v4i16 = call <4 x i16> @llvm.vp.fptoui.v4float.v4i16(<4 x float> undef, <4 x i1> undef, i32 undef) - %vp_v4f64_v4i16 = call <4 x i16> @llvm.vp.fptoui.v4double.v4i16(<4 x double> undef, <4 x i1> undef, i32 undef) - %vp_v4f32_v4i32 = call <4 x i32> @llvm.vp.fptoui.v4float.v4i32(<4 x float> undef, <4 x i1> undef, i32 undef) - %vp_v4f64_v4i32 = call <4 x i32> @llvm.vp.fptoui.v4double.v4i32(<4 x double> undef, <4 x i1> undef, i32 undef) - %vp_v4f32_v4i64 = call <4 x i64> @llvm.vp.fptoui.v4float.v4i64(<4 x float> undef, <4 x i1> undef, i32 undef) - %vp_v4f64_v4i64 = call <4 x i64> @llvm.vp.fptoui.v4double.v4i64(<4 x double> undef, <4 x i1> undef, i32 undef) - %vp_v4f32_v4i1 = call <4 x i1> @llvm.vp.fptoui.v4float.v4i1(<4 x float> undef, <4 x i1> undef, i32 undef) - %vp_v4f64_v4i1 = call <4 x i1> @llvm.vp.fptoui.v4double.v4i1(<4 x double> undef, <4 x i1> undef, i32 undef) + %vp_v4f32_v4i8 = call <4 x i8> @llvm.vp.fptoui.v4i8.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef) + %vp_v4f64_v4i8 = call <4 x i8> @llvm.vp.fptoui.v4i8.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef) + %vp_v4f32_v4i16 = call <4 x i16> @llvm.vp.fptoui.v4i16.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef) + %vp_v4f64_v4i16 = call <4 x i16> @llvm.vp.fptoui.v4i16.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef) + %vp_v4f32_v4i32 = call <4 x i32> @llvm.vp.fptoui.v4i32.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef) + %vp_v4f64_v4i32 = call <4 x i32> @llvm.vp.fptoui.v4i32.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef) + %vp_v4f32_v4i64 = call <4 x i64> @llvm.vp.fptoui.v4i64.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef) + %vp_v4f64_v4i64 = call <4 x i64> @llvm.vp.fptoui.v4i64.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef) + %vp_v4f32_v4i1 = call <4 x i1> @llvm.vp.fptoui.v4i1.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef) + %vp_v4f64_v4i1 = call <4 x i1> @llvm.vp.fptoui.v4i1.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef) %v8f32_v8i8 = fptoui <8 x float> undef to <8 x i8> %v8f64_v8i8 = fptoui <8 x double> undef to <8 x i8> @@ -4885,16 
+4885,16 @@ define void @fptoui() { %v8f32_v8i1 = fptoui <8 x float> undef to <8 x i1> %v8f64_v8i1 = fptoui <8 x double> undef to <8 x i1> - %vp_v8f32_v8i8 = call <8 x i8> @llvm.vp.fptoui.v8float.v8i8(<8 x float> undef, <8 x i1> undef, i32 undef) - %vp_v8f64_v8i8 = call <8 x i8> @llvm.vp.fptoui.v8double.v8i8(<8 x double> undef, <8 x i1> undef, i32 undef) - %vp_v8f32_v8i16 = call <8 x i16> @llvm.vp.fptoui.v8float.v8i16(<8 x float> undef, <8 x i1> undef, i32 undef) - %vp_v8f64_v8i16 = call <8 x i16> @llvm.vp.fptoui.v8double.v8i16(<8 x double> undef, <8 x i1> undef, i32 undef) - %vp_v8f32_v8i32 = call <8 x i32> @llvm.vp.fptoui.v8float.v8i32(<8 x float> undef, <8 x i1> undef, i32 undef) - %vp_v8f64_v8i32 = call <8 x i32> @llvm.vp.fptoui.v8double.v8i32(<8 x double> undef, <8 x i1> undef, i32 undef) - %vp_v8f32_v8i64 = call <8 x i64> @llvm.vp.fptoui.v8float.v8i64(<8 x float> undef, <8 x i1> undef, i32 undef) - %vp_v8f64_v8i64 = call <8 x i64> @llvm.vp.fptoui.v8double.v8i64(<8 x double> undef, <8 x i1> undef, i32 undef) - %vp_v8f32_v8i1 = call <8 x i1> @llvm.vp.fptoui.v8float.v8i1(<8 x float> undef, <8 x i1> undef, i32 undef) - %vp_v8f64_v8i1 = call <8 x i1> @llvm.vp.fptoui.v8double.v8i1(<8 x double> undef, <8 x i1> undef, i32 undef) + %vp_v8f32_v8i8 = call <8 x i8> @llvm.vp.fptoui.v8i8.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef) + %vp_v8f64_v8i8 = call <8 x i8> @llvm.vp.fptoui.v8i8.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef) + %vp_v8f32_v8i16 = call <8 x i16> @llvm.vp.fptoui.v8i16.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef) + %vp_v8f64_v8i16 = call <8 x i16> @llvm.vp.fptoui.v8i16.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef) + %vp_v8f32_v8i32 = call <8 x i32> @llvm.vp.fptoui.v8i32.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef) + %vp_v8f64_v8i32 = call <8 x i32> @llvm.vp.fptoui.v8i32.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef) + %vp_v8f32_v8i64 = call <8 x i64> @llvm.vp.fptoui.v8i64.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef) + %vp_v8f64_v8i64 = call <8 x i64> @llvm.vp.fptoui.v8i64.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef) + %vp_v8f32_v8i1 = call <8 x i1> @llvm.vp.fptoui.v8i1.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef) + %vp_v8f64_v8i1 = call <8 x i1> @llvm.vp.fptoui.v8i1.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef) %v16f32_v16i8 = fptoui <16 x float> undef to <16 x i8> %v16f64_v16i8 = fptoui <16 x double> undef to <16 x i8> @@ -4907,16 +4907,16 @@ define void @fptoui() { %v16f32_v16i1 = fptoui <16 x float> undef to <16 x i1> %v16f64_v16i1 = fptoui <16 x double> undef to <16 x i1> - %vp_v16f32_v16i8 = call <16 x i8> @llvm.vp.fptoui.v16float.v16i8(<16 x float> undef, <16 x i1> undef, i32 undef) - %vp_v16f64_v16i8 = call <16 x i8> @llvm.vp.fptoui.v16double.v16i8(<16 x double> undef, <16 x i1> undef, i32 undef) - %vp_v16f32_v16i16 = call <16 x i16> @llvm.vp.fptoui.v16float.v16i16(<16 x float> undef, <16 x i1> undef, i32 undef) - %vp_v16f64_v16i16 = call <16 x i16> @llvm.vp.fptoui.v16double.v16i16(<16 x double> undef, <16 x i1> undef, i32 undef) - %vp_v16f32_v16i32 = call <16 x i32> @llvm.vp.fptoui.v16float.v16i32(<16 x float> undef, <16 x i1> undef, i32 undef) - %vp_v16f64_v16i32 = call <16 x i32> @llvm.vp.fptoui.v16double.v16i32(<16 x double> undef, <16 x i1> undef, i32 undef) - %vp_v16f32_v16i64 = call <16 x i64> @llvm.vp.fptoui.v16float.v16i64(<16 x float> undef, <16 x i1> undef, i32 undef) - %vp_v16f64_v16i64 = call <16 x i64> @llvm.vp.fptoui.v16double.v16i64(<16 x double> undef, <16 x i1> undef, i32 undef) - 
%vp_v16f32_v16i1 = call <16 x i1> @llvm.vp.fptoui.v16float.v16i1(<16 x float> undef, <16 x i1> undef, i32 undef) - %vp_v16f64_v16i1 = call <16 x i1> @llvm.vp.fptoui.v16double.v16i1(<16 x double> undef, <16 x i1> undef, i32 undef) + %vp_v16f32_v16i8 = call <16 x i8> @llvm.vp.fptoui.v16i8.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef) + %vp_v16f64_v16i8 = call <16 x i8> @llvm.vp.fptoui.v16i8.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef) + %vp_v16f32_v16i16 = call <16 x i16> @llvm.vp.fptoui.v16i16.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef) + %vp_v16f64_v16i16 = call <16 x i16> @llvm.vp.fptoui.v16i16.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef) + %vp_v16f32_v16i32 = call <16 x i32> @llvm.vp.fptoui.v16i32.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef) + %vp_v16f64_v16i32 = call <16 x i32> @llvm.vp.fptoui.v16i32.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef) + %vp_v16f32_v16i64 = call <16 x i64> @llvm.vp.fptoui.v16i64.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef) + %vp_v16f64_v16i64 = call <16 x i64> @llvm.vp.fptoui.v16i64.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef) + %vp_v16f32_v16i1 = call <16 x i1> @llvm.vp.fptoui.v16i1.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef) + %vp_v16f64_v16i1 = call <16 x i1> @llvm.vp.fptoui.v16i1.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef) %v32f32_v32i8 = fptoui <32 x float> undef to <32 x i8> %v32f64_v32i8 = fptoui <32 x double> undef to <32 x i8> @@ -4929,16 +4929,16 @@ define void @fptoui() { %v32f32_v32i1 = fptoui <32 x float> undef to <32 x i1> %v32f64_v32i1 = fptoui <32 x double> undef to <32 x i1> - %vp_v32f32_v32i8 = call <32 x i8> @llvm.vp.fptoui.v32float.v32i8(<32 x float> undef, <32 x i1> undef, i32 undef) - %vp_v32f64_v32i8 = call <32 x i8> @llvm.vp.fptoui.v32double.v32i8(<32 x double> undef, <32 x i1> undef, i32 undef) - %vp_v32f32_v32i16 = call <32 x i16> @llvm.vp.fptoui.v32float.v32i16(<32 x float> undef, <32 x i1> undef, i32 undef) - %vp_v32f64_v32i16 = call <32 x i16> @llvm.vp.fptoui.v32double.v32i16(<32 x double> undef, <32 x i1> undef, i32 undef) - %vp_v32f32_v32i32 = call <32 x i32> @llvm.vp.fptoui.v32float.v32i32(<32 x float> undef, <32 x i1> undef, i32 undef) - %vp_v32f64_v32i32 = call <32 x i32> @llvm.vp.fptoui.v32double.v32i32(<32 x double> undef, <32 x i1> undef, i32 undef) - %vp_v32f32_v32i64 = call <32 x i64> @llvm.vp.fptoui.v32float.v32i64(<32 x float> undef, <32 x i1> undef, i32 undef) - %vp_v32f64_v32i64 = call <32 x i64> @llvm.vp.fptoui.v32double.v32i64(<32 x double> undef, <32 x i1> undef, i32 undef) - %vp_v32f32_v32i1 = call <32 x i1> @llvm.vp.fptoui.v32float.v32i1(<32 x float> undef, <32 x i1> undef, i32 undef) - %vp_v32f64_v32i1 = call <32 x i1> @llvm.vp.fptoui.v32double.v32i1(<32 x double> undef, <32 x i1> undef, i32 undef) + %vp_v32f32_v32i8 = call <32 x i8> @llvm.vp.fptoui.v32i8.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef) + %vp_v32f64_v32i8 = call <32 x i8> @llvm.vp.fptoui.v32i8.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef) + %vp_v32f32_v32i16 = call <32 x i16> @llvm.vp.fptoui.v32i16.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef) + %vp_v32f64_v32i16 = call <32 x i16> @llvm.vp.fptoui.v32i16.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef) + %vp_v32f32_v32i32 = call <32 x i32> @llvm.vp.fptoui.v32i32.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef) + %vp_v32f64_v32i32 = call <32 x i32> @llvm.vp.fptoui.v32i32.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef) + %vp_v32f32_v32i64 = call <32 
x i64> @llvm.vp.fptoui.v32i64.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef) + %vp_v32f64_v32i64 = call <32 x i64> @llvm.vp.fptoui.v32i64.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef) + %vp_v32f32_v32i1 = call <32 x i1> @llvm.vp.fptoui.v32i1.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef) + %vp_v32f64_v32i1 = call <32 x i1> @llvm.vp.fptoui.v32i1.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef) %v64f32_v64i8 = fptoui <64 x float> undef to <64 x i8> %v64f64_v64i8 = fptoui <64 x double> undef to <64 x i8> @@ -4951,16 +4951,16 @@ define void @fptoui() { %v64f32_v64i1 = fptoui <64 x float> undef to <64 x i1> %v64f64_v64i1 = fptoui <64 x double> undef to <64 x i1> - %vp_v64f32_v64i8 = call <64 x i8> @llvm.vp.fptoui.v64float.v64i8(<64 x float> undef, <64 x i1> undef, i32 undef) - %vp_v64f64_v64i8 = call <64 x i8> @llvm.vp.fptoui.v64double.v64i8(<64 x double> undef, <64 x i1> undef, i32 undef) - %vp_v64f32_v64i16 = call <64 x i16> @llvm.vp.fptoui.v64float.v64i16(<64 x float> undef, <64 x i1> undef, i32 undef) - %vp_v64f64_v64i16 = call <64 x i16> @llvm.vp.fptoui.v64double.v64i16(<64 x double> undef, <64 x i1> undef, i32 undef) - %vp_v64f32_v64i32 = call <64 x i32> @llvm.vp.fptoui.v64float.v64i32(<64 x float> undef, <64 x i1> undef, i32 undef) - %vp_v64f64_v64i32 = call <64 x i32> @llvm.vp.fptoui.v64double.v64i32(<64 x double> undef, <64 x i1> undef, i32 undef) - %vp_v64f32_v64i64 = call <64 x i64> @llvm.vp.fptoui.v64float.v64i64(<64 x float> undef, <64 x i1> undef, i32 undef) - %vp_v64f64_v64i64 = call <64 x i64> @llvm.vp.fptoui.v64double.v64i64(<64 x double> undef, <64 x i1> undef, i32 undef) - %vp_v64f32_v64i1 = call <64 x i1> @llvm.vp.fptoui.v64float.v64i1(<64 x float> undef, <64 x i1> undef, i32 undef) - %vp_v64f64_v64i1 = call <64 x i1> @llvm.vp.fptoui.v64double.v64i1(<64 x double> undef, <64 x i1> undef, i32 undef) + %vp_v64f32_v64i8 = call <64 x i8> @llvm.vp.fptoui.v64i8.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef) + %vp_v64f64_v64i8 = call <64 x i8> @llvm.vp.fptoui.v64i8.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef) + %vp_v64f32_v64i16 = call <64 x i16> @llvm.vp.fptoui.v64i16.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef) + %vp_v64f64_v64i16 = call <64 x i16> @llvm.vp.fptoui.v64i16.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef) + %vp_v64f32_v64i32 = call <64 x i32> @llvm.vp.fptoui.v64i32.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef) + %vp_v64f64_v64i32 = call <64 x i32> @llvm.vp.fptoui.v64i32.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef) + %vp_v64f32_v64i64 = call <64 x i64> @llvm.vp.fptoui.v64i64.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef) + %vp_v64f64_v64i64 = call <64 x i64> @llvm.vp.fptoui.v64i64.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef) + %vp_v64f32_v64i1 = call <64 x i1> @llvm.vp.fptoui.v64i1.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef) + %vp_v64f64_v64i1 = call <64 x i1> @llvm.vp.fptoui.v64i1.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef) %v128f32_v128i8 = fptoui <128 x float> undef to <128 x i8> %v128f64_v128i8 = fptoui <128 x double> undef to <128 x i8> @@ -4973,16 +4973,16 @@ define void @fptoui() { %v128f32_v128i1 = fptoui <128 x float> undef to <128 x i1> %v128f64_v128i1 = fptoui <128 x double> undef to <128 x i1> - %vp_v128f32_v128i8 = call <128 x i8> @llvm.vp.fptoui.v128float.v128i8(<128 x float> undef, <128 x i1> undef, i32 undef) - %vp_v128f64_v128i8 = call <128 x i8> @llvm.vp.fptoui.v128double.v128i8(<128 x double> undef, <128 x i1> undef, i32 
undef)
-  %vp_v128f32_v128i16 = call <128 x i16> @llvm.vp.fptoui.v128float.v128i16(<128 x float> undef, <128 x i1> undef, i32 undef)
-  %vp_v128f64_v128i16 = call <128 x i16> @llvm.vp.fptoui.v128double.v128i16(<128 x double> undef, <128 x i1> undef, i32 undef)
-  %vp_v128f32_v128i32 = call <128 x i32> @llvm.vp.fptoui.v128float.v128i32(<128 x float> undef, <128 x i1> undef, i32 undef)
-  %vp_v128f64_v128i32 = call <128 x i32> @llvm.vp.fptoui.v128double.v128i32(<128 x double> undef, <128 x i1> undef, i32 undef)
-  %vp_v128f32_v128i64 = call <128 x i64> @llvm.vp.fptoui.v128float.v128i64(<128 x float> undef, <128 x i1> undef, i32 undef)
-  %vp_v128f64_v128i64 = call <128 x i64> @llvm.vp.fptoui.v128double.v128i64(<128 x double> undef, <128 x i1> undef, i32 undef)
-  %vp_v128f32_v128i1 = call <128 x i1> @llvm.vp.fptoui.v128float.v128i1(<128 x float> undef, <128 x i1> undef, i32 undef)
-  %vp_v128f64_v128i1 = call <128 x i1> @llvm.vp.fptoui.v128double.v128i1(<128 x double> undef, <128 x i1> undef, i32 undef)
+  %vp_v128f32_v128i8 = call <128 x i8> @llvm.vp.fptoui.v128i8.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef)
+  %vp_v128f64_v128i8 = call <128 x i8> @llvm.vp.fptoui.v128i8.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef)
+  %vp_v128f32_v128i16 = call <128 x i16> @llvm.vp.fptoui.v128i16.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef)
+  %vp_v128f64_v128i16 = call <128 x i16> @llvm.vp.fptoui.v128i16.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef)
+  %vp_v128f32_v128i32 = call <128 x i32> @llvm.vp.fptoui.v128i32.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef)
+  %vp_v128f64_v128i32 = call <128 x i32> @llvm.vp.fptoui.v128i32.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef)
+  %vp_v128f32_v128i64 = call <128 x i64> @llvm.vp.fptoui.v128i64.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef)
+  %vp_v128f64_v128i64 = call <128 x i64> @llvm.vp.fptoui.v128i64.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef)
+  %vp_v128f32_v128i1 = call <128 x i1> @llvm.vp.fptoui.v128i1.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef)
+  %vp_v128f64_v128i1 = call <128 x i1> @llvm.vp.fptoui.v128i1.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef)
   %nxv1f32_nxv1i8 = fptoui <vscale x 1 x float> undef to <vscale x 1 x i8>
   %nxv1f64_nxv1i8 = fptoui <vscale x 1 x double> undef to <vscale x 1 x i8>
@@ -4995,16 +4995,16 @@ define void @fptoui() {
   %nxv1f32_nxv1i1 = fptoui <vscale x 1 x float> undef to <vscale x 1 x i1>
   %nxv1f64_nxv1i1 = fptoui <vscale x 1 x double> undef to <vscale x 1 x i1>
-  %vp_nxv1f32_nxv1i8 = call <vscale x 1 x i8> @llvm.vp.fptoui.nxv1float.nxv1i8(<vscale x 1 x float> undef, <vscale x 1 x i1> undef, i32 undef)
-  %vp_nxv1f64_nxv1i8 = call <vscale x 1 x i8> @llvm.vp.fptoui.nxv1double.nxv1i8(<vscale x 1 x double> undef, <vscale x 1 x i1> undef, i32 undef)
-  %vp_nxv1f32_nxv1i16 = call <vscale x 1 x i16> @llvm.vp.fptoui.nxv1float.nxv1i16(<vscale x 1 x float> undef, <vscale x 1 x i1> undef, i32 undef)
-  %vp_nxv1f64_nxv1i16 = call <vscale x 1 x i16> @llvm.vp.fptoui.nxv1double.nxv1i16(<vscale x 1 x double> undef, <vscale x 1 x i1> undef, i32 undef)
-  %vp_nxv1f32_nxv1i32 = call <vscale x 1 x i32> @llvm.vp.fptoui.nxv1float.nxv1i32(<vscale x 1 x float> undef, <vscale x 1 x i1> undef, i32 undef)
-  %vp_nxv1f64_nxv1i32 = call <vscale x 1 x i32> @llvm.vp.fptoui.nxv1double.nxv1i32(<vscale x 1 x double> undef, <vscale x 1 x i1> undef, i32 undef)
-  %vp_nxv1f32_nxv1i64 = call <vscale x 1 x i64> @llvm.vp.fptoui.nxv1float.nxv1i64(<vscale x 1 x float> undef, <vscale x 1 x i1> undef, i32 undef)
-  %vp_nxv1f64_nxv1i64 = call <vscale x 1 x i64> @llvm.vp.fptoui.nxv1double.nxv1i64(<vscale x 1 x double> undef, <vscale x 1 x i1> undef, i32 undef)
-  %vp_nxv1f32_nxv1i1 = call <vscale x 1 x i1> @llvm.vp.fptoui.nxv1float.nxv1i1(<vscale x 1 x float> undef, <vscale x 1 x i1> undef, i32 undef)
-  %vp_nxv1f64_nxv1i1 = call <vscale x 1 x i1> @llvm.vp.fptoui.nxv1double.nxv1i1(<vscale x 1 x double> undef, <vscale x 1 x i1> undef, i32 undef)
+  %vp_nxv1f32_nxv1i8 = call <vscale x 1 x i8> @llvm.vp.fptoui.nxv1i8.nxv1f32(<vscale x 1 x float> undef, <vscale x 1 x i1> undef, i32 undef)
+  %vp_nxv1f64_nxv1i8 = call <vscale x 1 x i8> @llvm.vp.fptoui.nxv1i8.nxv1f64(<vscale x 1 x double> undef, <vscale x 1 x i1> undef, i32 undef)
+  %vp_nxv1f32_nxv1i16 = call <vscale x 1 x i16> @llvm.vp.fptoui.nxv1i16.nxv1f32(<vscale x 1 x float> undef, <vscale x 1 x i1> undef, i32 undef)
+  %vp_nxv1f64_nxv1i16 = call <vscale x 1 x i16> @llvm.vp.fptoui.nxv1i16.nxv1f64(<vscale x 1 x double> undef, <vscale x 1 x i1> undef, i32 undef)
+  %vp_nxv1f32_nxv1i32 = call <vscale x 1 x i32> @llvm.vp.fptoui.nxv1i32.nxv1f32(<vscale x 1 x float> undef, <vscale x 1 x i1> undef, i32 undef)
+  %vp_nxv1f64_nxv1i32 = call <vscale x 1 x i32> @llvm.vp.fptoui.nxv1i32.nxv1f64(<vscale x 1 x double> undef, <vscale x 1 x i1> undef, i32 undef)
+  %vp_nxv1f32_nxv1i64 = call <vscale x 1 x i64> @llvm.vp.fptoui.nxv1i64.nxv1f32(<vscale x 1 x float> undef, <vscale x 1 x i1> undef, i32 undef)
+  %vp_nxv1f64_nxv1i64 = call <vscale x 1 x i64> @llvm.vp.fptoui.nxv1i64.nxv1f64(<vscale x 1 x double> undef, <vscale x 1 x i1> undef, i32 undef)
+  %vp_nxv1f32_nxv1i1 = call <vscale x 1 x i1> @llvm.vp.fptoui.nxv1i1.nxv1f32(<vscale x 1 x float> undef, <vscale x 1 x i1> undef, i32 undef)
+  %vp_nxv1f64_nxv1i1 = call <vscale x 1 x i1> @llvm.vp.fptoui.nxv1i1.nxv1f64(<vscale x 1 x double> undef, <vscale x 1 x i1> undef, i32 undef)
   %nxv2f32_nxv2i8 = fptoui <vscale x 2 x float> undef to <vscale x 2 x i8>
   %nxv2f64_nxv2i8 = fptoui <vscale x 2 x double> undef to <vscale x 2 x i8>
@@ -5017,16 +5017,16 @@ define void @fptoui() {
   %nxv2f32_nxv2i1 = fptoui <vscale x 2 x float> undef to <vscale x 2 x i1>
   %nxv2f64_nxv2i1 = fptoui <vscale x 2 x double> undef to <vscale x 2 x i1>
-  %vp_nxv2f32_nxv2i8 = call <vscale x 2 x i8> @llvm.vp.fptoui.nxv2float.nxv2i8(<vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
-  %vp_nxv2f64_nxv2i8 = call <vscale x 2 x i8> @llvm.vp.fptoui.nxv2double.nxv2i8(<vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
-  %vp_nxv2f32_nxv2i16 = call <vscale x 2 x i16> @llvm.vp.fptoui.nxv2float.nxv2i16(<vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
-  %vp_nxv2f64_nxv2i16 = call <vscale x 2 x i16> @llvm.vp.fptoui.nxv2double.nxv2i16(<vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
-  %vp_nxv2f32_nxv2i32 = call <vscale x 2 x i32> @llvm.vp.fptoui.nxv2float.nxv2i32(<vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
-  %vp_nxv2f64_nxv2i32 = call <vscale x 2 x i32> @llvm.vp.fptoui.nxv2double.nxv2i32(<vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
-  %vp_nxv2f32_nxv2i64 = call <vscale x 2 x i64> @llvm.vp.fptoui.nxv2float.nxv2i64(<vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
-  %vp_nxv2f64_nxv2i64 = call <vscale x 2 x i64> @llvm.vp.fptoui.nxv2double.nxv2i64(<vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
-  %vp_nxv2f32_nxv2i1 = call <vscale x 2 x i1> @llvm.vp.fptoui.nxv2float.nxv2i1(<vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
-  %vp_nxv2f64_nxv2i1 = call <vscale x 2 x i1> @llvm.vp.fptoui.nxv2double.nxv2i1(<vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
+  %vp_nxv2f32_nxv2i8 = call <vscale x 2 x i8> @llvm.vp.fptoui.nxv2i8.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
+  %vp_nxv2f64_nxv2i8 = call <vscale x 2 x i8> @llvm.vp.fptoui.nxv2i8.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
+  %vp_nxv2f32_nxv2i16 = call <vscale x 2 x i16> @llvm.vp.fptoui.nxv2i16.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
+  %vp_nxv2f64_nxv2i16 = call <vscale x 2 x i16> @llvm.vp.fptoui.nxv2i16.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
+  %vp_nxv2f32_nxv2i32 = call <vscale x 2 x i32> @llvm.vp.fptoui.nxv2i32.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
+  %vp_nxv2f64_nxv2i32 = call <vscale x 2 x i32> @llvm.vp.fptoui.nxv2i32.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
+  %vp_nxv2f32_nxv2i64 = call <vscale x 2 x i64> @llvm.vp.fptoui.nxv2i64.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
+  %vp_nxv2f64_nxv2i64 = call <vscale x 2 x i64> @llvm.vp.fptoui.nxv2i64.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
+  %vp_nxv2f32_nxv2i1 = call <vscale x 2 x i1> @llvm.vp.fptoui.nxv2i1.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
+  %vp_nxv2f64_nxv2i1 = call <vscale x 2 x i1> @llvm.vp.fptoui.nxv2i1.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
   %nxv4f32_nxv4i8 = fptoui <vscale x 4 x float> undef to <vscale x 4 x i8>
   %nxv4f64_nxv4i8 = fptoui <vscale x 4 x double> undef to <vscale x 4 x i8>
@@ -5039,16 +5039,16 @@ define void @fptoui() {
   %nxv4f32_nxv4i1 = fptoui <vscale x 4 x float> undef to <vscale x 4 x i1>
   %nxv4f64_nxv4i1 = fptoui <vscale x 4 x double> undef to <vscale x 4 x i1>
-  %vp_nxv4f32_nxv4i8 = call <vscale x 4 x i8> @llvm.vp.fptoui.nxv4float.nxv4i8(<vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
-  %vp_nxv4f64_nxv4i8 = call <vscale x 4 x i8> @llvm.vp.fptoui.nxv4double.nxv4i8(<vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
-  %vp_nxv4f32_nxv4i16 = call <vscale x 4 x i16> @llvm.vp.fptoui.nxv4float.nxv4i16(<vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
-  %vp_nxv4f64_nxv4i16 = call <vscale x 4 x i16> @llvm.vp.fptoui.nxv4double.nxv4i16(<vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
-  %vp_nxv4f32_nxv4i32 = call <vscale x 4 x i32> @llvm.vp.fptoui.nxv4float.nxv4i32(<vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
-  %vp_nxv4f64_nxv4i32 = call <vscale x 4 x i32> @llvm.vp.fptoui.nxv4double.nxv4i32(<vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
-  %vp_nxv4f32_nxv4i64 = call <vscale x 4 x i64> @llvm.vp.fptoui.nxv4float.nxv4i64(<vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
-  %vp_nxv4f64_nxv4i64 = call <vscale x 4 x i64> @llvm.vp.fptoui.nxv4double.nxv4i64(<vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
-  %vp_nxv4f32_nxv4i1 = call <vscale x 4 x i1> @llvm.vp.fptoui.nxv4float.nxv4i1(<vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
-  %vp_nxv4f64_nxv4i1 = call <vscale x 4 x i1> @llvm.vp.fptoui.nxv4double.nxv4i1(<vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
+  %vp_nxv4f32_nxv4i8 = call <vscale x 4 x i8> @llvm.vp.fptoui.nxv4i8.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
+  %vp_nxv4f64_nxv4i8 = call <vscale x 4 x i8> @llvm.vp.fptoui.nxv4i8.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
+  %vp_nxv4f32_nxv4i16 = call <vscale x 4 x i16> @llvm.vp.fptoui.nxv4i16.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
+  %vp_nxv4f64_nxv4i16 = call <vscale x 4 x i16> @llvm.vp.fptoui.nxv4i16.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
+  %vp_nxv4f32_nxv4i32 = call <vscale x 4 x i32> @llvm.vp.fptoui.nxv4i32.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
+  %vp_nxv4f64_nxv4i32 = call <vscale x 4 x i32> @llvm.vp.fptoui.nxv4i32.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
+  %vp_nxv4f32_nxv4i64 = call <vscale x 4 x i64> @llvm.vp.fptoui.nxv4i64.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
+  %vp_nxv4f64_nxv4i64 = call <vscale x 4 x i64> @llvm.vp.fptoui.nxv4i64.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
+  %vp_nxv4f32_nxv4i1 = call <vscale x 4 x i1> @llvm.vp.fptoui.nxv4i1.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
+  %vp_nxv4f64_nxv4i1 = call <vscale x 4 x i1> @llvm.vp.fptoui.nxv4i1.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
   %nxv8f32_nxv8i8 = fptoui <vscale x 8 x float> undef to <vscale x 8 x i8>
   %nxv8f64_nxv8i8 = fptoui <vscale x 8 x double> undef to <vscale x 8 x i8>
@@ -5061,16 +5061,16 @@ define void @fptoui() {
   %nxv8f32_nxv8i1 = fptoui <vscale x 8 x float> undef to <vscale x 8 x i1>
   %nxv8f64_nxv8i1 = fptoui <vscale x 8 x double> undef to <vscale x 8 x i1>
-  %vp_nxv8f32_nxv8i8 = call <vscale x 8 x i8> @llvm.vp.fptoui.nxv8float.nxv8i8(<vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
-  %vp_nxv8f64_nxv8i8 = call <vscale x 8 x i8> @llvm.vp.fptoui.nxv8double.nxv8i8(<vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
-  %vp_nxv8f32_nxv8i16 = call <vscale x 8 x i16> @llvm.vp.fptoui.nxv8float.nxv8i16(<vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
-  %vp_nxv8f64_nxv8i16 = call <vscale x 8 x i16> @llvm.vp.fptoui.nxv8double.nxv8i16(<vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
-  %vp_nxv8f32_nxv8i32 = call <vscale x 8 x i32> @llvm.vp.fptoui.nxv8float.nxv8i32(<vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
-  %vp_nxv8f64_nxv8i32 = call <vscale x 8 x i32> @llvm.vp.fptoui.nxv8double.nxv8i32(<vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
-  %vp_nxv8f32_nxv8i64 = call <vscale x 8 x i64> @llvm.vp.fptoui.nxv8float.nxv8i64(<vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
-  %vp_nxv8f64_nxv8i64 = call <vscale x 8 x i64> @llvm.vp.fptoui.nxv8double.nxv8i64(<vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
-  %vp_nxv8f32_nxv8i1 = call <vscale x 8 x i1> @llvm.vp.fptoui.nxv8float.nxv8i1(<vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
-  %vp_nxv8f64_nxv8i1 = call <vscale x 8 x i1> @llvm.vp.fptoui.nxv8double.nxv8i1(<vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
+  %vp_nxv8f32_nxv8i8 = call <vscale x 8 x i8> @llvm.vp.fptoui.nxv8i8.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
+  %vp_nxv8f64_nxv8i8 = call <vscale x 8 x i8> @llvm.vp.fptoui.nxv8i8.nxv8f64(<vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
+  %vp_nxv8f32_nxv8i16 = call <vscale x 8 x i16> @llvm.vp.fptoui.nxv8i16.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
+  %vp_nxv8f64_nxv8i16 = call <vscale x 8 x i16> @llvm.vp.fptoui.nxv8i16.nxv8f64(<vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
+  %vp_nxv8f32_nxv8i32 = call <vscale x 8 x i32> @llvm.vp.fptoui.nxv8i32.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
+  %vp_nxv8f64_nxv8i32 = call <vscale x 8 x i32> @llvm.vp.fptoui.nxv8i32.nxv8f64(<vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
+  %vp_nxv8f32_nxv8i64 = call <vscale x 8 x i64> @llvm.vp.fptoui.nxv8i64.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
+  %vp_nxv8f64_nxv8i64 = call <vscale x 8 x i64> @llvm.vp.fptoui.nxv8i64.nxv8f64(<vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
+  %vp_nxv8f32_nxv8i1 = call <vscale x 8 x i1> @llvm.vp.fptoui.nxv8i1.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
+  %vp_nxv8f64_nxv8i1 = call <vscale x 8 x i1> @llvm.vp.fptoui.nxv8i1.nxv8f64(<vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
   %nxv16f32_nxv16i8 = fptoui <vscale x 16 x float> undef to <vscale x 16 x i8>
   %nxv16f64_nxv16i8 = fptoui <vscale x 16 x double> undef to <vscale x 16 x i8>
@@ -5083,16 +5083,16 @@ define void @fptoui() {
   %nxv16f32_nxv16i1 = fptoui <vscale x 16 x float> undef to <vscale x 16 x i1>
   %nxv16f64_nxv16i1 = fptoui <vscale x 16 x double> undef to <vscale x 16 x i1>
-  %vp_nxv16f32_nxv16i8 = call <vscale x 16 x i8> @llvm.vp.fptoui.nxv16float.nxv16i8(<vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
-  %vp_nxv16f64_nxv16i8 = call <vscale x 16 x i8> @llvm.vp.fptoui.nxv16double.nxv16i8(<vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
-  %vp_nxv16f32_nxv16i16 = call <vscale x 16 x i16> @llvm.vp.fptoui.nxv16float.nxv16i16(<vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
-  %vp_nxv16f64_nxv16i16 = call <vscale x 16 x i16> @llvm.vp.fptoui.nxv16double.nxv16i16(<vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
-  %vp_nxv16f32_nxv16i32 = call <vscale x 16 x i32> @llvm.vp.fptoui.nxv16float.nxv16i32(<vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
-  %vp_nxv16f64_nxv16i32 = call <vscale x 16 x i32> @llvm.vp.fptoui.nxv16double.nxv16i32(<vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
-  %vp_nxv16f32_nxv16i64 = call <vscale x 16 x i64> @llvm.vp.fptoui.nxv16float.nxv16i64(<vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
-  %vp_nxv16f64_nxv16i64 = call <vscale x 16 x i64> @llvm.vp.fptoui.nxv16double.nxv16i64(<vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
-  %vp_nxv16f32_nxv16i1 = call <vscale x 16 x i1> @llvm.vp.fptoui.nxv16float.nxv16i1(<vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
-  %vp_nxv16f64_nxv16i1 = call <vscale x 16 x i1> @llvm.vp.fptoui.nxv16double.nxv16i1(<vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
+  %vp_nxv16f32_nxv16i8 = call <vscale x 16 x i8> @llvm.vp.fptoui.nxv16i8.nxv16f32(<vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
+  %vp_nxv16f64_nxv16i8 = call <vscale x 16 x i8> @llvm.vp.fptoui.nxv16i8.nxv16f64(<vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
+  %vp_nxv16f32_nxv16i16 = call <vscale x 16 x i16> @llvm.vp.fptoui.nxv16i16.nxv16f32(<vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
+  %vp_nxv16f64_nxv16i16 = call <vscale x 16 x i16> @llvm.vp.fptoui.nxv16i16.nxv16f64(<vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
+  %vp_nxv16f32_nxv16i32 = call <vscale x 16 x i32> @llvm.vp.fptoui.nxv16i32.nxv16f32(<vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
+  %vp_nxv16f64_nxv16i32 = call <vscale x 16 x i32> @llvm.vp.fptoui.nxv16i32.nxv16f64(<vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
+  %vp_nxv16f32_nxv16i64 = call <vscale x 16 x i64> @llvm.vp.fptoui.nxv16i64.nxv16f32(<vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
+  %vp_nxv16f64_nxv16i64 = call <vscale x 16 x i64> @llvm.vp.fptoui.nxv16i64.nxv16f64(<vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
+  %vp_nxv16f32_nxv16i1 = call <vscale x 16 x i1> @llvm.vp.fptoui.nxv16i1.nxv16f32(<vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
+  %vp_nxv16f64_nxv16i1 = call <vscale x 16 x i1> @llvm.vp.fptoui.nxv16i1.nxv16f64(<vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
   %nxv32f32_nxv32i8 = fptoui <vscale x 32 x float> undef to <vscale x 32 x i8>
   %nxv32f64_nxv32i8 = fptoui <vscale x 32 x double> undef to <vscale x 32 x i8>
@@ -5105,16 +5105,16 @@ define void @fptoui() {
   %nxv32f32_nxv32i1 = fptoui <vscale x 32 x float> undef to <vscale x 32 x i1>
   %nxv32f64_nxv32i1 = fptoui <vscale x 32 x double> undef to <vscale x 32 x i1>
-  %vp_nxv32f32_nxv32i8 = call <vscale x 32 x i8> @llvm.vp.fptoui.nxv32float.nxv32i8(<vscale x 32 x float> undef, <vscale x 32 x i1> undef, i32 undef)
-  %vp_nxv32f64_nxv32i8 = call <vscale x 32 x i8> @llvm.vp.fptoui.nxv32double.nxv32i8(<vscale x 32 x double> undef, <vscale x 32 x i1> undef, i32 undef)
-  %vp_nxv32f32_nxv32i16 = call <vscale x 32 x i16> @llvm.vp.fptoui.nxv32float.nxv32i16(<vscale x 32 x float> undef, <vscale x 32 x i1> undef, i32 undef)
-  %vp_nxv32f64_nxv32i16 = call <vscale x 32 x i16> @llvm.vp.fptoui.nxv32double.nxv32i16(<vscale x 32 x double> undef, <vscale x 32 x i1> undef, i32 undef)
-  %vp_nxv32f32_nxv32i32 = call <vscale x 32 x i32> @llvm.vp.fptoui.nxv32float.nxv32i32(<vscale x 32 x float> undef, <vscale x 32 x i1> undef, i32 undef)
-  %vp_nxv32f64_nxv32i32 = call <vscale x 32 x i32> @llvm.vp.fptoui.nxv32double.nxv32i32(<vscale x 32 x double> undef, <vscale x 32 x i1> undef, i32 undef)
-  %vp_nxv32f32_nxv32i64 = call <vscale x 32 x i64> @llvm.vp.fptoui.nxv32float.nxv32i64(<vscale x 32 x float> undef, <vscale x 32 x i1> undef, i32 undef)
-  %vp_nxv32f64_nxv32i64 = call <vscale x 32 x i64> @llvm.vp.fptoui.nxv32double.nxv32i64(<vscale x 32 x double> undef, <vscale x 32 x i1> undef, i32 undef)
-  %vp_nxv32f32_nxv32i1 = call <vscale x 32 x i1> @llvm.vp.fptoui.nxv32float.nxv32i1(<vscale x 32 x float> undef, <vscale x 32 x i1> undef, i32 undef)
-  %vp_nxv32f64_nxv32i1 = call <vscale x 32 x i1> @llvm.vp.fptoui.nxv32double.nxv32i1(<vscale x 32 x double> undef, <vscale x 32 x i1> undef, i32 undef)
+  %vp_nxv32f32_nxv32i8 = call <vscale x 32 x i8> @llvm.vp.fptoui.nxv32i8.nxv32f32(<vscale x 32 x float> undef, <vscale x 32 x i1> undef, i32 undef)
+  %vp_nxv32f64_nxv32i8 = call <vscale x 32 x i8> @llvm.vp.fptoui.nxv32i8.nxv32f64(<vscale x 32 x double> undef, <vscale x 32 x i1> undef, i32 undef)
+  %vp_nxv32f32_nxv32i16 = call <vscale x 32 x i16> @llvm.vp.fptoui.nxv32i16.nxv32f32(<vscale x 32 x float> undef, <vscale x 32 x i1> undef, i32 undef)
+  %vp_nxv32f64_nxv32i16 = call <vscale x 32 x i16> @llvm.vp.fptoui.nxv32i16.nxv32f64(<vscale x 32 x double> undef, <vscale x 32 x i1> undef, i32 undef)
+  %vp_nxv32f32_nxv32i32 = call <vscale x 32 x i32> @llvm.vp.fptoui.nxv32i32.nxv32f32(<vscale x 32 x float> undef, <vscale x 32 x i1> undef, i32 undef)
+  %vp_nxv32f64_nxv32i32 = call <vscale x 32 x i32> @llvm.vp.fptoui.nxv32i32.nxv32f64(<vscale x 32 x double> undef, <vscale x 32 x i1> undef, i32 undef)
+  %vp_nxv32f32_nxv32i64 = call <vscale x 32 x i64> @llvm.vp.fptoui.nxv32i64.nxv32f32(<vscale x 32 x float> undef, <vscale x 32 x i1> undef, i32 undef)
+  %vp_nxv32f64_nxv32i64 = call <vscale x 32 x i64> @llvm.vp.fptoui.nxv32i64.nxv32f64(<vscale x 32 x double> undef, <vscale x 32 x i1> undef, i32 undef)
+  %vp_nxv32f32_nxv32i1 = call <vscale x 32 x i1> @llvm.vp.fptoui.nxv32i1.nxv32f32(<vscale x 32 x float> undef, <vscale x 32 x i1> undef, i32 undef)
+  %vp_nxv32f64_nxv32i1 = call <vscale x 32 x i1> @llvm.vp.fptoui.nxv32i1.nxv32f64(<vscale x 32 x double> undef, <vscale x 32 x i1> undef, i32 undef)
   %nxv64f32_nxv64i8 = fptoui <vscale x 64 x float> undef to <vscale x 64 x i8>
   %nxv64f64_nxv64i8 = fptoui <vscale x 64 x double> undef to <vscale x 64 x i8>
@@ -5127,16 +5127,16 @@ define void @fptoui() {
   %nxv64f32_nxv64i1 = fptoui <vscale x 64 x float> undef to <vscale x 64 x i1>
   %nxv64f64_nxv64i1 = fptoui <vscale x 64 x double> undef to <vscale x 64 x i1>
-
%vp_nxv64f32_nxv64i8 = call @llvm.vp.fptoui.nxv64float.nxv64i8( undef, undef, i32 undef) - %vp_nxv64f64_nxv64i8 = call @llvm.vp.fptoui.nxv64double.nxv64i8( undef, undef, i32 undef) - %vp_nxv64f32_nxv64i16 = call @llvm.vp.fptoui.nxv64float.nxv64i16( undef, undef, i32 undef) - %vp_nxv64f64_nxv64i16 = call @llvm.vp.fptoui.nxv64double.nxv64i16( undef, undef, i32 undef) - %vp_nxv64f32_nxv64i32 = call @llvm.vp.fptoui.nxv64float.nxv64i32( undef, undef, i32 undef) - %vp_nxv64f64_nxv64i32 = call @llvm.vp.fptoui.nxv64double.nxv64i32( undef, undef, i32 undef) - %vp_nxv64f32_nxv64i64 = call @llvm.vp.fptoui.nxv64float.nxv64i64( undef, undef, i32 undef) - %vp_nxv64f64_nxv64i64 = call @llvm.vp.fptoui.nxv64double.nxv64i64( undef, undef, i32 undef) - %vp_nxv64f32_nxv64i1 = call @llvm.vp.fptoui.nxv64float.nxv64i1( undef, undef, i32 undef) - %vp_nxv64f64_nxv64i1 = call @llvm.vp.fptoui.nxv64double.nxv64i1( undef, undef, i32 undef) + %vp_nxv64f32_nxv64i8 = call @llvm.vp.fptoui.nxv64i8.nxv64f32( undef, undef, i32 undef) + %vp_nxv64f64_nxv64i8 = call @llvm.vp.fptoui.nxv64i8.nxv64f64( undef, undef, i32 undef) + %vp_nxv64f32_nxv64i16 = call @llvm.vp.fptoui.nxv64i16.nxv64f32( undef, undef, i32 undef) + %vp_nxv64f64_nxv64i16 = call @llvm.vp.fptoui.nxv64i16.nxv64f64( undef, undef, i32 undef) + %vp_nxv64f32_nxv64i32 = call @llvm.vp.fptoui.nxv64i32.nxv64f32( undef, undef, i32 undef) + %vp_nxv64f64_nxv64i32 = call @llvm.vp.fptoui.nxv64i32.nxv64f64( undef, undef, i32 undef) + %vp_nxv64f32_nxv64i64 = call @llvm.vp.fptoui.nxv64i64.nxv64f32( undef, undef, i32 undef) + %vp_nxv64f64_nxv64i64 = call @llvm.vp.fptoui.nxv64i64.nxv64f64( undef, undef, i32 undef) + %vp_nxv64f32_nxv64i1 = call @llvm.vp.fptoui.nxv64i1.nxv64f32( undef, undef, i32 undef) + %vp_nxv64f64_nxv64i1 = call @llvm.vp.fptoui.nxv64i1.nxv64f64( undef, undef, i32 undef) ret void } @@ -5719,16 +5719,16 @@ define void @sitofp() { %v2i1_v2f32 = sitofp <2 x i1> undef to <2 x float> %v2i1_v2f64 = sitofp <2 x i1> undef to <2 x double> - %vp_v2fi8_v2f32 = call <2 x float> @llvm.vp.sitofp.v2i8.v2float(<2 x i8> undef, <2 x i1> undef, i32 undef) - %vp_v2fi8_v2f64 = call <2 x double> @llvm.vp.sitofp.v2i8.v2double(<2 x i8> undef, <2 x i1> undef, i32 undef) - %vp_v2fi16_v2f32 = call <2 x float> @llvm.vp.sitofp.v2i16.v2float(<2 x i16> undef, <2 x i1> undef, i32 undef) - %vp_v2fi16_v2f64 = call <2 x double> @llvm.vp.sitofp.v2i16.v2double(<2 x i16> undef, <2 x i1> undef, i32 undef) - %vp_v2fi32_v2f32 = call <2 x float> @llvm.vp.sitofp.v2i32.v2float(<2 x i32> undef, <2 x i1> undef, i32 undef) - %vp_v2fi32_v2f64 = call <2 x double> @llvm.vp.sitofp.v2i32.v2double(<2 x i32> undef, <2 x i1> undef, i32 undef) - %vp_v2fi64_v2f32 = call <2 x float> @llvm.vp.sitofp.v2i64.v2float(<2 x i64> undef, <2 x i1> undef, i32 undef) - %vp_v2fi64_v2f64 = call <2 x double> @llvm.vp.sitofp.v2i64.v2double(<2 x i64> undef, <2 x i1> undef, i32 undef) - %vp_v2fi1_v2f32 = call <2 x float> @llvm.vp.sitofp.v2i1.v2float(<2 x i1> undef, <2 x i1> undef, i32 undef) - %vp_v2fi1_v2f64 = call <2 x double> @llvm.vp.sitofp.v2i1.v2double(<2 x i1> undef, <2 x i1> undef, i32 undef) + %vp_v2fi8_v2f32 = call <2 x float> @llvm.vp.sitofp.v2f32.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef) + %vp_v2fi8_v2f64 = call <2 x double> @llvm.vp.sitofp.v2f64.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef) + %vp_v2fi16_v2f32 = call <2 x float> @llvm.vp.sitofp.v2f32.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef) + %vp_v2fi16_v2f64 = call <2 x double> @llvm.vp.sitofp.v2f64.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef) + 
%vp_v2fi32_v2f32 = call <2 x float> @llvm.vp.sitofp.v2f32.v2i32(<2 x i32> undef, <2 x i1> undef, i32 undef) + %vp_v2fi32_v2f64 = call <2 x double> @llvm.vp.sitofp.v2f64.v2i32(<2 x i32> undef, <2 x i1> undef, i32 undef) + %vp_v2fi64_v2f32 = call <2 x float> @llvm.vp.sitofp.v2f32.v2i64(<2 x i64> undef, <2 x i1> undef, i32 undef) + %vp_v2fi64_v2f64 = call <2 x double> @llvm.vp.sitofp.v2f64.v2i64(<2 x i64> undef, <2 x i1> undef, i32 undef) + %vp_v2fi1_v2f32 = call <2 x float> @llvm.vp.sitofp.v2f32.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef) + %vp_v2fi1_v2f64 = call <2 x double> @llvm.vp.sitofp.v2f64.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef) %v4i8_v4f32 = sitofp <4 x i8> undef to <4 x float> %v4i8_v4f64 = sitofp <4 x i8> undef to <4 x double> @@ -5741,16 +5741,16 @@ define void @sitofp() { %v4i1_v4f32 = sitofp <4 x i1> undef to <4 x float> %v4i1_v4f64 = sitofp <4 x i1> undef to <4 x double> - %vp_v4fi8_v4f32 = call <4 x float> @llvm.vp.sitofp.v4i8.v4float(<4 x i8> undef, <4 x i1> undef, i32 undef) - %vp_v4fi8_v4f64 = call <4 x double> @llvm.vp.sitofp.v4i8.v4double(<4 x i8> undef, <4 x i1> undef, i32 undef) - %vp_v4fi16_v4f32 = call <4 x float> @llvm.vp.sitofp.v4i16.v4float(<4 x i16> undef, <4 x i1> undef, i32 undef) - %vp_v4fi16_v4f64 = call <4 x double> @llvm.vp.sitofp.v4i16.v4double(<4 x i16> undef, <4 x i1> undef, i32 undef) - %vp_v4fi32_v4f32 = call <4 x float> @llvm.vp.sitofp.v4i32.v4float(<4 x i32> undef, <4 x i1> undef, i32 undef) - %vp_v4fi32_v4f64 = call <4 x double> @llvm.vp.sitofp.v4i32.v4double(<4 x i32> undef, <4 x i1> undef, i32 undef) - %vp_v4fi64_v4f32 = call <4 x float> @llvm.vp.sitofp.v4i64.v4float(<4 x i64> undef, <4 x i1> undef, i32 undef) - %vp_v4fi64_v4f64 = call <4 x double> @llvm.vp.sitofp.v4i64.v4double(<4 x i64> undef, <4 x i1> undef, i32 undef) - %vp_v4fi1_v4f32 = call <4 x float> @llvm.vp.sitofp.v4i1.v4float(<4 x i1> undef, <4 x i1> undef, i32 undef) - %vp_v4fi1_v4f64 = call <4 x double> @llvm.vp.sitofp.v4i1.v4double(<4 x i1> undef, <4 x i1> undef, i32 undef) + %vp_v4fi8_v4f32 = call <4 x float> @llvm.vp.sitofp.v4f32.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef) + %vp_v4fi8_v4f64 = call <4 x double> @llvm.vp.sitofp.v4f64.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef) + %vp_v4fi16_v4f32 = call <4 x float> @llvm.vp.sitofp.v4f32.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef) + %vp_v4fi16_v4f64 = call <4 x double> @llvm.vp.sitofp.v4f64.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef) + %vp_v4fi32_v4f32 = call <4 x float> @llvm.vp.sitofp.v4f32.v4i32(<4 x i32> undef, <4 x i1> undef, i32 undef) + %vp_v4fi32_v4f64 = call <4 x double> @llvm.vp.sitofp.v4f64.v4i32(<4 x i32> undef, <4 x i1> undef, i32 undef) + %vp_v4fi64_v4f32 = call <4 x float> @llvm.vp.sitofp.v4f32.v4i64(<4 x i64> undef, <4 x i1> undef, i32 undef) + %vp_v4fi64_v4f64 = call <4 x double> @llvm.vp.sitofp.v4f64.v4i64(<4 x i64> undef, <4 x i1> undef, i32 undef) + %vp_v4fi1_v4f32 = call <4 x float> @llvm.vp.sitofp.v4f32.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef) + %vp_v4fi1_v4f64 = call <4 x double> @llvm.vp.sitofp.v4f64.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef) %v8i8_v8f32 = sitofp <8 x i8> undef to <8 x float> %v8i8_v8f64 = sitofp <8 x i8> undef to <8 x double> @@ -5763,16 +5763,16 @@ define void @sitofp() { %v8i1_v8f32 = sitofp <8 x i1> undef to <8 x float> %v8i1_v8f64 = sitofp <8 x i1> undef to <8 x double> - %vp_v8fi8_v8f32 = call <8 x float> @llvm.vp.sitofp.v8i8.v8float(<8 x i8> undef, <8 x i1> undef, i32 undef) - %vp_v8fi8_v8f64 = call <8 x double> @llvm.vp.sitofp.v8i8.v8double(<8 
x i8> undef, <8 x i1> undef, i32 undef) - %vp_v8fi16_v8f32 = call <8 x float> @llvm.vp.sitofp.v8i16.v8float(<8 x i16> undef, <8 x i1> undef, i32 undef) - %vp_v8fi16_v8f64 = call <8 x double> @llvm.vp.sitofp.v8i16.v8double(<8 x i16> undef, <8 x i1> undef, i32 undef) - %vp_v8fi32_v8f32 = call <8 x float> @llvm.vp.sitofp.v8i32.v8float(<8 x i32> undef, <8 x i1> undef, i32 undef) - %vp_v8fi32_v8f64 = call <8 x double> @llvm.vp.sitofp.v8i32.v8double(<8 x i32> undef, <8 x i1> undef, i32 undef) - %vp_v8fi64_v8f32 = call <8 x float> @llvm.vp.sitofp.v8i64.v8float(<8 x i64> undef, <8 x i1> undef, i32 undef) - %vp_v8fi64_v8f64 = call <8 x double> @llvm.vp.sitofp.v8i64.v8double(<8 x i64> undef, <8 x i1> undef, i32 undef) - %vp_v8fi1_v8f32 = call <8 x float> @llvm.vp.sitofp.v8i1.v8float(<8 x i1> undef, <8 x i1> undef, i32 undef) - %vp_v8fi1_v8f64 = call <8 x double> @llvm.vp.sitofp.v8i1.v8double(<8 x i1> undef, <8 x i1> undef, i32 undef) + %vp_v8fi8_v8f32 = call <8 x float> @llvm.vp.sitofp.v8f32.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef) + %vp_v8fi8_v8f64 = call <8 x double> @llvm.vp.sitofp.v8f64.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef) + %vp_v8fi16_v8f32 = call <8 x float> @llvm.vp.sitofp.v8f32.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef) + %vp_v8fi16_v8f64 = call <8 x double> @llvm.vp.sitofp.v8f64.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef) + %vp_v8fi32_v8f32 = call <8 x float> @llvm.vp.sitofp.v8f32.v8i32(<8 x i32> undef, <8 x i1> undef, i32 undef) + %vp_v8fi32_v8f64 = call <8 x double> @llvm.vp.sitofp.v8f64.v8i32(<8 x i32> undef, <8 x i1> undef, i32 undef) + %vp_v8fi64_v8f32 = call <8 x float> @llvm.vp.sitofp.v8f32.v8i64(<8 x i64> undef, <8 x i1> undef, i32 undef) + %vp_v8fi64_v8f64 = call <8 x double> @llvm.vp.sitofp.v8f64.v8i64(<8 x i64> undef, <8 x i1> undef, i32 undef) + %vp_v8fi1_v8f32 = call <8 x float> @llvm.vp.sitofp.v8f32.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef) + %vp_v8fi1_v8f64 = call <8 x double> @llvm.vp.sitofp.v8f64.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef) %v16i8_v16f32 = sitofp <16 x i8> undef to <16 x float> %v16i8_v16f64 = sitofp <16 x i8> undef to <16 x double> @@ -5785,16 +5785,16 @@ define void @sitofp() { %v16i1_v16f32 = sitofp <16 x i1> undef to <16 x float> %v16i1_v16f64 = sitofp <16 x i1> undef to <16 x double> - %vp_v16fi8_v16f32 = call <16 x float> @llvm.vp.sitofp.v16i8.v16float(<16 x i8> undef, <16 x i1> undef, i32 undef) - %vp_v16fi8_v16f64 = call <16 x double> @llvm.vp.sitofp.v16i8.v16double(<16 x i8> undef, <16 x i1> undef, i32 undef) - %vp_v16fi16_v16f32 = call <16 x float> @llvm.vp.sitofp.v16i16.v16float(<16 x i16> undef, <16 x i1> undef, i32 undef) - %vp_v16fi16_v16f64 = call <16 x double> @llvm.vp.sitofp.v16i16.v16double(<16 x i16> undef, <16 x i1> undef, i32 undef) - %vp_v16fi32_v16f32 = call <16 x float> @llvm.vp.sitofp.v16i32.v16float(<16 x i32> undef, <16 x i1> undef, i32 undef) - %vp_v16fi32_v16f64 = call <16 x double> @llvm.vp.sitofp.v16i32.v16double(<16 x i32> undef, <16 x i1> undef, i32 undef) - %vp_v16fi64_v16f32 = call <16 x float> @llvm.vp.sitofp.v16i64.v16float(<16 x i64> undef, <16 x i1> undef, i32 undef) - %vp_v16fi64_v16f64 = call <16 x double> @llvm.vp.sitofp.v16i64.v16double(<16 x i64> undef, <16 x i1> undef, i32 undef) - %vp_v16fi1_v16f32 = call <16 x float> @llvm.vp.sitofp.v16i1.v16float(<16 x i1> undef, <16 x i1> undef, i32 undef) - %vp_v16fi1_v16f64 = call <16 x double> @llvm.vp.sitofp.v16i1.v16double(<16 x i1> undef, <16 x i1> undef, i32 undef) + %vp_v16fi8_v16f32 = call <16 x float> 
@llvm.vp.sitofp.v16f32.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef) + %vp_v16fi8_v16f64 = call <16 x double> @llvm.vp.sitofp.v16f64.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef) + %vp_v16fi16_v16f32 = call <16 x float> @llvm.vp.sitofp.v16f32.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef) + %vp_v16fi16_v16f64 = call <16 x double> @llvm.vp.sitofp.v16f64.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef) + %vp_v16fi32_v16f32 = call <16 x float> @llvm.vp.sitofp.v16f32.v16i32(<16 x i32> undef, <16 x i1> undef, i32 undef) + %vp_v16fi32_v16f64 = call <16 x double> @llvm.vp.sitofp.v16f64.v16i32(<16 x i32> undef, <16 x i1> undef, i32 undef) + %vp_v16fi64_v16f32 = call <16 x float> @llvm.vp.sitofp.v16f32.v16i64(<16 x i64> undef, <16 x i1> undef, i32 undef) + %vp_v16fi64_v16f64 = call <16 x double> @llvm.vp.sitofp.v16f64.v16i64(<16 x i64> undef, <16 x i1> undef, i32 undef) + %vp_v16fi1_v16f32 = call <16 x float> @llvm.vp.sitofp.v16f32.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef) + %vp_v16fi1_v16f64 = call <16 x double> @llvm.vp.sitofp.v16f64.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef) %v32i8_v32f32 = sitofp <32 x i8> undef to <32 x float> %v32i8_v32f64 = sitofp <32 x i8> undef to <32 x double> @@ -5807,16 +5807,16 @@ define void @sitofp() { %v32i1_v32f32 = sitofp <32 x i1> undef to <32 x float> %v32i1_v32f64 = sitofp <32 x i1> undef to <32 x double> - %vp_v32fi8_v32f32 = call <32 x float> @llvm.vp.sitofp.v32i8.v32float(<32 x i8> undef, <32 x i1> undef, i32 undef) - %vp_v32fi8_v32f64 = call <32 x double> @llvm.vp.sitofp.v32i8.v32double(<32 x i8> undef, <32 x i1> undef, i32 undef) - %vp_v32fi16_v32f32 = call <32 x float> @llvm.vp.sitofp.v32i16.v32float(<32 x i16> undef, <32 x i1> undef, i32 undef) - %vp_v32fi16_v32f64 = call <32 x double> @llvm.vp.sitofp.v32i16.v32double(<32 x i16> undef, <32 x i1> undef, i32 undef) - %vp_v32fi32_v32f32 = call <32 x float> @llvm.vp.sitofp.v32i32.v32float(<32 x i32> undef, <32 x i1> undef, i32 undef) - %vp_v32fi32_v32f64 = call <32 x double> @llvm.vp.sitofp.v32i32.v32double(<32 x i32> undef, <32 x i1> undef, i32 undef) - %vp_v32fi64_v32f32 = call <32 x float> @llvm.vp.sitofp.v32i64.v32float(<32 x i64> undef, <32 x i1> undef, i32 undef) - %vp_v32fi64_v32f64 = call <32 x double> @llvm.vp.sitofp.v32i64.v32double(<32 x i64> undef, <32 x i1> undef, i32 undef) - %vp_v32fi1_v32f32 = call <32 x float> @llvm.vp.sitofp.v32i1.v32float(<32 x i1> undef, <32 x i1> undef, i32 undef) - %vp_v32fi1_v32f64 = call <32 x double> @llvm.vp.sitofp.v32i1.v32double(<32 x i1> undef, <32 x i1> undef, i32 undef) + %vp_v32fi8_v32f32 = call <32 x float> @llvm.vp.sitofp.v32f32.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef) + %vp_v32fi8_v32f64 = call <32 x double> @llvm.vp.sitofp.v32f64.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef) + %vp_v32fi16_v32f32 = call <32 x float> @llvm.vp.sitofp.v32f32.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef) + %vp_v32fi16_v32f64 = call <32 x double> @llvm.vp.sitofp.v32f64.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef) + %vp_v32fi32_v32f32 = call <32 x float> @llvm.vp.sitofp.v32f32.v32i32(<32 x i32> undef, <32 x i1> undef, i32 undef) + %vp_v32fi32_v32f64 = call <32 x double> @llvm.vp.sitofp.v32f64.v32i32(<32 x i32> undef, <32 x i1> undef, i32 undef) + %vp_v32fi64_v32f32 = call <32 x float> @llvm.vp.sitofp.v32f32.v32i64(<32 x i64> undef, <32 x i1> undef, i32 undef) + %vp_v32fi64_v32f64 = call <32 x double> @llvm.vp.sitofp.v32f64.v32i64(<32 x i64> undef, <32 x i1> undef, i32 undef) + %vp_v32fi1_v32f32 = call <32 x float> 
@llvm.vp.sitofp.v32f32.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef) + %vp_v32fi1_v32f64 = call <32 x double> @llvm.vp.sitofp.v32f64.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef) %v64i8_v64f32 = sitofp <64 x i8> undef to <64 x float> %v64i8_v64f64 = sitofp <64 x i8> undef to <64 x double> @@ -5829,16 +5829,16 @@ define void @sitofp() { %v64i1_v64f32 = sitofp <64 x i1> undef to <64 x float> %v64i1_v64f64 = sitofp <64 x i1> undef to <64 x double> - %vp_v64fi8_v64f32 = call <64 x float> @llvm.vp.sitofp.v64i8.v64float(<64 x i8> undef, <64 x i1> undef, i32 undef) - %vp_v64fi8_v64f64 = call <64 x double> @llvm.vp.sitofp.v64i8.v64double(<64 x i8> undef, <64 x i1> undef, i32 undef) - %vp_v64fi16_v64f32 = call <64 x float> @llvm.vp.sitofp.v64i16.v64float(<64 x i16> undef, <64 x i1> undef, i32 undef) - %vp_v64fi16_v64f64 = call <64 x double> @llvm.vp.sitofp.v64i16.v64double(<64 x i16> undef, <64 x i1> undef, i32 undef) - %vp_v64fi32_v64f32 = call <64 x float> @llvm.vp.sitofp.v64i32.v64float(<64 x i32> undef, <64 x i1> undef, i32 undef) - %vp_v64fi32_v64f64 = call <64 x double> @llvm.vp.sitofp.v64i32.v64double(<64 x i32> undef, <64 x i1> undef, i32 undef) - %vp_v64fi64_v64f32 = call <64 x float> @llvm.vp.sitofp.v64i64.v64float(<64 x i64> undef, <64 x i1> undef, i32 undef) - %vp_v64fi64_v64f64 = call <64 x double> @llvm.vp.sitofp.v64i64.v64double(<64 x i64> undef, <64 x i1> undef, i32 undef) - %vp_v64fi1_v64f32 = call <64 x float> @llvm.vp.sitofp.v64i1.v64float(<64 x i1> undef, <64 x i1> undef, i32 undef) - %vp_v64fi1_v64f64 = call <64 x double> @llvm.vp.sitofp.v64i1.v64double(<64 x i1> undef, <64 x i1> undef, i32 undef) + %vp_v64fi8_v64f32 = call <64 x float> @llvm.vp.sitofp.v64f32.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef) + %vp_v64fi8_v64f64 = call <64 x double> @llvm.vp.sitofp.v64f64.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef) + %vp_v64fi16_v64f32 = call <64 x float> @llvm.vp.sitofp.v64f32.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef) + %vp_v64fi16_v64f64 = call <64 x double> @llvm.vp.sitofp.v64f64.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef) + %vp_v64fi32_v64f32 = call <64 x float> @llvm.vp.sitofp.v64f32.v64i32(<64 x i32> undef, <64 x i1> undef, i32 undef) + %vp_v64fi32_v64f64 = call <64 x double> @llvm.vp.sitofp.v64f64.v64i32(<64 x i32> undef, <64 x i1> undef, i32 undef) + %vp_v64fi64_v64f32 = call <64 x float> @llvm.vp.sitofp.v64f32.v64i64(<64 x i64> undef, <64 x i1> undef, i32 undef) + %vp_v64fi64_v64f64 = call <64 x double> @llvm.vp.sitofp.v64f64.v64i64(<64 x i64> undef, <64 x i1> undef, i32 undef) + %vp_v64fi1_v64f32 = call <64 x float> @llvm.vp.sitofp.v64f32.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef) + %vp_v64fi1_v64f64 = call <64 x double> @llvm.vp.sitofp.v64f64.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef) %v128i8_v128f32 = sitofp <128 x i8> undef to <128 x float> %v128i8_v128f64 = sitofp <128 x i8> undef to <128 x double> @@ -5851,16 +5851,16 @@ define void @sitofp() { %v128i1_v128f32 = sitofp <128 x i1> undef to <128 x float> %v128i1_v128f64 = sitofp <128 x i1> undef to <128 x double> - %vp_v128fi8_v128f32 = call <128 x float> @llvm.vp.sitofp.v128i8.v128float(<128 x i8> undef, <128 x i1> undef, i32 undef) - %vp_v128fi8_v128f64 = call <128 x double> @llvm.vp.sitofp.v128i8.v128double(<128 x i8> undef, <128 x i1> undef, i32 undef) - %vp_v128fi16_v128f32 = call <128 x float> @llvm.vp.sitofp.v128i16.v128float(<128 x i16> undef, <128 x i1> undef, i32 undef) - %vp_v128fi16_v128f64 = call <128 x double> @llvm.vp.sitofp.v128i16.v128double(<128 
x i16> undef, <128 x i1> undef, i32 undef) - %vp_v128fi32_v128f32 = call <128 x float> @llvm.vp.sitofp.v128i32.v128float(<128 x i32> undef, <128 x i1> undef, i32 undef) - %vp_v128fi32_v128f64 = call <128 x double> @llvm.vp.sitofp.v128i32.v128double(<128 x i32> undef, <128 x i1> undef, i32 undef) - %vp_v128fi64_v128f32 = call <128 x float> @llvm.vp.sitofp.v128i64.v128float(<128 x i64> undef, <128 x i1> undef, i32 undef) - %vp_v128fi64_v128f64 = call <128 x double> @llvm.vp.sitofp.v128i64.v128double(<128 x i64> undef, <128 x i1> undef, i32 undef) - %vp_v128fi1_v128f32 = call <128 x float> @llvm.vp.sitofp.v128i1.v128float(<128 x i1> undef, <128 x i1> undef, i32 undef) - %vp_v128fi1_v128f64 = call <128 x double> @llvm.vp.sitofp.v128i1.v128double(<128 x i1> undef, <128 x i1> undef, i32 undef) + %vp_v128fi8_v128f32 = call <128 x float> @llvm.vp.sitofp.v128f32.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef) + %vp_v128fi8_v128f64 = call <128 x double> @llvm.vp.sitofp.v128f64.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef) + %vp_v128fi16_v128f32 = call <128 x float> @llvm.vp.sitofp.v128f32.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef) + %vp_v128fi16_v128f64 = call <128 x double> @llvm.vp.sitofp.v128f64.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef) + %vp_v128fi32_v128f32 = call <128 x float> @llvm.vp.sitofp.v128f32.v128i32(<128 x i32> undef, <128 x i1> undef, i32 undef) + %vp_v128fi32_v128f64 = call <128 x double> @llvm.vp.sitofp.v128f64.v128i32(<128 x i32> undef, <128 x i1> undef, i32 undef) + %vp_v128fi64_v128f32 = call <128 x float> @llvm.vp.sitofp.v128f32.v128i64(<128 x i64> undef, <128 x i1> undef, i32 undef) + %vp_v128fi64_v128f64 = call <128 x double> @llvm.vp.sitofp.v128f64.v128i64(<128 x i64> undef, <128 x i1> undef, i32 undef) + %vp_v128fi1_v128f32 = call <128 x float> @llvm.vp.sitofp.v128f32.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef) + %vp_v128fi1_v128f64 = call <128 x double> @llvm.vp.sitofp.v128f64.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef) %nxv1i8_nxv1f32 = sitofp undef to %nxv1i8_nxv1f64 = sitofp undef to @@ -5873,16 +5873,16 @@ define void @sitofp() { %nxv1i1_nxv1f32 = sitofp undef to %nxv1i1_nxv1f64 = sitofp undef to - %vp_nxv1fi8_nxv1f32 = call @llvm.vp.sitofp.nxv1i8.nxv1float( undef, undef, i32 undef) - %vp_nxv1fi8_nxv1f64 = call @llvm.vp.sitofp.nxv1i8.nxv1double( undef, undef, i32 undef) - %vp_nxv1fi16_nxv1f32 = call @llvm.vp.sitofp.nxv1i16.nxv1float( undef, undef, i32 undef) - %vp_nxv1fi16_nxv1f64 = call @llvm.vp.sitofp.nxv1i16.nxv1double( undef, undef, i32 undef) - %vp_nxv1fi32_nxv1f32 = call @llvm.vp.sitofp.nxv1i32.nxv1float( undef, undef, i32 undef) - %vp_nxv1fi32_nxv1f64 = call @llvm.vp.sitofp.nxv1i32.nxv1double( undef, undef, i32 undef) - %vp_nxv1fi64_nxv1f32 = call @llvm.vp.sitofp.nxv1i64.nxv1float( undef, undef, i32 undef) - %vp_nxv1fi64_nxv1f64 = call @llvm.vp.sitofp.nxv1i64.nxv1double( undef, undef, i32 undef) - %vp_nxv1fi1_nxv1f32 = call @llvm.vp.sitofp.nxv1i1.nxv1float( undef, undef, i32 undef) - %vp_nxv1fi1_nxv1f64 = call @llvm.vp.sitofp.nxv1i1.nxv1double( undef, undef, i32 undef) + %vp_nxv1fi8_nxv1f32 = call @llvm.vp.sitofp.nxv1f32.nxv1i8( undef, undef, i32 undef) + %vp_nxv1fi8_nxv1f64 = call @llvm.vp.sitofp.nxv1f64.nxv1i8( undef, undef, i32 undef) + %vp_nxv1fi16_nxv1f32 = call @llvm.vp.sitofp.nxv1f32.nxv1i16( undef, undef, i32 undef) + %vp_nxv1fi16_nxv1f64 = call @llvm.vp.sitofp.nxv1f64.nxv1i16( undef, undef, i32 undef) + %vp_nxv1fi32_nxv1f32 = call @llvm.vp.sitofp.nxv1f32.nxv1i32( undef, undef, i32 undef) + 
%vp_nxv1fi32_nxv1f64 = call @llvm.vp.sitofp.nxv1f64.nxv1i32( undef, undef, i32 undef) + %vp_nxv1fi64_nxv1f32 = call @llvm.vp.sitofp.nxv1f32.nxv1i64( undef, undef, i32 undef) + %vp_nxv1fi64_nxv1f64 = call @llvm.vp.sitofp.nxv1f64.nxv1i64( undef, undef, i32 undef) + %vp_nxv1fi1_nxv1f32 = call @llvm.vp.sitofp.nxv1f32.nxv1i1( undef, undef, i32 undef) + %vp_nxv1fi1_nxv1f64 = call @llvm.vp.sitofp.nxv1f64.nxv1i1( undef, undef, i32 undef) %nxv2i8_nxv2f32 = sitofp undef to %nxv2i8_nxv2f64 = sitofp undef to @@ -5895,16 +5895,16 @@ define void @sitofp() { %nxv2i1_nxv2f32 = sitofp undef to %nxv2i1_nxv2f64 = sitofp undef to - %vp_nxv2fi8_nxv2f32 = call @llvm.vp.sitofp.nxv2i8.nxv2float( undef, undef, i32 undef) - %vp_nxv2fi8_nxv2f64 = call @llvm.vp.sitofp.nxv2i8.nxv2double( undef, undef, i32 undef) - %vp_nxv2fi16_nxv2f32 = call @llvm.vp.sitofp.nxv2i16.nxv2float( undef, undef, i32 undef) - %vp_nxv2fi16_nxv2f64 = call @llvm.vp.sitofp.nxv2i16.nxv2double( undef, undef, i32 undef) - %vp_nxv2fi32_nxv2f32 = call @llvm.vp.sitofp.nxv2i32.nxv2float( undef, undef, i32 undef) - %vp_nxv2fi32_nxv2f64 = call @llvm.vp.sitofp.nxv2i32.nxv2double( undef, undef, i32 undef) - %vp_nxv2fi64_nxv2f32 = call @llvm.vp.sitofp.nxv2i64.nxv2float( undef, undef, i32 undef) - %vp_nxv2fi64_nxv2f64 = call @llvm.vp.sitofp.nxv2i64.nxv2double( undef, undef, i32 undef) - %vp_nxv2fi1_nxv2f32 = call @llvm.vp.sitofp.nxv2i1.nxv2float( undef, undef, i32 undef) - %vp_nxv2fi1_nxv2f64 = call @llvm.vp.sitofp.nxv2i1.nxv2double( undef, undef, i32 undef) + %vp_nxv2fi8_nxv2f32 = call @llvm.vp.sitofp.nxv2f32.nxv2i8( undef, undef, i32 undef) + %vp_nxv2fi8_nxv2f64 = call @llvm.vp.sitofp.nxv2f64.nxv2i8( undef, undef, i32 undef) + %vp_nxv2fi16_nxv2f32 = call @llvm.vp.sitofp.nxv2f32.nxv2i16( undef, undef, i32 undef) + %vp_nxv2fi16_nxv2f64 = call @llvm.vp.sitofp.nxv2f64.nxv2i16( undef, undef, i32 undef) + %vp_nxv2fi32_nxv2f32 = call @llvm.vp.sitofp.nxv2f32.nxv2i32( undef, undef, i32 undef) + %vp_nxv2fi32_nxv2f64 = call @llvm.vp.sitofp.nxv2f64.nxv2i32( undef, undef, i32 undef) + %vp_nxv2fi64_nxv2f32 = call @llvm.vp.sitofp.nxv2f32.nxv2i64( undef, undef, i32 undef) + %vp_nxv2fi64_nxv2f64 = call @llvm.vp.sitofp.nxv2f64.nxv2i64( undef, undef, i32 undef) + %vp_nxv2fi1_nxv2f32 = call @llvm.vp.sitofp.nxv2f32.nxv2i1( undef, undef, i32 undef) + %vp_nxv2fi1_nxv2f64 = call @llvm.vp.sitofp.nxv2f64.nxv2i1( undef, undef, i32 undef) %nxv4i8_nxv4f32 = sitofp undef to %nxv4i8_nxv4f64 = sitofp undef to @@ -5917,16 +5917,16 @@ define void @sitofp() { %nxv4i1_nxv4f32 = sitofp undef to %nxv4i1_nxv4f64 = sitofp undef to - %vp_nxv4fi8_nxv4f32 = call @llvm.vp.sitofp.nxv4i8.nxv4float( undef, undef, i32 undef) - %vp_nxv4fi8_nxv4f64 = call @llvm.vp.sitofp.nxv4i8.nxv4double( undef, undef, i32 undef) - %vp_nxv4fi16_nxv4f32 = call @llvm.vp.sitofp.nxv4i16.nxv4float( undef, undef, i32 undef) - %vp_nxv4fi16_nxv4f64 = call @llvm.vp.sitofp.nxv4i16.nxv4double( undef, undef, i32 undef) - %vp_nxv4fi32_nxv4f32 = call @llvm.vp.sitofp.nxv4i32.nxv4float( undef, undef, i32 undef) - %vp_nxv4fi32_nxv4f64 = call @llvm.vp.sitofp.nxv4i32.nxv4double( undef, undef, i32 undef) - %vp_nxv4fi64_nxv4f32 = call @llvm.vp.sitofp.nxv4i64.nxv4float( undef, undef, i32 undef) - %vp_nxv4fi64_nxv4f64 = call @llvm.vp.sitofp.nxv4i64.nxv4double( undef, undef, i32 undef) - %vp_nxv4fi1_nxv4f32 = call @llvm.vp.sitofp.nxv4i1.nxv4float( undef, undef, i32 undef) - %vp_nxv4fi1_nxv4f64 = call @llvm.vp.sitofp.nxv4i1.nxv4double( undef, undef, i32 undef) + %vp_nxv4fi8_nxv4f32 = call @llvm.vp.sitofp.nxv4f32.nxv4i8( undef, undef, i32 undef) 
+ %vp_nxv4fi8_nxv4f64 = call @llvm.vp.sitofp.nxv4f64.nxv4i8( undef, undef, i32 undef) + %vp_nxv4fi16_nxv4f32 = call @llvm.vp.sitofp.nxv4f32.nxv4i16( undef, undef, i32 undef) + %vp_nxv4fi16_nxv4f64 = call @llvm.vp.sitofp.nxv4f64.nxv4i16( undef, undef, i32 undef) + %vp_nxv4fi32_nxv4f32 = call @llvm.vp.sitofp.nxv4f32.nxv4i32( undef, undef, i32 undef) + %vp_nxv4fi32_nxv4f64 = call @llvm.vp.sitofp.nxv4f64.nxv4i32( undef, undef, i32 undef) + %vp_nxv4fi64_nxv4f32 = call @llvm.vp.sitofp.nxv4f32.nxv4i64( undef, undef, i32 undef) + %vp_nxv4fi64_nxv4f64 = call @llvm.vp.sitofp.nxv4f64.nxv4i64( undef, undef, i32 undef) + %vp_nxv4fi1_nxv4f32 = call @llvm.vp.sitofp.nxv4f32.nxv4i1( undef, undef, i32 undef) + %vp_nxv4fi1_nxv4f64 = call @llvm.vp.sitofp.nxv4f64.nxv4i1( undef, undef, i32 undef) %nxv8i8_nxv8f32 = sitofp undef to %nxv8i8_nxv8f64 = sitofp undef to @@ -5939,16 +5939,16 @@ define void @sitofp() { %nxv8i1_nxv8f32 = sitofp undef to %nxv8i1_nxv8f64 = sitofp undef to - %vp_nxv8fi8_nxv8f32 = call @llvm.vp.sitofp.nxv8i8.nxv8float( undef, undef, i32 undef) - %vp_nxv8fi8_nxv8f64 = call @llvm.vp.sitofp.nxv8i8.nxv8double( undef, undef, i32 undef) - %vp_nxv8fi16_nxv8f32 = call @llvm.vp.sitofp.nxv8i16.nxv8float( undef, undef, i32 undef) - %vp_nxv8fi16_nxv8f64 = call @llvm.vp.sitofp.nxv8i16.nxv8double( undef, undef, i32 undef) - %vp_nxv8fi32_nxv8f32 = call @llvm.vp.sitofp.nxv8i32.nxv8float( undef, undef, i32 undef) - %vp_nxv8fi32_nxv8f64 = call @llvm.vp.sitofp.nxv8i32.nxv8double( undef, undef, i32 undef) - %vp_nxv8fi64_nxv8f32 = call @llvm.vp.sitofp.nxv8i64.nxv8float( undef, undef, i32 undef) - %vp_nxv8fi64_nxv8f64 = call @llvm.vp.sitofp.nxv8i64.nxv8double( undef, undef, i32 undef) - %vp_nxv8fi1_nxv8f32 = call @llvm.vp.sitofp.nxv8i1.nxv8float( undef, undef, i32 undef) - %vp_nxv8fi1_nxv8f64 = call @llvm.vp.sitofp.nxv8i1.nxv8double( undef, undef, i32 undef) + %vp_nxv8fi8_nxv8f32 = call @llvm.vp.sitofp.nxv8f32.nxv8i8( undef, undef, i32 undef) + %vp_nxv8fi8_nxv8f64 = call @llvm.vp.sitofp.nxv8f64.nxv8i8( undef, undef, i32 undef) + %vp_nxv8fi16_nxv8f32 = call @llvm.vp.sitofp.nxv8f32.nxv8i16( undef, undef, i32 undef) + %vp_nxv8fi16_nxv8f64 = call @llvm.vp.sitofp.nxv8f64.nxv8i16( undef, undef, i32 undef) + %vp_nxv8fi32_nxv8f32 = call @llvm.vp.sitofp.nxv8f32.nxv8i32( undef, undef, i32 undef) + %vp_nxv8fi32_nxv8f64 = call @llvm.vp.sitofp.nxv8f64.nxv8i32( undef, undef, i32 undef) + %vp_nxv8fi64_nxv8f32 = call @llvm.vp.sitofp.nxv8f32.nxv8i64( undef, undef, i32 undef) + %vp_nxv8fi64_nxv8f64 = call @llvm.vp.sitofp.nxv8f64.nxv8i64( undef, undef, i32 undef) + %vp_nxv8fi1_nxv8f32 = call @llvm.vp.sitofp.nxv8f32.nxv8i1( undef, undef, i32 undef) + %vp_nxv8fi1_nxv8f64 = call @llvm.vp.sitofp.nxv8f64.nxv8i1( undef, undef, i32 undef) %nxv16i8_nxv16f32 = sitofp undef to %nxv16i8_nxv16f64 = sitofp undef to @@ -5961,16 +5961,16 @@ define void @sitofp() { %nxv16i1_nxv16f32 = sitofp undef to %nxv16i1_nxv16f64 = sitofp undef to - %vp_nxv16fi8_nxv16f32 = call @llvm.vp.sitofp.nxv16i8.nxv16float( undef, undef, i32 undef) - %vp_nxv16fi8_nxv16f64 = call @llvm.vp.sitofp.nxv16i8.nxv16double( undef, undef, i32 undef) - %vp_nxv16fi16_nxv16f32 = call @llvm.vp.sitofp.nxv16i16.nxv16float( undef, undef, i32 undef) - %vp_nxv16fi16_nxv16f64 = call @llvm.vp.sitofp.nxv16i16.nxv16double( undef, undef, i32 undef) - %vp_nxv16fi32_nxv16f32 = call @llvm.vp.sitofp.nxv16i32.nxv16float( undef, undef, i32 undef) - %vp_nxv16fi32_nxv16f64 = call @llvm.vp.sitofp.nxv16i32.nxv16double( undef, undef, i32 undef) - %vp_nxv16fi64_nxv16f32 = call 
@llvm.vp.sitofp.nxv16i64.nxv16float( undef, undef, i32 undef) - %vp_nxv16fi64_nxv16f64 = call @llvm.vp.sitofp.nxv16i64.nxv16double( undef, undef, i32 undef) - %vp_nxv16fi1_nxv16f32 = call @llvm.vp.sitofp.nxv16i1.nxv16float( undef, undef, i32 undef) - %vp_nxv16fi1_nxv16f64 = call @llvm.vp.sitofp.nxv16i1.nxv16double( undef, undef, i32 undef) + %vp_nxv16fi8_nxv16f32 = call @llvm.vp.sitofp.nxv16f32.nxv16i8( undef, undef, i32 undef) + %vp_nxv16fi8_nxv16f64 = call @llvm.vp.sitofp.nxv16f64.nxv16i8( undef, undef, i32 undef) + %vp_nxv16fi16_nxv16f32 = call @llvm.vp.sitofp.nxv16f32.nxv16i16( undef, undef, i32 undef) + %vp_nxv16fi16_nxv16f64 = call @llvm.vp.sitofp.nxv16f64.nxv16i16( undef, undef, i32 undef) + %vp_nxv16fi32_nxv16f32 = call @llvm.vp.sitofp.nxv16f32.nxv16i32( undef, undef, i32 undef) + %vp_nxv16fi32_nxv16f64 = call @llvm.vp.sitofp.nxv16f64.nxv16i32( undef, undef, i32 undef) + %vp_nxv16fi64_nxv16f32 = call @llvm.vp.sitofp.nxv16f32.nxv16i64( undef, undef, i32 undef) + %vp_nxv16fi64_nxv16f64 = call @llvm.vp.sitofp.nxv16f64.nxv16i64( undef, undef, i32 undef) + %vp_nxv16fi1_nxv16f32 = call @llvm.vp.sitofp.nxv16f32.nxv16i1( undef, undef, i32 undef) + %vp_nxv16fi1_nxv16f64 = call @llvm.vp.sitofp.nxv16f64.nxv16i1( undef, undef, i32 undef) %nxv32i8_nxv32f32 = sitofp undef to %nxv32i8_nxv32f64 = sitofp undef to @@ -5983,16 +5983,16 @@ define void @sitofp() { %nxv32i1_nxv32f32 = sitofp undef to %nxv32i1_nxv32f64 = sitofp undef to - %vp_nxv32fi8_nxv32f32 = call @llvm.vp.sitofp.nxv32i8.nxv32float( undef, undef, i32 undef) - %vp_nxv32fi8_nxv32f64 = call @llvm.vp.sitofp.nxv32i8.nxv32double( undef, undef, i32 undef) - %vp_nxv32fi16_nxv32f32 = call @llvm.vp.sitofp.nxv32i16.nxv32float( undef, undef, i32 undef) - %vp_nxv32fi16_nxv32f64 = call @llvm.vp.sitofp.nxv32i16.nxv32double( undef, undef, i32 undef) - %vp_nxv32fi32_nxv32f32 = call @llvm.vp.sitofp.nxv32i32.nxv32float( undef, undef, i32 undef) - %vp_nxv32fi32_nxv32f64 = call @llvm.vp.sitofp.nxv32i32.nxv32double( undef, undef, i32 undef) - %vp_nxv32fi64_nxv32f32 = call @llvm.vp.sitofp.nxv32i64.nxv32float( undef, undef, i32 undef) - %vp_nxv32fi64_nxv32f64 = call @llvm.vp.sitofp.nxv32i64.nxv32double( undef, undef, i32 undef) - %vp_nxv32fi1_nxv32f32 = call @llvm.vp.sitofp.nxv32i1.nxv32float( undef, undef, i32 undef) - %vp_nxv32fi1_nxv32f64 = call @llvm.vp.sitofp.nxv32i1.nxv32double( undef, undef, i32 undef) + %vp_nxv32fi8_nxv32f32 = call @llvm.vp.sitofp.nxv32f32.nxv32i8( undef, undef, i32 undef) + %vp_nxv32fi8_nxv32f64 = call @llvm.vp.sitofp.nxv32f64.nxv32i8( undef, undef, i32 undef) + %vp_nxv32fi16_nxv32f32 = call @llvm.vp.sitofp.nxv32f32.nxv32i16( undef, undef, i32 undef) + %vp_nxv32fi16_nxv32f64 = call @llvm.vp.sitofp.nxv32f64.nxv32i16( undef, undef, i32 undef) + %vp_nxv32fi32_nxv32f32 = call @llvm.vp.sitofp.nxv32f32.nxv32i32( undef, undef, i32 undef) + %vp_nxv32fi32_nxv32f64 = call @llvm.vp.sitofp.nxv32f64.nxv32i32( undef, undef, i32 undef) + %vp_nxv32fi64_nxv32f32 = call @llvm.vp.sitofp.nxv32f32.nxv32i64( undef, undef, i32 undef) + %vp_nxv32fi64_nxv32f64 = call @llvm.vp.sitofp.nxv32f64.nxv32i64( undef, undef, i32 undef) + %vp_nxv32fi1_nxv32f32 = call @llvm.vp.sitofp.nxv32f32.nxv32i1( undef, undef, i32 undef) + %vp_nxv32fi1_nxv32f64 = call @llvm.vp.sitofp.nxv32f64.nxv32i1( undef, undef, i32 undef) %nxv64i8_nxv64f32 = sitofp undef to %nxv64i8_nxv64f64 = sitofp undef to @@ -6005,16 +6005,16 @@ define void @sitofp() { %nxv64i1_nxv64f32 = sitofp undef to %nxv64i1_nxv64f64 = sitofp undef to - %vp_nxv64fi8_nxv64f32 = call @llvm.vp.sitofp.nxv64i8.nxv64float( 
undef, undef, i32 undef) - %vp_nxv64fi8_nxv64f64 = call @llvm.vp.sitofp.nxv64i8.nxv64double( undef, undef, i32 undef) - %vp_nxv64fi16_nxv64f32 = call @llvm.vp.sitofp.nxv64i16.nxv64float( undef, undef, i32 undef) - %vp_nxv64fi16_nxv64f64 = call @llvm.vp.sitofp.nxv64i16.nxv64double( undef, undef, i32 undef) - %vp_nxv64fi32_nxv64f32 = call @llvm.vp.sitofp.nxv64i32.nxv64float( undef, undef, i32 undef) - %vp_nxv64fi32_nxv64f64 = call @llvm.vp.sitofp.nxv64i32.nxv64double( undef, undef, i32 undef) - %vp_nxv64fi64_nxv64f32 = call @llvm.vp.sitofp.nxv64i64.nxv64float( undef, undef, i32 undef) - %vp_nxv64fi64_nxv64f64 = call @llvm.vp.sitofp.nxv64i64.nxv64double( undef, undef, i32 undef) - %vp_nxv64fi1_nxv64f32 = call @llvm.vp.sitofp.nxv64i1.nxv64float( undef, undef, i32 undef) - %vp_nxv64fi1_nxv64f64 = call @llvm.vp.sitofp.nxv64i1.nxv64double( undef, undef, i32 undef) + %vp_nxv64fi8_nxv64f32 = call @llvm.vp.sitofp.nxv64f32.nxv64i8( undef, undef, i32 undef) + %vp_nxv64fi8_nxv64f64 = call @llvm.vp.sitofp.nxv64f64.nxv64i8( undef, undef, i32 undef) + %vp_nxv64fi16_nxv64f32 = call @llvm.vp.sitofp.nxv64f32.nxv64i16( undef, undef, i32 undef) + %vp_nxv64fi16_nxv64f64 = call @llvm.vp.sitofp.nxv64f64.nxv64i16( undef, undef, i32 undef) + %vp_nxv64fi32_nxv64f32 = call @llvm.vp.sitofp.nxv64f32.nxv64i32( undef, undef, i32 undef) + %vp_nxv64fi32_nxv64f64 = call @llvm.vp.sitofp.nxv64f64.nxv64i32( undef, undef, i32 undef) + %vp_nxv64fi64_nxv64f32 = call @llvm.vp.sitofp.nxv64f32.nxv64i64( undef, undef, i32 undef) + %vp_nxv64fi64_nxv64f64 = call @llvm.vp.sitofp.nxv64f64.nxv64i64( undef, undef, i32 undef) + %vp_nxv64fi1_nxv64f32 = call @llvm.vp.sitofp.nxv64f32.nxv64i1( undef, undef, i32 undef) + %vp_nxv64fi1_nxv64f64 = call @llvm.vp.sitofp.nxv64f64.nxv64i1( undef, undef, i32 undef) ret void } @@ -6597,16 +6597,16 @@ define void @uitofp() { %v2i1_v2f32 = uitofp <2 x i1> undef to <2 x float> %v2i1_v2f64 = uitofp <2 x i1> undef to <2 x double> - %vp_v2fi8_v2f32 = call <2 x float> @llvm.vp.uitofp.v2i8.v2float(<2 x i8> undef, <2 x i1> undef, i32 undef) - %vp_v2fi8_v2f64 = call <2 x double> @llvm.vp.uitofp.v2i8.v2double(<2 x i8> undef, <2 x i1> undef, i32 undef) - %vp_v2fi16_v2f32 = call <2 x float> @llvm.vp.uitofp.v2i16.v2float(<2 x i16> undef, <2 x i1> undef, i32 undef) - %vp_v2fi16_v2f64 = call <2 x double> @llvm.vp.uitofp.v2i16.v2double(<2 x i16> undef, <2 x i1> undef, i32 undef) - %vp_v2fi32_v2f32 = call <2 x float> @llvm.vp.uitofp.v2i32.v2float(<2 x i32> undef, <2 x i1> undef, i32 undef) - %vp_v2fi32_v2f64 = call <2 x double> @llvm.vp.uitofp.v2i32.v2double(<2 x i32> undef, <2 x i1> undef, i32 undef) - %vp_v2fi64_v2f32 = call <2 x float> @llvm.vp.uitofp.v2i64.v2float(<2 x i64> undef, <2 x i1> undef, i32 undef) - %vp_v2fi64_v2f64 = call <2 x double> @llvm.vp.uitofp.v2i64.v2double(<2 x i64> undef, <2 x i1> undef, i32 undef) - %vp_v2fi1_v2f32 = call <2 x float> @llvm.vp.uitofp.v2i1.v2float(<2 x i1> undef, <2 x i1> undef, i32 undef) - %vp_v2fi1_v2f64 = call <2 x double> @llvm.vp.uitofp.v2i1.v2double(<2 x i1> undef, <2 x i1> undef, i32 undef) + %vp_v2fi8_v2f32 = call <2 x float> @llvm.vp.uitofp.v2f32.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef) + %vp_v2fi8_v2f64 = call <2 x double> @llvm.vp.uitofp.v2f64.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef) + %vp_v2fi16_v2f32 = call <2 x float> @llvm.vp.uitofp.v2f32.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef) + %vp_v2fi16_v2f64 = call <2 x double> @llvm.vp.uitofp.v2f64.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef) + %vp_v2fi32_v2f32 = call <2 x float> 
@llvm.vp.uitofp.v2f32.v2i32(<2 x i32> undef, <2 x i1> undef, i32 undef) + %vp_v2fi32_v2f64 = call <2 x double> @llvm.vp.uitofp.v2f64.v2i32(<2 x i32> undef, <2 x i1> undef, i32 undef) + %vp_v2fi64_v2f32 = call <2 x float> @llvm.vp.uitofp.v2f32.v2i64(<2 x i64> undef, <2 x i1> undef, i32 undef) + %vp_v2fi64_v2f64 = call <2 x double> @llvm.vp.uitofp.v2f64.v2i64(<2 x i64> undef, <2 x i1> undef, i32 undef) + %vp_v2fi1_v2f32 = call <2 x float> @llvm.vp.uitofp.v2f32.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef) + %vp_v2fi1_v2f64 = call <2 x double> @llvm.vp.uitofp.v2f64.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef) %v4i8_v4f32 = uitofp <4 x i8> undef to <4 x float> %v4i8_v4f64 = uitofp <4 x i8> undef to <4 x double> @@ -6619,16 +6619,16 @@ define void @uitofp() { %v4i1_v4f32 = uitofp <4 x i1> undef to <4 x float> %v4i1_v4f64 = uitofp <4 x i1> undef to <4 x double> - %vp_v4fi8_v4f32 = call <4 x float> @llvm.vp.uitofp.v4i8.v4float(<4 x i8> undef, <4 x i1> undef, i32 undef) - %vp_v4fi8_v4f64 = call <4 x double> @llvm.vp.uitofp.v4i8.v4double(<4 x i8> undef, <4 x i1> undef, i32 undef) - %vp_v4fi16_v4f32 = call <4 x float> @llvm.vp.uitofp.v4i16.v4float(<4 x i16> undef, <4 x i1> undef, i32 undef) - %vp_v4fi16_v4f64 = call <4 x double> @llvm.vp.uitofp.v4i16.v4double(<4 x i16> undef, <4 x i1> undef, i32 undef) - %vp_v4fi32_v4f32 = call <4 x float> @llvm.vp.uitofp.v4i32.v4float(<4 x i32> undef, <4 x i1> undef, i32 undef) - %vp_v4fi32_v4f64 = call <4 x double> @llvm.vp.uitofp.v4i32.v4double(<4 x i32> undef, <4 x i1> undef, i32 undef) - %vp_v4fi64_v4f32 = call <4 x float> @llvm.vp.uitofp.v4i64.v4float(<4 x i64> undef, <4 x i1> undef, i32 undef) - %vp_v4fi64_v4f64 = call <4 x double> @llvm.vp.uitofp.v4i64.v4double(<4 x i64> undef, <4 x i1> undef, i32 undef) - %vp_v4fi1_v4f32 = call <4 x float> @llvm.vp.uitofp.v4i1.v4float(<4 x i1> undef, <4 x i1> undef, i32 undef) - %vp_v4fi1_v4f64 = call <4 x double> @llvm.vp.uitofp.v4i1.v4double(<4 x i1> undef, <4 x i1> undef, i32 undef) + %vp_v4fi8_v4f32 = call <4 x float> @llvm.vp.uitofp.v4f32.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef) + %vp_v4fi8_v4f64 = call <4 x double> @llvm.vp.uitofp.v4f64.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef) + %vp_v4fi16_v4f32 = call <4 x float> @llvm.vp.uitofp.v4f32.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef) + %vp_v4fi16_v4f64 = call <4 x double> @llvm.vp.uitofp.v4f64.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef) + %vp_v4fi32_v4f32 = call <4 x float> @llvm.vp.uitofp.v4f32.v4i32(<4 x i32> undef, <4 x i1> undef, i32 undef) + %vp_v4fi32_v4f64 = call <4 x double> @llvm.vp.uitofp.v4f64.v4i32(<4 x i32> undef, <4 x i1> undef, i32 undef) + %vp_v4fi64_v4f32 = call <4 x float> @llvm.vp.uitofp.v4f32.v4i64(<4 x i64> undef, <4 x i1> undef, i32 undef) + %vp_v4fi64_v4f64 = call <4 x double> @llvm.vp.uitofp.v4f64.v4i64(<4 x i64> undef, <4 x i1> undef, i32 undef) + %vp_v4fi1_v4f32 = call <4 x float> @llvm.vp.uitofp.v4f32.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef) + %vp_v4fi1_v4f64 = call <4 x double> @llvm.vp.uitofp.v4f64.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef) %v8i8_v8f32 = uitofp <8 x i8> undef to <8 x float> %v8i8_v8f64 = uitofp <8 x i8> undef to <8 x double> @@ -6641,16 +6641,16 @@ define void @uitofp() { %v8i1_v8f32 = uitofp <8 x i1> undef to <8 x float> %v8i1_v8f64 = uitofp <8 x i1> undef to <8 x double> - %vp_v8fi8_v8f32 = call <8 x float> @llvm.vp.uitofp.v8i8.v8float(<8 x i8> undef, <8 x i1> undef, i32 undef) - %vp_v8fi8_v8f64 = call <8 x double> @llvm.vp.uitofp.v8i8.v8double(<8 x i8> undef, <8 x i1> undef, i32 
undef) - %vp_v8fi16_v8f32 = call <8 x float> @llvm.vp.uitofp.v8i16.v8float(<8 x i16> undef, <8 x i1> undef, i32 undef) - %vp_v8fi16_v8f64 = call <8 x double> @llvm.vp.uitofp.v8i16.v8double(<8 x i16> undef, <8 x i1> undef, i32 undef) - %vp_v8fi32_v8f32 = call <8 x float> @llvm.vp.uitofp.v8i32.v8float(<8 x i32> undef, <8 x i1> undef, i32 undef) - %vp_v8fi32_v8f64 = call <8 x double> @llvm.vp.uitofp.v8i32.v8double(<8 x i32> undef, <8 x i1> undef, i32 undef) - %vp_v8fi64_v8f32 = call <8 x float> @llvm.vp.uitofp.v8i64.v8float(<8 x i64> undef, <8 x i1> undef, i32 undef) - %vp_v8fi64_v8f64 = call <8 x double> @llvm.vp.uitofp.v8i64.v8double(<8 x i64> undef, <8 x i1> undef, i32 undef) - %vp_v8fi1_v8f32 = call <8 x float> @llvm.vp.uitofp.v8i1.v8float(<8 x i1> undef, <8 x i1> undef, i32 undef) - %vp_v8fi1_v8f64 = call <8 x double> @llvm.vp.uitofp.v8i1.v8double(<8 x i1> undef, <8 x i1> undef, i32 undef) + %vp_v8fi8_v8f32 = call <8 x float> @llvm.vp.uitofp.v8f32.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef) + %vp_v8fi8_v8f64 = call <8 x double> @llvm.vp.uitofp.v8f64.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef) + %vp_v8fi16_v8f32 = call <8 x float> @llvm.vp.uitofp.v8f32.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef) + %vp_v8fi16_v8f64 = call <8 x double> @llvm.vp.uitofp.v8f64.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef) + %vp_v8fi32_v8f32 = call <8 x float> @llvm.vp.uitofp.v8f32.v8i32(<8 x i32> undef, <8 x i1> undef, i32 undef) + %vp_v8fi32_v8f64 = call <8 x double> @llvm.vp.uitofp.v8f64.v8i32(<8 x i32> undef, <8 x i1> undef, i32 undef) + %vp_v8fi64_v8f32 = call <8 x float> @llvm.vp.uitofp.v8f32.v8i64(<8 x i64> undef, <8 x i1> undef, i32 undef) + %vp_v8fi64_v8f64 = call <8 x double> @llvm.vp.uitofp.v8f64.v8i64(<8 x i64> undef, <8 x i1> undef, i32 undef) + %vp_v8fi1_v8f32 = call <8 x float> @llvm.vp.uitofp.v8f32.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef) + %vp_v8fi1_v8f64 = call <8 x double> @llvm.vp.uitofp.v8f64.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef) %v16i8_v16f32 = uitofp <16 x i8> undef to <16 x float> %v16i8_v16f64 = uitofp <16 x i8> undef to <16 x double> @@ -6663,16 +6663,16 @@ define void @uitofp() { %v16i1_v16f32 = uitofp <16 x i1> undef to <16 x float> %v16i1_v16f64 = uitofp <16 x i1> undef to <16 x double> - %vp_v16fi8_v16f32 = call <16 x float> @llvm.vp.uitofp.v16i8.v16float(<16 x i8> undef, <16 x i1> undef, i32 undef) - %vp_v16fi8_v16f64 = call <16 x double> @llvm.vp.uitofp.v16i8.v16double(<16 x i8> undef, <16 x i1> undef, i32 undef) - %vp_v16fi16_v16f32 = call <16 x float> @llvm.vp.uitofp.v16i16.v16float(<16 x i16> undef, <16 x i1> undef, i32 undef) - %vp_v16fi16_v16f64 = call <16 x double> @llvm.vp.uitofp.v16i16.v16double(<16 x i16> undef, <16 x i1> undef, i32 undef) - %vp_v16fi32_v16f32 = call <16 x float> @llvm.vp.uitofp.v16i32.v16float(<16 x i32> undef, <16 x i1> undef, i32 undef) - %vp_v16fi32_v16f64 = call <16 x double> @llvm.vp.uitofp.v16i32.v16double(<16 x i32> undef, <16 x i1> undef, i32 undef) - %vp_v16fi64_v16f32 = call <16 x float> @llvm.vp.uitofp.v16i64.v16float(<16 x i64> undef, <16 x i1> undef, i32 undef) - %vp_v16fi64_v16f64 = call <16 x double> @llvm.vp.uitofp.v16i64.v16double(<16 x i64> undef, <16 x i1> undef, i32 undef) - %vp_v16fi1_v16f32 = call <16 x float> @llvm.vp.uitofp.v16i1.v16float(<16 x i1> undef, <16 x i1> undef, i32 undef) - %vp_v16fi1_v16f64 = call <16 x double> @llvm.vp.uitofp.v16i1.v16double(<16 x i1> undef, <16 x i1> undef, i32 undef) + %vp_v16fi8_v16f32 = call <16 x float> @llvm.vp.uitofp.v16f32.v16i8(<16 x i8> undef, <16 x i1> 
undef, i32 undef) + %vp_v16fi8_v16f64 = call <16 x double> @llvm.vp.uitofp.v16f64.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef) + %vp_v16fi16_v16f32 = call <16 x float> @llvm.vp.uitofp.v16f32.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef) + %vp_v16fi16_v16f64 = call <16 x double> @llvm.vp.uitofp.v16f64.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef) + %vp_v16fi32_v16f32 = call <16 x float> @llvm.vp.uitofp.v16f32.v16i32(<16 x i32> undef, <16 x i1> undef, i32 undef) + %vp_v16fi32_v16f64 = call <16 x double> @llvm.vp.uitofp.v16f64.v16i32(<16 x i32> undef, <16 x i1> undef, i32 undef) + %vp_v16fi64_v16f32 = call <16 x float> @llvm.vp.uitofp.v16f32.v16i64(<16 x i64> undef, <16 x i1> undef, i32 undef) + %vp_v16fi64_v16f64 = call <16 x double> @llvm.vp.uitofp.v16f64.v16i64(<16 x i64> undef, <16 x i1> undef, i32 undef) + %vp_v16fi1_v16f32 = call <16 x float> @llvm.vp.uitofp.v16f32.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef) + %vp_v16fi1_v16f64 = call <16 x double> @llvm.vp.uitofp.v16f64.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef) %v32i8_v32f32 = uitofp <32 x i8> undef to <32 x float> %v32i8_v32f64 = uitofp <32 x i8> undef to <32 x double> @@ -6685,16 +6685,16 @@ define void @uitofp() { %v32i1_v32f32 = uitofp <32 x i1> undef to <32 x float> %v32i1_v32f64 = uitofp <32 x i1> undef to <32 x double> - %vp_v32fi8_v32f32 = call <32 x float> @llvm.vp.uitofp.v32i8.v32float(<32 x i8> undef, <32 x i1> undef, i32 undef) - %vp_v32fi8_v32f64 = call <32 x double> @llvm.vp.uitofp.v32i8.v32double(<32 x i8> undef, <32 x i1> undef, i32 undef) - %vp_v32fi16_v32f32 = call <32 x float> @llvm.vp.uitofp.v32i16.v32float(<32 x i16> undef, <32 x i1> undef, i32 undef) - %vp_v32fi16_v32f64 = call <32 x double> @llvm.vp.uitofp.v32i16.v32double(<32 x i16> undef, <32 x i1> undef, i32 undef) - %vp_v32fi32_v32f32 = call <32 x float> @llvm.vp.uitofp.v32i32.v32float(<32 x i32> undef, <32 x i1> undef, i32 undef) - %vp_v32fi32_v32f64 = call <32 x double> @llvm.vp.uitofp.v32i32.v32double(<32 x i32> undef, <32 x i1> undef, i32 undef) - %vp_v32fi64_v32f32 = call <32 x float> @llvm.vp.uitofp.v32i64.v32float(<32 x i64> undef, <32 x i1> undef, i32 undef) - %vp_v32fi64_v32f64 = call <32 x double> @llvm.vp.uitofp.v32i64.v32double(<32 x i64> undef, <32 x i1> undef, i32 undef) - %vp_v32fi1_v32f32 = call <32 x float> @llvm.vp.uitofp.v32i1.v32float(<32 x i1> undef, <32 x i1> undef, i32 undef) - %vp_v32fi1_v32f64 = call <32 x double> @llvm.vp.uitofp.v32i1.v32double(<32 x i1> undef, <32 x i1> undef, i32 undef) + %vp_v32fi8_v32f32 = call <32 x float> @llvm.vp.uitofp.v32f32.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef) + %vp_v32fi8_v32f64 = call <32 x double> @llvm.vp.uitofp.v32f64.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef) + %vp_v32fi16_v32f32 = call <32 x float> @llvm.vp.uitofp.v32f32.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef) + %vp_v32fi16_v32f64 = call <32 x double> @llvm.vp.uitofp.v32f64.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef) + %vp_v32fi32_v32f32 = call <32 x float> @llvm.vp.uitofp.v32f32.v32i32(<32 x i32> undef, <32 x i1> undef, i32 undef) + %vp_v32fi32_v32f64 = call <32 x double> @llvm.vp.uitofp.v32f64.v32i32(<32 x i32> undef, <32 x i1> undef, i32 undef) + %vp_v32fi64_v32f32 = call <32 x float> @llvm.vp.uitofp.v32f32.v32i64(<32 x i64> undef, <32 x i1> undef, i32 undef) + %vp_v32fi64_v32f64 = call <32 x double> @llvm.vp.uitofp.v32f64.v32i64(<32 x i64> undef, <32 x i1> undef, i32 undef) + %vp_v32fi1_v32f32 = call <32 x float> @llvm.vp.uitofp.v32f32.v32i1(<32 x i1> undef, <32 x i1> 
undef, i32 undef) + %vp_v32fi1_v32f64 = call <32 x double> @llvm.vp.uitofp.v32f64.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef) %v64i8_v64f32 = uitofp <64 x i8> undef to <64 x float> %v64i8_v64f64 = uitofp <64 x i8> undef to <64 x double> @@ -6707,16 +6707,16 @@ define void @uitofp() { %v64i1_v64f32 = uitofp <64 x i1> undef to <64 x float> %v64i1_v64f64 = uitofp <64 x i1> undef to <64 x double> - %vp_v64fi8_v64f32 = call <64 x float> @llvm.vp.uitofp.v64i8.v64float(<64 x i8> undef, <64 x i1> undef, i32 undef) - %vp_v64fi8_v64f64 = call <64 x double> @llvm.vp.uitofp.v64i8.v64double(<64 x i8> undef, <64 x i1> undef, i32 undef) - %vp_v64fi16_v64f32 = call <64 x float> @llvm.vp.uitofp.v64i16.v64float(<64 x i16> undef, <64 x i1> undef, i32 undef) - %vp_v64fi16_v64f64 = call <64 x double> @llvm.vp.uitofp.v64i16.v64double(<64 x i16> undef, <64 x i1> undef, i32 undef) - %vp_v64fi32_v64f32 = call <64 x float> @llvm.vp.uitofp.v64i32.v64float(<64 x i32> undef, <64 x i1> undef, i32 undef) - %vp_v64fi32_v64f64 = call <64 x double> @llvm.vp.uitofp.v64i32.v64double(<64 x i32> undef, <64 x i1> undef, i32 undef) - %vp_v64fi64_v64f32 = call <64 x float> @llvm.vp.uitofp.v64i64.v64float(<64 x i64> undef, <64 x i1> undef, i32 undef) - %vp_v64fi64_v64f64 = call <64 x double> @llvm.vp.uitofp.v64i64.v64double(<64 x i64> undef, <64 x i1> undef, i32 undef) - %vp_v64fi1_v64f32 = call <64 x float> @llvm.vp.uitofp.v64i1.v64float(<64 x i1> undef, <64 x i1> undef, i32 undef) - %vp_v64fi1_v64f64 = call <64 x double> @llvm.vp.uitofp.v64i1.v64double(<64 x i1> undef, <64 x i1> undef, i32 undef) + %vp_v64fi8_v64f32 = call <64 x float> @llvm.vp.uitofp.v64f32.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef) + %vp_v64fi8_v64f64 = call <64 x double> @llvm.vp.uitofp.v64f64.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef) + %vp_v64fi16_v64f32 = call <64 x float> @llvm.vp.uitofp.v64f32.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef) + %vp_v64fi16_v64f64 = call <64 x double> @llvm.vp.uitofp.v64f64.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef) + %vp_v64fi32_v64f32 = call <64 x float> @llvm.vp.uitofp.v64f32.v64i32(<64 x i32> undef, <64 x i1> undef, i32 undef) + %vp_v64fi32_v64f64 = call <64 x double> @llvm.vp.uitofp.v64f64.v64i32(<64 x i32> undef, <64 x i1> undef, i32 undef) + %vp_v64fi64_v64f32 = call <64 x float> @llvm.vp.uitofp.v64f32.v64i64(<64 x i64> undef, <64 x i1> undef, i32 undef) + %vp_v64fi64_v64f64 = call <64 x double> @llvm.vp.uitofp.v64f64.v64i64(<64 x i64> undef, <64 x i1> undef, i32 undef) + %vp_v64fi1_v64f32 = call <64 x float> @llvm.vp.uitofp.v64f32.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef) + %vp_v64fi1_v64f64 = call <64 x double> @llvm.vp.uitofp.v64f64.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef) %v128i8_v128f32 = uitofp <128 x i8> undef to <128 x float> %v128i8_v128f64 = uitofp <128 x i8> undef to <128 x double> @@ -6729,16 +6729,16 @@ define void @uitofp() { %v128i1_v128f32 = uitofp <128 x i1> undef to <128 x float> %v128i1_v128f64 = uitofp <128 x i1> undef to <128 x double> - %vp_v128fi8_v128f32 = call <128 x float> @llvm.vp.uitofp.v128i8.v128float(<128 x i8> undef, <128 x i1> undef, i32 undef) - %vp_v128fi8_v128f64 = call <128 x double> @llvm.vp.uitofp.v128i8.v128double(<128 x i8> undef, <128 x i1> undef, i32 undef) - %vp_v128fi16_v128f32 = call <128 x float> @llvm.vp.uitofp.v128i16.v128float(<128 x i16> undef, <128 x i1> undef, i32 undef) - %vp_v128fi16_v128f64 = call <128 x double> @llvm.vp.uitofp.v128i16.v128double(<128 x i16> undef, <128 x i1> undef, i32 undef) - 
%vp_v128fi32_v128f32 = call <128 x float> @llvm.vp.uitofp.v128i32.v128float(<128 x i32> undef, <128 x i1> undef, i32 undef) - %vp_v128fi32_v128f64 = call <128 x double> @llvm.vp.uitofp.v128i32.v128double(<128 x i32> undef, <128 x i1> undef, i32 undef) - %vp_v128fi64_v128f32 = call <128 x float> @llvm.vp.uitofp.v128i64.v128float(<128 x i64> undef, <128 x i1> undef, i32 undef) - %vp_v128fi64_v128f64 = call <128 x double> @llvm.vp.uitofp.v128i64.v128double(<128 x i64> undef, <128 x i1> undef, i32 undef) - %vp_v128fi1_v128f32 = call <128 x float> @llvm.vp.uitofp.v128i1.v128float(<128 x i1> undef, <128 x i1> undef, i32 undef) - %vp_v128fi1_v128f64 = call <128 x double> @llvm.vp.uitofp.v128i1.v128double(<128 x i1> undef, <128 x i1> undef, i32 undef) + %vp_v128fi8_v128f32 = call <128 x float> @llvm.vp.uitofp.v128f32.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef) + %vp_v128fi8_v128f64 = call <128 x double> @llvm.vp.uitofp.v128f64.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef) + %vp_v128fi16_v128f32 = call <128 x float> @llvm.vp.uitofp.v128f32.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef) + %vp_v128fi16_v128f64 = call <128 x double> @llvm.vp.uitofp.v128f64.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef) + %vp_v128fi32_v128f32 = call <128 x float> @llvm.vp.uitofp.v128f32.v128i32(<128 x i32> undef, <128 x i1> undef, i32 undef) + %vp_v128fi32_v128f64 = call <128 x double> @llvm.vp.uitofp.v128f64.v128i32(<128 x i32> undef, <128 x i1> undef, i32 undef) + %vp_v128fi64_v128f32 = call <128 x float> @llvm.vp.uitofp.v128f32.v128i64(<128 x i64> undef, <128 x i1> undef, i32 undef) + %vp_v128fi64_v128f64 = call <128 x double> @llvm.vp.uitofp.v128f64.v128i64(<128 x i64> undef, <128 x i1> undef, i32 undef) + %vp_v128fi1_v128f32 = call <128 x float> @llvm.vp.uitofp.v128f32.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef) + %vp_v128fi1_v128f64 = call <128 x double> @llvm.vp.uitofp.v128f64.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef) %nxv1i8_nxv1f32 = uitofp undef to %nxv1i8_nxv1f64 = uitofp undef to @@ -6751,16 +6751,16 @@ define void @uitofp() { %nxv1i1_nxv1f32 = uitofp undef to %nxv1i1_nxv1f64 = uitofp undef to - %vp_nxv1fi8_nxv1f32 = call @llvm.vp.uitofp.nxv1i8.nxv1float( undef, undef, i32 undef) - %vp_nxv1fi8_nxv1f64 = call @llvm.vp.uitofp.nxv1i8.nxv1double( undef, undef, i32 undef) - %vp_nxv1fi16_nxv1f32 = call @llvm.vp.uitofp.nxv1i16.nxv1float( undef, undef, i32 undef) - %vp_nxv1fi16_nxv1f64 = call @llvm.vp.uitofp.nxv1i16.nxv1double( undef, undef, i32 undef) - %vp_nxv1fi32_nxv1f32 = call @llvm.vp.uitofp.nxv1i32.nxv1float( undef, undef, i32 undef) - %vp_nxv1fi32_nxv1f64 = call @llvm.vp.uitofp.nxv1i32.nxv1double( undef, undef, i32 undef) - %vp_nxv1fi64_nxv1f32 = call @llvm.vp.uitofp.nxv1i64.nxv1float( undef, undef, i32 undef) - %vp_nxv1fi64_nxv1f64 = call @llvm.vp.uitofp.nxv1i64.nxv1double( undef, undef, i32 undef) - %vp_nxv1fi1_nxv1f32 = call @llvm.vp.uitofp.nxv1i1.nxv1float( undef, undef, i32 undef) - %vp_nxv1fi1_nxv1f64 = call @llvm.vp.uitofp.nxv1i1.nxv1double( undef, undef, i32 undef) + %vp_nxv1fi8_nxv1f32 = call @llvm.vp.uitofp.nxv1f32.nxv1i8( undef, undef, i32 undef) + %vp_nxv1fi8_nxv1f64 = call @llvm.vp.uitofp.nxv1f64.nxv1i8( undef, undef, i32 undef) + %vp_nxv1fi16_nxv1f32 = call @llvm.vp.uitofp.nxv1f32.nxv1i16( undef, undef, i32 undef) + %vp_nxv1fi16_nxv1f64 = call @llvm.vp.uitofp.nxv1f64.nxv1i16( undef, undef, i32 undef) + %vp_nxv1fi32_nxv1f32 = call @llvm.vp.uitofp.nxv1f32.nxv1i32( undef, undef, i32 undef) + %vp_nxv1fi32_nxv1f64 = call 
@llvm.vp.uitofp.nxv1f64.nxv1i32( undef, undef, i32 undef) + %vp_nxv1fi64_nxv1f32 = call @llvm.vp.uitofp.nxv1f32.nxv1i64( undef, undef, i32 undef) + %vp_nxv1fi64_nxv1f64 = call @llvm.vp.uitofp.nxv1f64.nxv1i64( undef, undef, i32 undef) + %vp_nxv1fi1_nxv1f32 = call @llvm.vp.uitofp.nxv1f32.nxv1i1( undef, undef, i32 undef) + %vp_nxv1fi1_nxv1f64 = call @llvm.vp.uitofp.nxv1f64.nxv1i1( undef, undef, i32 undef) %nxv2i8_nxv2f32 = uitofp undef to %nxv2i8_nxv2f64 = uitofp undef to @@ -6773,16 +6773,16 @@ define void @uitofp() { %nxv2i1_nxv2f32 = uitofp undef to %nxv2i1_nxv2f64 = uitofp undef to - %vp_nxv2fi8_nxv2f32 = call @llvm.vp.uitofp.nxv2i8.nxv2float( undef, undef, i32 undef) - %vp_nxv2fi8_nxv2f64 = call @llvm.vp.uitofp.nxv2i8.nxv2double( undef, undef, i32 undef) - %vp_nxv2fi16_nxv2f32 = call @llvm.vp.uitofp.nxv2i16.nxv2float( undef, undef, i32 undef) - %vp_nxv2fi16_nxv2f64 = call @llvm.vp.uitofp.nxv2i16.nxv2double( undef, undef, i32 undef) - %vp_nxv2fi32_nxv2f32 = call @llvm.vp.uitofp.nxv2i32.nxv2float( undef, undef, i32 undef) - %vp_nxv2fi32_nxv2f64 = call @llvm.vp.uitofp.nxv2i32.nxv2double( undef, undef, i32 undef) - %vp_nxv2fi64_nxv2f32 = call @llvm.vp.uitofp.nxv2i64.nxv2float( undef, undef, i32 undef) - %vp_nxv2fi64_nxv2f64 = call @llvm.vp.uitofp.nxv2i64.nxv2double( undef, undef, i32 undef) - %vp_nxv2fi1_nxv2f32 = call @llvm.vp.uitofp.nxv2i1.nxv2float( undef, undef, i32 undef) - %vp_nxv2fi1_nxv2f64 = call @llvm.vp.uitofp.nxv2i1.nxv2double( undef, undef, i32 undef) + %vp_nxv2fi8_nxv2f32 = call @llvm.vp.uitofp.nxv2f32.nxv2i8( undef, undef, i32 undef) + %vp_nxv2fi8_nxv2f64 = call @llvm.vp.uitofp.nxv2f64.nxv2i8( undef, undef, i32 undef) + %vp_nxv2fi16_nxv2f32 = call @llvm.vp.uitofp.nxv2f32.nxv2i16( undef, undef, i32 undef) + %vp_nxv2fi16_nxv2f64 = call @llvm.vp.uitofp.nxv2f64.nxv2i16( undef, undef, i32 undef) + %vp_nxv2fi32_nxv2f32 = call @llvm.vp.uitofp.nxv2f32.nxv2i32( undef, undef, i32 undef) + %vp_nxv2fi32_nxv2f64 = call @llvm.vp.uitofp.nxv2f64.nxv2i32( undef, undef, i32 undef) + %vp_nxv2fi64_nxv2f32 = call @llvm.vp.uitofp.nxv2f32.nxv2i64( undef, undef, i32 undef) + %vp_nxv2fi64_nxv2f64 = call @llvm.vp.uitofp.nxv2f64.nxv2i64( undef, undef, i32 undef) + %vp_nxv2fi1_nxv2f32 = call @llvm.vp.uitofp.nxv2f32.nxv2i1( undef, undef, i32 undef) + %vp_nxv2fi1_nxv2f64 = call @llvm.vp.uitofp.nxv2f64.nxv2i1( undef, undef, i32 undef) %nxv4i8_nxv4f32 = uitofp undef to %nxv4i8_nxv4f64 = uitofp undef to @@ -6795,16 +6795,16 @@ define void @uitofp() { %nxv4i1_nxv4f32 = uitofp undef to %nxv4i1_nxv4f64 = uitofp undef to - %vp_nxv4fi8_nxv4f32 = call @llvm.vp.uitofp.nxv4i8.nxv4float( undef, undef, i32 undef) - %vp_nxv4fi8_nxv4f64 = call @llvm.vp.uitofp.nxv4i8.nxv4double( undef, undef, i32 undef) - %vp_nxv4fi16_nxv4f32 = call @llvm.vp.uitofp.nxv4i16.nxv4float( undef, undef, i32 undef) - %vp_nxv4fi16_nxv4f64 = call @llvm.vp.uitofp.nxv4i16.nxv4double( undef, undef, i32 undef) - %vp_nxv4fi32_nxv4f32 = call @llvm.vp.uitofp.nxv4i32.nxv4float( undef, undef, i32 undef) - %vp_nxv4fi32_nxv4f64 = call @llvm.vp.uitofp.nxv4i32.nxv4double( undef, undef, i32 undef) - %vp_nxv4fi64_nxv4f32 = call @llvm.vp.uitofp.nxv4i64.nxv4float( undef, undef, i32 undef) - %vp_nxv4fi64_nxv4f64 = call @llvm.vp.uitofp.nxv4i64.nxv4double( undef, undef, i32 undef) - %vp_nxv4fi1_nxv4f32 = call @llvm.vp.uitofp.nxv4i1.nxv4float( undef, undef, i32 undef) - %vp_nxv4fi1_nxv4f64 = call @llvm.vp.uitofp.nxv4i1.nxv4double( undef, undef, i32 undef) + %vp_nxv4fi8_nxv4f32 = call @llvm.vp.uitofp.nxv4f32.nxv4i8( undef, undef, i32 undef) + %vp_nxv4fi8_nxv4f64 = call 
@llvm.vp.uitofp.nxv4f64.nxv4i8( undef, undef, i32 undef) + %vp_nxv4fi16_nxv4f32 = call @llvm.vp.uitofp.nxv4f32.nxv4i16( undef, undef, i32 undef) + %vp_nxv4fi16_nxv4f64 = call @llvm.vp.uitofp.nxv4f64.nxv4i16( undef, undef, i32 undef) + %vp_nxv4fi32_nxv4f32 = call @llvm.vp.uitofp.nxv4f32.nxv4i32( undef, undef, i32 undef) + %vp_nxv4fi32_nxv4f64 = call @llvm.vp.uitofp.nxv4f64.nxv4i32( undef, undef, i32 undef) + %vp_nxv4fi64_nxv4f32 = call @llvm.vp.uitofp.nxv4f32.nxv4i64( undef, undef, i32 undef) + %vp_nxv4fi64_nxv4f64 = call @llvm.vp.uitofp.nxv4f64.nxv4i64( undef, undef, i32 undef) + %vp_nxv4fi1_nxv4f32 = call @llvm.vp.uitofp.nxv4f32.nxv4i1( undef, undef, i32 undef) + %vp_nxv4fi1_nxv4f64 = call @llvm.vp.uitofp.nxv4f64.nxv4i1( undef, undef, i32 undef) %nxv8i8_nxv8f32 = uitofp undef to %nxv8i8_nxv8f64 = uitofp undef to @@ -6817,16 +6817,16 @@ define void @uitofp() { %nxv8i1_nxv8f32 = uitofp undef to %nxv8i1_nxv8f64 = uitofp undef to - %vp_nxv8fi8_nxv8f32 = call @llvm.vp.uitofp.nxv8i8.nxv8float( undef, undef, i32 undef) - %vp_nxv8fi8_nxv8f64 = call @llvm.vp.uitofp.nxv8i8.nxv8double( undef, undef, i32 undef) - %vp_nxv8fi16_nxv8f32 = call @llvm.vp.uitofp.nxv8i16.nxv8float( undef, undef, i32 undef) - %vp_nxv8fi16_nxv8f64 = call @llvm.vp.uitofp.nxv8i16.nxv8double( undef, undef, i32 undef) - %vp_nxv8fi32_nxv8f32 = call @llvm.vp.uitofp.nxv8i32.nxv8float( undef, undef, i32 undef) - %vp_nxv8fi32_nxv8f64 = call @llvm.vp.uitofp.nxv8i32.nxv8double( undef, undef, i32 undef) - %vp_nxv8fi64_nxv8f32 = call @llvm.vp.uitofp.nxv8i64.nxv8float( undef, undef, i32 undef) - %vp_nxv8fi64_nxv8f64 = call @llvm.vp.uitofp.nxv8i64.nxv8double( undef, undef, i32 undef) - %vp_nxv8fi1_nxv8f32 = call @llvm.vp.uitofp.nxv8i1.nxv8float( undef, undef, i32 undef) - %vp_nxv8fi1_nxv8f64 = call @llvm.vp.uitofp.nxv8i1.nxv8double( undef, undef, i32 undef) + %vp_nxv8fi8_nxv8f32 = call @llvm.vp.uitofp.nxv8f32.nxv8i8( undef, undef, i32 undef) + %vp_nxv8fi8_nxv8f64 = call @llvm.vp.uitofp.nxv8f64.nxv8i8( undef, undef, i32 undef) + %vp_nxv8fi16_nxv8f32 = call @llvm.vp.uitofp.nxv8f32.nxv8i16( undef, undef, i32 undef) + %vp_nxv8fi16_nxv8f64 = call @llvm.vp.uitofp.nxv8f64.nxv8i16( undef, undef, i32 undef) + %vp_nxv8fi32_nxv8f32 = call @llvm.vp.uitofp.nxv8f32.nxv8i32( undef, undef, i32 undef) + %vp_nxv8fi32_nxv8f64 = call @llvm.vp.uitofp.nxv8f64.nxv8i32( undef, undef, i32 undef) + %vp_nxv8fi64_nxv8f32 = call @llvm.vp.uitofp.nxv8f32.nxv8i64( undef, undef, i32 undef) + %vp_nxv8fi64_nxv8f64 = call @llvm.vp.uitofp.nxv8f64.nxv8i64( undef, undef, i32 undef) + %vp_nxv8fi1_nxv8f32 = call @llvm.vp.uitofp.nxv8f32.nxv8i1( undef, undef, i32 undef) + %vp_nxv8fi1_nxv8f64 = call @llvm.vp.uitofp.nxv8f64.nxv8i1( undef, undef, i32 undef) %nxv16i8_nxv16f32 = uitofp undef to %nxv16i8_nxv16f64 = uitofp undef to @@ -6839,16 +6839,16 @@ define void @uitofp() { %nxv16i1_nxv16f32 = uitofp undef to %nxv16i1_nxv16f64 = uitofp undef to - %vp_nxv16fi8_nxv16f32 = call @llvm.vp.uitofp.nxv16i8.nxv16float( undef, undef, i32 undef) - %vp_nxv16fi8_nxv16f64 = call @llvm.vp.uitofp.nxv16i8.nxv16double( undef, undef, i32 undef) - %vp_nxv16fi16_nxv16f32 = call @llvm.vp.uitofp.nxv16i16.nxv16float( undef, undef, i32 undef) - %vp_nxv16fi16_nxv16f64 = call @llvm.vp.uitofp.nxv16i16.nxv16double( undef, undef, i32 undef) - %vp_nxv16fi32_nxv16f32 = call @llvm.vp.uitofp.nxv16i32.nxv16float( undef, undef, i32 undef) - %vp_nxv16fi32_nxv16f64 = call @llvm.vp.uitofp.nxv16i32.nxv16double( undef, undef, i32 undef) - %vp_nxv16fi64_nxv16f32 = call @llvm.vp.uitofp.nxv16i64.nxv16float( undef, undef, i32 
undef) - %vp_nxv16fi64_nxv16f64 = call @llvm.vp.uitofp.nxv16i64.nxv16double( undef, undef, i32 undef) - %vp_nxv16fi1_nxv16f32 = call @llvm.vp.uitofp.nxv16i1.nxv16float( undef, undef, i32 undef) - %vp_nxv16fi1_nxv16f64 = call @llvm.vp.uitofp.nxv16i1.nxv16double( undef, undef, i32 undef) + %vp_nxv16fi8_nxv16f32 = call @llvm.vp.uitofp.nxv16f32.nxv16i8( undef, undef, i32 undef) + %vp_nxv16fi8_nxv16f64 = call @llvm.vp.uitofp.nxv16f64.nxv16i8( undef, undef, i32 undef) + %vp_nxv16fi16_nxv16f32 = call @llvm.vp.uitofp.nxv16f32.nxv16i16( undef, undef, i32 undef) + %vp_nxv16fi16_nxv16f64 = call @llvm.vp.uitofp.nxv16f64.nxv16i16( undef, undef, i32 undef) + %vp_nxv16fi32_nxv16f32 = call @llvm.vp.uitofp.nxv16f32.nxv16i32( undef, undef, i32 undef) + %vp_nxv16fi32_nxv16f64 = call @llvm.vp.uitofp.nxv16f64.nxv16i32( undef, undef, i32 undef) + %vp_nxv16fi64_nxv16f32 = call @llvm.vp.uitofp.nxv16f32.nxv16i64( undef, undef, i32 undef) + %vp_nxv16fi64_nxv16f64 = call @llvm.vp.uitofp.nxv16f64.nxv16i64( undef, undef, i32 undef) + %vp_nxv16fi1_nxv16f32 = call @llvm.vp.uitofp.nxv16f32.nxv16i1( undef, undef, i32 undef) + %vp_nxv16fi1_nxv16f64 = call @llvm.vp.uitofp.nxv16f64.nxv16i1( undef, undef, i32 undef) %nxv32i8_nxv32f32 = uitofp undef to %nxv32i8_nxv32f64 = uitofp undef to @@ -6861,16 +6861,16 @@ define void @uitofp() { %nxv32i1_nxv32f32 = uitofp undef to %nxv32i1_nxv32f64 = uitofp undef to - %vp_nxv32fi8_nxv32f32 = call @llvm.vp.uitofp.nxv32i8.nxv32float( undef, undef, i32 undef) - %vp_nxv32fi8_nxv32f64 = call @llvm.vp.uitofp.nxv32i8.nxv32double( undef, undef, i32 undef) - %vp_nxv32fi16_nxv32f32 = call @llvm.vp.uitofp.nxv32i16.nxv32float( undef, undef, i32 undef) - %vp_nxv32fi16_nxv32f64 = call @llvm.vp.uitofp.nxv32i16.nxv32double( undef, undef, i32 undef) - %vp_nxv32fi32_nxv32f32 = call @llvm.vp.uitofp.nxv32i32.nxv32float( undef, undef, i32 undef) - %vp_nxv32fi32_nxv32f64 = call @llvm.vp.uitofp.nxv32i32.nxv32double( undef, undef, i32 undef) - %vp_nxv32fi64_nxv32f32 = call @llvm.vp.uitofp.nxv32i64.nxv32float( undef, undef, i32 undef) - %vp_nxv32fi64_nxv32f64 = call @llvm.vp.uitofp.nxv32i64.nxv32double( undef, undef, i32 undef) - %vp_nxv32fi1_nxv32f32 = call @llvm.vp.uitofp.nxv32i1.nxv32float( undef, undef, i32 undef) - %vp_nxv32fi1_nxv32f64 = call @llvm.vp.uitofp.nxv32i1.nxv32double( undef, undef, i32 undef) + %vp_nxv32fi8_nxv32f32 = call @llvm.vp.uitofp.nxv32f32.nxv32i8( undef, undef, i32 undef) + %vp_nxv32fi8_nxv32f64 = call @llvm.vp.uitofp.nxv32f64.nxv32i8( undef, undef, i32 undef) + %vp_nxv32fi16_nxv32f32 = call @llvm.vp.uitofp.nxv32f32.nxv32i16( undef, undef, i32 undef) + %vp_nxv32fi16_nxv32f64 = call @llvm.vp.uitofp.nxv32f64.nxv32i16( undef, undef, i32 undef) + %vp_nxv32fi32_nxv32f32 = call @llvm.vp.uitofp.nxv32f32.nxv32i32( undef, undef, i32 undef) + %vp_nxv32fi32_nxv32f64 = call @llvm.vp.uitofp.nxv32f64.nxv32i32( undef, undef, i32 undef) + %vp_nxv32fi64_nxv32f32 = call @llvm.vp.uitofp.nxv32f32.nxv32i64( undef, undef, i32 undef) + %vp_nxv32fi64_nxv32f64 = call @llvm.vp.uitofp.nxv32f64.nxv32i64( undef, undef, i32 undef) + %vp_nxv32fi1_nxv32f32 = call @llvm.vp.uitofp.nxv32f32.nxv32i1( undef, undef, i32 undef) + %vp_nxv32fi1_nxv32f64 = call @llvm.vp.uitofp.nxv32f64.nxv32i1( undef, undef, i32 undef) %nxv64i8_nxv64f32 = uitofp undef to %nxv64i8_nxv64f64 = uitofp undef to @@ -6883,16 +6883,16 @@ define void @uitofp() { %nxv64i1_nxv64f32 = uitofp undef to %nxv64i1_nxv64f64 = uitofp undef to - %vp_nxv64fi8_nxv64f32 = call @llvm.vp.uitofp.nxv64i8.nxv64float( undef, undef, i32 undef) - %vp_nxv64fi8_nxv64f64 = call 
@llvm.vp.uitofp.nxv64i8.nxv64double( undef, undef, i32 undef) - %vp_nxv64fi16_nxv64f32 = call @llvm.vp.uitofp.nxv64i16.nxv64float( undef, undef, i32 undef) - %vp_nxv64fi16_nxv64f64 = call @llvm.vp.uitofp.nxv64i16.nxv64double( undef, undef, i32 undef) - %vp_nxv64fi32_nxv64f32 = call @llvm.vp.uitofp.nxv64i32.nxv64float( undef, undef, i32 undef) - %vp_nxv64fi32_nxv64f64 = call @llvm.vp.uitofp.nxv64i32.nxv64double( undef, undef, i32 undef) - %vp_nxv64fi64_nxv64f32 = call @llvm.vp.uitofp.nxv64i64.nxv64float( undef, undef, i32 undef) - %vp_nxv64fi64_nxv64f64 = call @llvm.vp.uitofp.nxv64i64.nxv64double( undef, undef, i32 undef) - %vp_nxv64fi1_nxv64f32 = call @llvm.vp.uitofp.nxv64i1.nxv64float( undef, undef, i32 undef) - %vp_nxv64fi1_nxv64f64 = call @llvm.vp.uitofp.nxv64i1.nxv64double( undef, undef, i32 undef) + %vp_nxv64fi8_nxv64f32 = call @llvm.vp.uitofp.nxv64f32.nxv64i8( undef, undef, i32 undef) + %vp_nxv64fi8_nxv64f64 = call @llvm.vp.uitofp.nxv64f64.nxv64i8( undef, undef, i32 undef) + %vp_nxv64fi16_nxv64f32 = call @llvm.vp.uitofp.nxv64f32.nxv64i16( undef, undef, i32 undef) + %vp_nxv64fi16_nxv64f64 = call @llvm.vp.uitofp.nxv64f64.nxv64i16( undef, undef, i32 undef) + %vp_nxv64fi32_nxv64f32 = call @llvm.vp.uitofp.nxv64f32.nxv64i32( undef, undef, i32 undef) + %vp_nxv64fi32_nxv64f64 = call @llvm.vp.uitofp.nxv64f64.nxv64i32( undef, undef, i32 undef) + %vp_nxv64fi64_nxv64f32 = call @llvm.vp.uitofp.nxv64f32.nxv64i64( undef, undef, i32 undef) + %vp_nxv64fi64_nxv64f64 = call @llvm.vp.uitofp.nxv64f64.nxv64i64( undef, undef, i32 undef) + %vp_nxv64fi1_nxv64f32 = call @llvm.vp.uitofp.nxv64f32.nxv64i1( undef, undef, i32 undef) + %vp_nxv64fi1_nxv64f64 = call @llvm.vp.uitofp.nxv64f64.nxv64i1( undef, undef, i32 undef) ret void } diff --git a/llvm/test/Analysis/CostModel/RISCV/cmp.ll b/llvm/test/Analysis/CostModel/RISCV/cmp.ll index bc3be27c0dbf522d1621457235d24dde345bd5a9..2ac7d0bc2106d756ea32b0b6db75b03ff19bcaaa 100644 --- a/llvm/test/Analysis/CostModel/RISCV/cmp.ll +++ b/llvm/test/Analysis/CostModel/RISCV/cmp.ll @@ -415,83 +415,83 @@ define void @fcmp() { ; fcmp olt <2 x float> undef, undef fcmp olt <2 x double> undef, undef - call <2 x i1> @llvm.vp.fcmp.v2float(<2 x float> undef, <2 x float> undef, metadata !"olt", <2 x i1> undef, i32 undef) - call <2 x i1> @llvm.vp.fcmp.v2double(<2 x double> undef, <2 x double> undef, metadata !"olt", <2 x i1> undef, i32 undef) + call <2 x i1> @llvm.vp.fcmp.v2f32(<2 x float> undef, <2 x float> undef, metadata !"olt", <2 x i1> undef, i32 undef) + call <2 x i1> @llvm.vp.fcmp.v2f64(<2 x double> undef, <2 x double> undef, metadata !"olt", <2 x i1> undef, i32 undef) fcmp olt <4 x float> undef, undef fcmp olt <4 x double> undef, undef - call <4 x i1> @llvm.vp.fcmp.v4float(<4 x float> undef, <4 x float> undef, metadata !"olt", <4 x i1> undef, i32 undef) - call <4 x i1> @llvm.vp.fcmp.v4double(<4 x double> undef, <4 x double> undef, metadata !"olt", <4 x i1> undef, i32 undef) + call <4 x i1> @llvm.vp.fcmp.v4f32(<4 x float> undef, <4 x float> undef, metadata !"olt", <4 x i1> undef, i32 undef) + call <4 x i1> @llvm.vp.fcmp.v4f64(<4 x double> undef, <4 x double> undef, metadata !"olt", <4 x i1> undef, i32 undef) fcmp olt <8 x float> undef, undef fcmp olt <8 x double> undef, undef - call <8 x i1> @llvm.vp.fcmp.v8float(<8 x float> undef, <8 x float> undef, metadata !"olt", <8 x i1> undef, i32 undef) - call <8 x i1> @llvm.vp.fcmp.v8double(<8 x double> undef, <8 x double> undef, metadata !"olt", <8 x i1> undef, i32 undef) + call <8 x i1> @llvm.vp.fcmp.v8f32(<8 x float> undef, <8 x float> 
undef, metadata !"olt", <8 x i1> undef, i32 undef) + call <8 x i1> @llvm.vp.fcmp.v8f64(<8 x double> undef, <8 x double> undef, metadata !"olt", <8 x i1> undef, i32 undef) fcmp olt <16 x float> undef, undef fcmp olt <16 x double> undef, undef - call <16 x i1> @llvm.vp.fcmp.v16float(<16 x float> undef, <16 x float> undef, metadata !"olt", <16 x i1> undef, i32 undef) - call <16 x i1> @llvm.vp.fcmp.v16double(<16 x double> undef, <16 x double> undef, metadata !"olt", <16 x i1> undef, i32 undef) + call <16 x i1> @llvm.vp.fcmp.v16f32(<16 x float> undef, <16 x float> undef, metadata !"olt", <16 x i1> undef, i32 undef) + call <16 x i1> @llvm.vp.fcmp.v16f64(<16 x double> undef, <16 x double> undef, metadata !"olt", <16 x i1> undef, i32 undef) fcmp olt <32 x float> undef, undef fcmp olt <32 x double> undef, undef - call <32 x i1> @llvm.vp.fcmp.v32float(<32 x float> undef, <32 x float> undef, metadata !"olt", <32 x i1> undef, i32 undef) - call <32 x i1> @llvm.vp.fcmp.v32double(<32 x double> undef, <32 x double> undef, metadata !"olt", <32 x i1> undef, i32 undef) + call <32 x i1> @llvm.vp.fcmp.v32f32(<32 x float> undef, <32 x float> undef, metadata !"olt", <32 x i1> undef, i32 undef) + call <32 x i1> @llvm.vp.fcmp.v32f64(<32 x double> undef, <32 x double> undef, metadata !"olt", <32 x i1> undef, i32 undef) fcmp olt <64 x float> undef, undef fcmp olt <64 x double> undef, undef - call <64 x i1> @llvm.vp.fcmp.v64float(<64 x float> undef, <64 x float> undef, metadata !"olt", <64 x i1> undef, i32 undef) - call <64 x i1> @llvm.vp.fcmp.v64double(<64 x double> undef, <64 x double> undef, metadata !"olt", <64 x i1> undef, i32 undef) + call <64 x i1> @llvm.vp.fcmp.v64f32(<64 x float> undef, <64 x float> undef, metadata !"olt", <64 x i1> undef, i32 undef) + call <64 x i1> @llvm.vp.fcmp.v64f64(<64 x double> undef, <64 x double> undef, metadata !"olt", <64 x i1> undef, i32 undef) fcmp olt <128 x float> undef, undef fcmp olt <128 x double> undef, undef - call <128 x i1> @llvm.vp.fcmp.v128float(<128 x float> undef, <128 x float> undef, metadata !"olt", <128 x i1> undef, i32 undef) - call <128 x i1> @llvm.vp.fcmp.v128double(<128 x double> undef, <128 x double> undef, metadata !"olt", <128 x i1> undef, i32 undef) + call <128 x i1> @llvm.vp.fcmp.v128f32(<128 x float> undef, <128 x float> undef, metadata !"olt", <128 x i1> undef, i32 undef) + call <128 x i1> @llvm.vp.fcmp.v128f64(<128 x double> undef, <128 x double> undef, metadata !"olt", <128 x i1> undef, i32 undef) fcmp olt <256 x float> undef, undef fcmp olt <256 x double> undef, undef - call <256 x i1> @llvm.vp.fcmp.v256float(<256 x float> undef, <256 x float> undef, metadata !"olt", <256 x i1> undef, i32 undef) - call <256 x i1> @llvm.vp.fcmp.v256double(<256 x double> undef, <256 x double> undef, metadata !"olt", <256 x i1> undef, i32 undef) + call <256 x i1> @llvm.vp.fcmp.v256f32(<256 x float> undef, <256 x float> undef, metadata !"olt", <256 x i1> undef, i32 undef) + call <256 x i1> @llvm.vp.fcmp.v256f64(<256 x double> undef, <256 x double> undef, metadata !"olt", <256 x i1> undef, i32 undef) fcmp olt undef, undef fcmp olt undef, undef - call @llvm.vp.fcmp.nxv1float( undef, undef, metadata !"olt", undef, i32 undef) - call @llvm.vp.fcmp.nxv1double( undef, undef, metadata !"olt", undef, i32 undef) + call @llvm.vp.fcmp.nxv1f32( undef, undef, metadata !"olt", undef, i32 undef) + call @llvm.vp.fcmp.nxv1f64( undef, undef, metadata !"olt", undef, i32 undef) fcmp olt undef, undef fcmp olt undef, undef - call @llvm.vp.fcmp.nxv2float( undef, undef, metadata !"olt", undef, 
i32 undef) - call @llvm.vp.fcmp.nxv2double( undef, undef, metadata !"olt", undef, i32 undef) + call @llvm.vp.fcmp.nxv2f32( undef, undef, metadata !"olt", undef, i32 undef) + call @llvm.vp.fcmp.nxv2f64( undef, undef, metadata !"olt", undef, i32 undef) fcmp olt undef, undef fcmp olt undef, undef - call @llvm.vp.fcmp.nxv4float( undef, undef, metadata !"olt", undef, i32 undef) - call @llvm.vp.fcmp.nxv4double( undef, undef, metadata !"olt", undef, i32 undef) + call @llvm.vp.fcmp.nxv4f32( undef, undef, metadata !"olt", undef, i32 undef) + call @llvm.vp.fcmp.nxv4f64( undef, undef, metadata !"olt", undef, i32 undef) fcmp olt undef, undef fcmp olt undef, undef - call @llvm.vp.fcmp.nxv8float( undef, undef, metadata !"olt", undef, i32 undef) - call @llvm.vp.fcmp.nxv8double( undef, undef, metadata !"olt", undef, i32 undef) + call @llvm.vp.fcmp.nxv8f32( undef, undef, metadata !"olt", undef, i32 undef) + call @llvm.vp.fcmp.nxv8f64( undef, undef, metadata !"olt", undef, i32 undef) fcmp olt undef, undef fcmp olt undef, undef - call @llvm.vp.fcmp.nxv16float( undef, undef, metadata !"olt", undef, i32 undef) - call @llvm.vp.fcmp.nxv16double( undef, undef, metadata !"olt", undef, i32 undef) + call @llvm.vp.fcmp.nxv16f32( undef, undef, metadata !"olt", undef, i32 undef) + call @llvm.vp.fcmp.nxv16f64( undef, undef, metadata !"olt", undef, i32 undef) fcmp olt undef, undef fcmp olt undef, undef - call @llvm.vp.fcmp.nxv32float( undef, undef, metadata !"olt", undef, i32 undef) - call @llvm.vp.fcmp.nxv32double( undef, undef, metadata !"olt", undef, i32 undef) + call @llvm.vp.fcmp.nxv32f32( undef, undef, metadata !"olt", undef, i32 undef) + call @llvm.vp.fcmp.nxv32f64( undef, undef, metadata !"olt", undef, i32 undef) fcmp olt undef, undef fcmp olt undef, undef - call @llvm.vp.fcmp.nxv64float( undef, undef, metadata !"olt", undef, i32 undef) - call @llvm.vp.fcmp.nxv64double( undef, undef, metadata !"olt", undef, i32 undef) + call @llvm.vp.fcmp.nxv64f32( undef, undef, metadata !"olt", undef, i32 undef) + call @llvm.vp.fcmp.nxv64f64( undef, undef, metadata !"olt", undef, i32 undef) fcmp olt undef, undef fcmp olt undef, undef - call @llvm.vp.fcmp.nxv128float( undef, undef, metadata !"olt", undef, i32 undef) - call @llvm.vp.fcmp.nxv128double( undef, undef, metadata !"olt", undef, i32 undef) + call @llvm.vp.fcmp.nxv128f32( undef, undef, metadata !"olt", undef, i32 undef) + call @llvm.vp.fcmp.nxv128f64( undef, undef, metadata !"olt", undef, i32 undef) ret void } diff --git a/llvm/test/Analysis/CostModel/RISCV/fp-sqrt-pow.ll b/llvm/test/Analysis/CostModel/RISCV/fp-sqrt-pow.ll index 25fd3d83315876400e7c4056239cc59aba0373b4..911a991bcef1f882d57a890b6968fb40f4a8577f 100644 --- a/llvm/test/Analysis/CostModel/RISCV/fp-sqrt-pow.ll +++ b/llvm/test/Analysis/CostModel/RISCV/fp-sqrt-pow.ll @@ -1,17 +1,18 @@ ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py -; RUN: opt < %s -passes="print" 2>&1 -disable-output -S -mtriple=riscv64 -mattr=+v,+f,+d,+zfh,+zvfh | FileCheck %s +; RUN: opt < %s -passes="print" 2>&1 -disable-output -S -mtriple=riscv64 -mattr=+v,+f,+d,+zvfh,+zvfbfmin | FileCheck %s --check-prefixes=CHECK,ZVFH +; RUN: opt < %s -passes="print" 2>&1 -disable-output -S -mtriple=riscv64 -mattr=+v,+f,+d,+zvfhmin,+zvfbfmin | FileCheck %s --check-prefixes=CHECK,ZVFHMIN define void @sqrt() { ; CHECK-LABEL: 'sqrt' -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %1 = call half @llvm.sqrt.f16(half undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %2 = 
call <2 x half> @llvm.sqrt.v2f16(<2 x half> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %3 = call <4 x half> @llvm.sqrt.v4f16(<4 x half> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %4 = call <8 x half> @llvm.sqrt.v8f16(<8 x half> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %5 = call <16 x half> @llvm.sqrt.v16f16(<16 x half> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %6 = call @llvm.sqrt.nxv2f16( undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %7 = call @llvm.sqrt.nxv4f16( undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %8 = call @llvm.sqrt.nxv8f16( undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %9 = call @llvm.sqrt.nxv16f16( undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %1 = call bfloat @llvm.sqrt.bf16(bfloat undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %2 = call <2 x bfloat> @llvm.sqrt.v2bf16(<2 x bfloat> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %3 = call <4 x bfloat> @llvm.sqrt.v4bf16(<4 x bfloat> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %4 = call <8 x bfloat> @llvm.sqrt.v8bf16(<8 x bfloat> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %5 = call <16 x bfloat> @llvm.sqrt.v16bf16(<16 x bfloat> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %6 = call @llvm.sqrt.nxv2bf16( undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %7 = call @llvm.sqrt.nxv4bf16( undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %8 = call @llvm.sqrt.nxv8bf16( undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %9 = call @llvm.sqrt.nxv16bf16( undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %10 = call float @llvm.sqrt.f32(float undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %11 = call <2 x float> @llvm.sqrt.v2f32(<2 x float> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %12 = call <4 x float> @llvm.sqrt.v4f32(<4 x float> undef) @@ -33,15 +34,15 @@ define void @sqrt() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %28 = call @llvm.sqrt.nxv8f64( undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; - call half @llvm.sqrt.f16(half undef) - call <2 x half> @llvm.sqrt.v2f16(<2 x half> undef) - call <4 x half> @llvm.sqrt.v4f16(<4 x half> undef) - call <8 x half> @llvm.sqrt.v8f16(<8 x half> undef) - call <16 x half> @llvm.sqrt.v16f16(<16 x half> undef) - call @llvm.sqrt.nvx2f16( undef) - call @llvm.sqrt.nvx4f16( undef) - call @llvm.sqrt.nvx8f16( undef) - call @llvm.sqrt.nvx16f16( undef) + call bfloat @llvm.sqrt.bf16(bfloat undef) + call <2 x bfloat> @llvm.sqrt.v2bf16(<2 x bfloat> undef) + call <4 x bfloat> @llvm.sqrt.v4bf16(<4 x bfloat> undef) + call <8 x bfloat> @llvm.sqrt.v8bf16(<8 x bfloat> undef) + call <16 x bfloat> @llvm.sqrt.v16bf16(<16 x bfloat> undef) + call @llvm.sqrt.nvx2bf16( undef) + call @llvm.sqrt.nvx4bf16( undef) + call @llvm.sqrt.nvx8bf16( undef) + call @llvm.sqrt.nvx16bf16( undef) call float @llvm.sqrt.f32(float undef) call <2 x float> @llvm.sqrt.v2f32(<2 x float> undef) call <4 x float> @llvm.sqrt.v4f32(<4 x float> 
undef) @@ -64,58 +65,74 @@ define void @sqrt() { ret void } -declare half @llvm.sqrt.f16(half) -declare <2 x half> @llvm.sqrt.v2f16(<2 x half>) -declare <4 x half> @llvm.sqrt.v4f16(<4 x half>) -declare <8 x half> @llvm.sqrt.v8f16(<8 x half>) -declare <16 x half> @llvm.sqrt.v16f16(<16 x half>) -declare <vscale x 2 x half> @llvm.sqrt.nvx2f16(<vscale x 2 x half>) -declare <vscale x 4 x half> @llvm.sqrt.nvx4f16(<vscale x 4 x half>) -declare <vscale x 8 x half> @llvm.sqrt.nvx8f16(<vscale x 8 x half>) -declare <vscale x 16 x half> @llvm.sqrt.nvx16f16(<vscale x 16 x half>) -declare float @llvm.sqrt.f32(float) -declare <2 x float> @llvm.sqrt.v2f32(<2 x float>) -declare <4 x float> @llvm.sqrt.v4f32(<4 x float>) -declare <8 x float> @llvm.sqrt.v8f32(<8 x float>) -declare <16 x float> @llvm.sqrt.v16f32(<16 x float>) -declare <vscale x 1 x float> @llvm.sqrt.nvx1f32(<vscale x 1 x float>) -declare <vscale x 2 x float> @llvm.sqrt.nvx2f32(<vscale x 2 x float>) -declare <vscale x 4 x float> @llvm.sqrt.nvx4f32(<vscale x 4 x float>) -declare <vscale x 8 x float> @llvm.sqrt.nvx8f32(<vscale x 8 x float>) -declare <vscale x 16 x float> @llvm.sqrt.nvx16f32(<vscale x 16 x float>) -declare double @llvm.sqrt.f64(double) -declare <2 x double> @llvm.sqrt.v2f64(<2 x double>) -declare <4 x double> @llvm.sqrt.v4f64(<4 x double>) -declare <8 x double> @llvm.sqrt.v8f64(<8 x double>) -declare <16 x double> @llvm.sqrt.v16f64(<16 x double>) -declare <vscale x 1 x double> @llvm.sqrt.nvx1f64(<vscale x 1 x double>) -declare <vscale x 2 x double> @llvm.sqrt.nvx2f64(<vscale x 2 x double>) -declare <vscale x 4 x double> @llvm.sqrt.nvx4f64(<vscale x 4 x double>) -declare <vscale x 8 x double> @llvm.sqrt.nvx8f64(<vscale x 8 x double>) +define void @sqrt_f16() { +; CHECK-LABEL: 'sqrt_f16' +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %1 = call half @llvm.sqrt.f16(half undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %2 = call <2 x half> @llvm.sqrt.v2f16(<2 x half> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %3 = call <4 x half> @llvm.sqrt.v4f16(<4 x half> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %4 = call <8 x half> @llvm.sqrt.v8f16(<8 x half> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %5 = call <16 x half> @llvm.sqrt.v16f16(<16 x half> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %6 = call <vscale x 2 x half> @llvm.sqrt.nxv2f16(<vscale x 2 x half> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %7 = call <vscale x 4 x half> @llvm.sqrt.nxv4f16(<vscale x 4 x half> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %8 = call <vscale x 8 x half> @llvm.sqrt.nxv8f16(<vscale x 8 x half> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %9 = call <vscale x 16 x half> @llvm.sqrt.nxv16f16(<vscale x 16 x half> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + call half @llvm.sqrt.f16(half undef) + call <2 x half> @llvm.sqrt.v2f16(<2 x half> undef) + call <4 x half> @llvm.sqrt.v4f16(<4 x half> undef) + call <8 x half> @llvm.sqrt.v8f16(<8 x half> undef) + call <16 x half> @llvm.sqrt.v16f16(<16 x half> undef) + call <vscale x 2 x half> @llvm.sqrt.nvx2f16(<vscale x 2 x half> undef) + call <vscale x 4 x half> @llvm.sqrt.nvx4f16(<vscale x 4 x half> undef) + call <vscale x 8 x half> @llvm.sqrt.nvx8f16(<vscale x 8 x half> undef) + call <vscale x 16 x half> @llvm.sqrt.nvx16f16(<vscale x 16 x half> undef) + ret void +} define void @pow() { ; CHECK-LABEL: 'pow' -; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %1 = call float @llvm.pow.f32(float undef, float undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %2 = call <2 x float> @llvm.pow.v2f32(<2 x float> undef, <2 x float> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %3 = call <4 x float> @llvm.pow.v4f32(<4 x float> undef, <4 x float> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %4 = call <8 x float> @llvm.pow.v8f32(<8 x float> undef, <8 x float> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 191 for instruction: %5 = call <16 x float> @llvm.pow.v16f32(<16 x float>
undef, <16 x float> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %6 = call @llvm.pow.nxv1f32( undef, undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %7 = call @llvm.pow.nxv2f32( undef, undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %8 = call @llvm.pow.nxv4f32( undef, undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %9 = call @llvm.pow.nxv8f32( undef, undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %10 = call @llvm.pow.nxv16f32( undef, undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %11 = call double @llvm.pow.f64(double undef, double undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %12 = call <2 x double> @llvm.pow.v2f64(<2 x double> undef, <2 x double> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %13 = call <4 x double> @llvm.pow.v4f64(<4 x double> undef, <4 x double> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %14 = call <8 x double> @llvm.pow.v8f64(<8 x double> undef, <8 x double> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 191 for instruction: %15 = call <16 x double> @llvm.pow.v16f64(<16 x double> undef, <16 x double> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %16 = call @llvm.pow.nxv1f64( undef, undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %17 = call @llvm.pow.nxv2f64( undef, undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %18 = call @llvm.pow.nxv4f64( undef, undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %19 = call @llvm.pow.nxv8f64( undef, undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %1 = call bfloat @llvm.pow.bf16(bfloat undef, bfloat undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %2 = call <2 x bfloat> @llvm.pow.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %3 = call <4 x bfloat> @llvm.pow.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %4 = call <8 x bfloat> @llvm.pow.v8bf16(<8 x bfloat> undef, <8 x bfloat> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 191 for instruction: %5 = call <16 x bfloat> @llvm.pow.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %6 = call @llvm.pow.nxv1bf16( undef, undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %7 = call @llvm.pow.nxv2bf16( undef, undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %8 = call @llvm.pow.nxv4bf16( undef, undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %9 = call @llvm.pow.nxv8bf16( undef, undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %10 = call @llvm.pow.nxv16bf16( undef, undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %11 = call float @llvm.pow.f32(float undef, float undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %12 = call <2 x float> @llvm.pow.v2f32(<2 x float> undef, <2 x float> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %13 = call <4 x float> @llvm.pow.v4f32(<4 x float> undef, <4 x float> undef) +; CHECK-NEXT: Cost Model: 
Found an estimated cost of 95 for instruction: %14 = call <8 x float> @llvm.pow.v8f32(<8 x float> undef, <8 x float> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 191 for instruction: %15 = call <16 x float> @llvm.pow.v16f32(<16 x float> undef, <16 x float> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %16 = call @llvm.pow.nxv1f32( undef, undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %17 = call @llvm.pow.nxv2f32( undef, undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %18 = call @llvm.pow.nxv4f32( undef, undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %19 = call @llvm.pow.nxv8f32( undef, undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %20 = call @llvm.pow.nxv16f32( undef, undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %21 = call double @llvm.pow.f64(double undef, double undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %22 = call <2 x double> @llvm.pow.v2f64(<2 x double> undef, <2 x double> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %23 = call <4 x double> @llvm.pow.v4f64(<4 x double> undef, <4 x double> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %24 = call <8 x double> @llvm.pow.v8f64(<8 x double> undef, <8 x double> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 191 for instruction: %25 = call <16 x double> @llvm.pow.v16f64(<16 x double> undef, <16 x double> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %26 = call @llvm.pow.nxv1f64( undef, undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %27 = call @llvm.pow.nxv2f64( undef, undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %28 = call @llvm.pow.nxv4f64( undef, undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %29 = call @llvm.pow.nxv8f64( undef, undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; + call bfloat @llvm.pow.bf16(bfloat undef, bfloat undef) + call <2 x bfloat> @llvm.pow.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef) + call <4 x bfloat> @llvm.pow.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef) + call <8 x bfloat> @llvm.pow.v8bf16(<8 x bfloat> undef, <8 x bfloat> undef) + call <16 x bfloat> @llvm.pow.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef) + call @llvm.pow.nvx1bf16( undef, undef) + call @llvm.pow.nvx2bf16( undef, undef) + call @llvm.pow.nvx4bf16( undef, undef) + call @llvm.pow.nvx8bf16( undef, undef) + call @llvm.pow.nvx16bf16( undef, undef) call float @llvm.pow.f32(float undef, float undef) call <2 x float> @llvm.pow.v2f32(<2 x float> undef, <2 x float> undef) call <4 x float> @llvm.pow.v4f32(<4 x float> undef, <4 x float> undef) @@ -138,22 +155,42 @@ define void @pow() { ret void } -declare float @llvm.pow.f32(float, float) -declare <2 x float> @llvm.pow.v2f32(<2 x float>, <2 x float>) -declare <4 x float> @llvm.pow.v4f32(<4 x float>, <4 x float>) -declare <8 x float> @llvm.pow.v8f32(<8 x float>, <8 x float>) -declare <16 x float> @llvm.pow.v16f32(<16 x float>, <16 x float>) -declare @llvm.pow.nvx1f32(, ) -declare @llvm.pow.nvx2f32(, ) -declare @llvm.pow.nvx4f32(, ) -declare @llvm.pow.nvx8f32(, ) -declare @llvm.pow.nvx16f32(, ) -declare double @llvm.pow.f64(double, double) -declare <2 x double> @llvm.pow.v2f64(<2 x double>, <2 x 
double>) -declare <4 x double> @llvm.pow.v4f64(<4 x double>, <4 x double>) -declare <8 x double> @llvm.pow.v8f64(<8 x double>, <8 x double>) -declare <16 x double> @llvm.pow.v16f64(<16 x double>, <16 x double>) -declare @llvm.pow.nvx1f64(, ) -declare @llvm.pow.nvx2f64(, ) -declare @llvm.pow.nvx4f64(, ) -declare @llvm.pow.nvx8f64(, ) +define void @pow_f16() { +; ZVFH-LABEL: 'pow_f16' +; ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %1 = call half @llvm.pow.f16(half undef, half undef) +; ZVFH-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %2 = call <2 x half> @llvm.pow.v2f16(<2 x half> undef, <2 x half> undef) +; ZVFH-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %3 = call <4 x half> @llvm.pow.v4f16(<4 x half> undef, <4 x half> undef) +; ZVFH-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %4 = call <8 x half> @llvm.pow.v8f16(<8 x half> undef, <8 x half> undef) +; ZVFH-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %5 = call <16 x half> @llvm.pow.v16f16(<16 x half> undef, <16 x half> undef) +; ZVFH-NEXT: Cost Model: Invalid cost for instruction: %6 = call @llvm.pow.nxv1f16( undef, undef) +; ZVFH-NEXT: Cost Model: Invalid cost for instruction: %7 = call @llvm.pow.nxv2f16( undef, undef) +; ZVFH-NEXT: Cost Model: Invalid cost for instruction: %8 = call @llvm.pow.nxv4f16( undef, undef) +; ZVFH-NEXT: Cost Model: Invalid cost for instruction: %9 = call @llvm.pow.nxv8f16( undef, undef) +; ZVFH-NEXT: Cost Model: Invalid cost for instruction: %10 = call @llvm.pow.nxv16f16( undef, undef) +; ZVFH-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; ZVFHMIN-LABEL: 'pow_f16' +; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %1 = call half @llvm.pow.f16(half undef, half undef) +; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %2 = call <2 x half> @llvm.pow.v2f16(<2 x half> undef, <2 x half> undef) +; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %3 = call <4 x half> @llvm.pow.v4f16(<4 x half> undef, <4 x half> undef) +; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %4 = call <8 x half> @llvm.pow.v8f16(<8 x half> undef, <8 x half> undef) +; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 191 for instruction: %5 = call <16 x half> @llvm.pow.v16f16(<16 x half> undef, <16 x half> undef) +; ZVFHMIN-NEXT: Cost Model: Invalid cost for instruction: %6 = call @llvm.pow.nxv1f16( undef, undef) +; ZVFHMIN-NEXT: Cost Model: Invalid cost for instruction: %7 = call @llvm.pow.nxv2f16( undef, undef) +; ZVFHMIN-NEXT: Cost Model: Invalid cost for instruction: %8 = call @llvm.pow.nxv4f16( undef, undef) +; ZVFHMIN-NEXT: Cost Model: Invalid cost for instruction: %9 = call @llvm.pow.nxv8f16( undef, undef) +; ZVFHMIN-NEXT: Cost Model: Invalid cost for instruction: %10 = call @llvm.pow.nxv16f16( undef, undef) +; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + call half @llvm.pow.f16(half undef, half undef) + call <2 x half> @llvm.pow.v2f16(<2 x half> undef, <2 x half> undef) + call <4 x half> @llvm.pow.v4f16(<4 x half> undef, <4 x half> undef) + call <8 x half> @llvm.pow.v8f16(<8 x half> undef, <8 x half> undef) + call <16 x half> @llvm.pow.v16f16(<16 x half> undef, <16 x half> undef) + call @llvm.pow.nvx1f16( undef, undef) + call @llvm.pow.nvx2f16( undef, undef) + call @llvm.pow.nvx4f16( undef, undef) + call @llvm.pow.nvx8f16( undef, undef) + call 
<vscale x 16 x half> @llvm.pow.nvx16f16(<vscale x 16 x half> undef, <vscale x 16 x half> undef) + ret void +} diff --git a/llvm/test/Analysis/CostModel/RISCV/fp-trig-log-exp.ll b/llvm/test/Analysis/CostModel/RISCV/fp-trig-log-exp.ll index ce775e37d62a9c150fa0e8529331fe55143e4abf..f81d4f3f55613659a6de1b2e0fc2570df2108248 100644 --- a/llvm/test/Analysis/CostModel/RISCV/fp-trig-log-exp.ll +++ b/llvm/test/Analysis/CostModel/RISCV/fp-trig-log-exp.ll @@ -1,29 +1,50 @@ ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py -; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -S -mtriple=riscv64 -mattr=+v,+f,+d | FileCheck %s +; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -S -mtriple=riscv64 -mattr=+v,+f,+d,+zvfh,+zvfbfmin | FileCheck %s --check-prefixes=CHECK,ZVFH +; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -S -mtriple=riscv64 -mattr=+v,+f,+d,+zvfhmin,+zvfbfmin | FileCheck %s --check-prefixes=CHECK,ZVFHMIN define void @sin() { ; CHECK-LABEL: 'sin' -; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %1 = call float @llvm.sin.f32(float undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %2 = call <2 x float> @llvm.sin.v2f32(<2 x float> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %3 = call <4 x float> @llvm.sin.v4f32(<4 x float> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %4 = call <8 x float> @llvm.sin.v8f32(<8 x float> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 191 for instruction: %5 = call <16 x float> @llvm.sin.v16f32(<16 x float> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %6 = call <vscale x 1 x float> @llvm.sin.nxv1f32(<vscale x 1 x float> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %7 = call <vscale x 2 x float> @llvm.sin.nxv2f32(<vscale x 2 x float> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %8 = call <vscale x 4 x float> @llvm.sin.nxv4f32(<vscale x 4 x float> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %9 = call <vscale x 8 x float> @llvm.sin.nxv8f32(<vscale x 8 x float> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %10 = call <vscale x 16 x float> @llvm.sin.nxv16f32(<vscale x 16 x float> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %11 = call double @llvm.sin.f64(double undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %12 = call <2 x double> @llvm.sin.v2f64(<2 x double> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %13 = call <4 x double> @llvm.sin.v4f64(<4 x double> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %14 = call <8 x double> @llvm.sin.v8f64(<8 x double> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 191 for instruction: %15 = call <16 x double> @llvm.sin.v16f64(<16 x double> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %16 = call <vscale x 1 x double> @llvm.sin.nxv1f64(<vscale x 1 x double> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %17 = call <vscale x 2 x double> @llvm.sin.nxv2f64(<vscale x 2 x double> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %18 = call <vscale x 4 x double> @llvm.sin.nxv4f64(<vscale x 4 x double> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %19 = call <vscale x 8 x double> @llvm.sin.nxv8f64(<vscale x 8 x double> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %1 = call bfloat @llvm.sin.bf16(bfloat undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %2 = call <2 x bfloat> @llvm.sin.v2bf16(<2 x bfloat> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 44 for
instruction: %3 = call <4 x bfloat> @llvm.sin.v4bf16(<4 x bfloat> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %4 = call <8 x bfloat> @llvm.sin.v8bf16(<8 x bfloat> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 191 for instruction: %5 = call <16 x bfloat> @llvm.sin.v16bf16(<16 x bfloat> undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %6 = call @llvm.sin.nxv1bf16( undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %7 = call @llvm.sin.nxv2bf16( undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %8 = call @llvm.sin.nxv4bf16( undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %9 = call @llvm.sin.nxv8bf16( undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %10 = call @llvm.sin.nxv16bf16( undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %11 = call float @llvm.sin.f32(float undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %12 = call <2 x float> @llvm.sin.v2f32(<2 x float> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %13 = call <4 x float> @llvm.sin.v4f32(<4 x float> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %14 = call <8 x float> @llvm.sin.v8f32(<8 x float> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 191 for instruction: %15 = call <16 x float> @llvm.sin.v16f32(<16 x float> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %16 = call @llvm.sin.nxv1f32( undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %17 = call @llvm.sin.nxv2f32( undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %18 = call @llvm.sin.nxv4f32( undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %19 = call @llvm.sin.nxv8f32( undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %20 = call @llvm.sin.nxv16f32( undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %21 = call double @llvm.sin.f64(double undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %22 = call <2 x double> @llvm.sin.v2f64(<2 x double> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %23 = call <4 x double> @llvm.sin.v4f64(<4 x double> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %24 = call <8 x double> @llvm.sin.v8f64(<8 x double> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 191 for instruction: %25 = call <16 x double> @llvm.sin.v16f64(<16 x double> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %26 = call @llvm.sin.nxv1f64( undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %27 = call @llvm.sin.nxv2f64( undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %28 = call @llvm.sin.nxv4f64( undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %29 = call @llvm.sin.nxv8f64( undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; + call bfloat @llvm.sin.bf16(bfloat undef) + call <2 x bfloat> @llvm.sin.v2bf16(<2 x bfloat> undef) + call <4 x bfloat> @llvm.sin.v4bf16(<4 x bfloat> undef) + call <8 x bfloat> @llvm.sin.v8bf16(<8 x bfloat> undef) + call <16 x bfloat> @llvm.sin.v16bf16(<16 x bfloat> undef) + call @llvm.sin.nvx1bf16( undef) + call @llvm.sin.nvx2bf16( undef) + call 
@llvm.sin.nvx4bf16( undef) + call @llvm.sin.nvx8bf16( undef) + call @llvm.sin.nvx16bf16( undef) call float @llvm.sin.f32(float undef) call <2 x float> @llvm.sin.v2f32(<2 x float> undef) call <4 x float> @llvm.sin.v4f32(<4 x float> undef) @@ -46,29 +67,86 @@ define void @sin() { ret void } +define void @sin_f16() { +; ZVFH-LABEL: 'sin_f16' +; ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %1 = call half @llvm.sin.f16(half undef) +; ZVFH-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %2 = call <2 x half> @llvm.sin.v2f16(<2 x half> undef) +; ZVFH-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %3 = call <4 x half> @llvm.sin.v4f16(<4 x half> undef) +; ZVFH-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %4 = call <8 x half> @llvm.sin.v8f16(<8 x half> undef) +; ZVFH-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %5 = call <16 x half> @llvm.sin.v16f16(<16 x half> undef) +; ZVFH-NEXT: Cost Model: Invalid cost for instruction: %6 = call @llvm.sin.nxv1f16( undef) +; ZVFH-NEXT: Cost Model: Invalid cost for instruction: %7 = call @llvm.sin.nxv2f16( undef) +; ZVFH-NEXT: Cost Model: Invalid cost for instruction: %8 = call @llvm.sin.nxv4f16( undef) +; ZVFH-NEXT: Cost Model: Invalid cost for instruction: %9 = call @llvm.sin.nxv8f16( undef) +; ZVFH-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; ZVFHMIN-LABEL: 'sin_f16' +; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %1 = call half @llvm.sin.f16(half undef) +; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %2 = call <2 x half> @llvm.sin.v2f16(<2 x half> undef) +; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %3 = call <4 x half> @llvm.sin.v4f16(<4 x half> undef) +; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %4 = call <8 x half> @llvm.sin.v8f16(<8 x half> undef) +; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 191 for instruction: %5 = call <16 x half> @llvm.sin.v16f16(<16 x half> undef) +; ZVFHMIN-NEXT: Cost Model: Invalid cost for instruction: %6 = call @llvm.sin.nxv1f16( undef) +; ZVFHMIN-NEXT: Cost Model: Invalid cost for instruction: %7 = call @llvm.sin.nxv2f16( undef) +; ZVFHMIN-NEXT: Cost Model: Invalid cost for instruction: %8 = call @llvm.sin.nxv4f16( undef) +; ZVFHMIN-NEXT: Cost Model: Invalid cost for instruction: %9 = call @llvm.sin.nxv8f16( undef) +; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + call half @llvm.sin.f16(half undef) + call <2 x half> @llvm.sin.v2f16(<2 x half> undef) + call <4 x half> @llvm.sin.v4f16(<4 x half> undef) + call <8 x half> @llvm.sin.v8f16(<8 x half> undef) + call <16 x half> @llvm.sin.v16f16(<16 x half> undef) + call @llvm.sin.nvx1f16( undef) + call @llvm.sin.nvx2f16( undef) + call @llvm.sin.nvx4f16( undef) + call @llvm.sin.nvx8f16( undef) + ret void +} + define void @cos() { ; CHECK-LABEL: 'cos' -; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %1 = call float @llvm.cos.f32(float undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %2 = call <2 x float> @llvm.cos.v2f32(<2 x float> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %3 = call <4 x float> @llvm.cos.v4f32(<4 x float> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %4 = call <8 x float> @llvm.cos.v8f32(<8 x float> undef) -; CHECK-NEXT: Cost Model: Found an estimated 
cost of 191 for instruction: %5 = call <16 x float> @llvm.cos.v16f32(<16 x float> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %6 = call @llvm.cos.nxv1f32( undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %7 = call @llvm.cos.nxv2f32( undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %8 = call @llvm.cos.nxv4f32( undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %9 = call @llvm.cos.nxv8f32( undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %10 = call @llvm.cos.nxv16f32( undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %11 = call double @llvm.cos.f64(double undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %12 = call <2 x double> @llvm.cos.v2f64(<2 x double> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %13 = call <4 x double> @llvm.cos.v4f64(<4 x double> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %14 = call <8 x double> @llvm.cos.v8f64(<8 x double> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 191 for instruction: %15 = call <16 x double> @llvm.cos.v16f64(<16 x double> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %16 = call @llvm.cos.nxv1f64( undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %17 = call @llvm.cos.nxv2f64( undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %18 = call @llvm.cos.nxv4f64( undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %19 = call @llvm.cos.nxv8f64( undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %1 = call bfloat @llvm.cos.bf16(bfloat undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %2 = call <2 x bfloat> @llvm.cos.v2bf16(<2 x bfloat> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %3 = call <4 x bfloat> @llvm.cos.v4bf16(<4 x bfloat> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %4 = call <8 x bfloat> @llvm.cos.v8bf16(<8 x bfloat> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 191 for instruction: %5 = call <16 x bfloat> @llvm.cos.v16bf16(<16 x bfloat> undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %6 = call @llvm.cos.nxv1bf16( undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %7 = call @llvm.cos.nxv2bf16( undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %8 = call @llvm.cos.nxv4bf16( undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %9 = call @llvm.cos.nxv8bf16( undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %10 = call @llvm.cos.nxv16bf16( undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %11 = call float @llvm.cos.f32(float undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %12 = call <2 x float> @llvm.cos.v2f32(<2 x float> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %13 = call <4 x float> @llvm.cos.v4f32(<4 x float> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %14 = call <8 x float> @llvm.cos.v8f32(<8 x float> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 191 for instruction: %15 = call <16 x float> @llvm.cos.v16f32(<16 x float> undef) +; CHECK-NEXT: Cost Model: Found an 
estimated cost of 2 for instruction: %16 = call <vscale x 1 x float> @llvm.cos.nxv1f32(<vscale x 1 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %17 = call <vscale x 2 x float> @llvm.cos.nxv2f32(<vscale x 2 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %18 = call <vscale x 4 x float> @llvm.cos.nxv4f32(<vscale x 4 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %19 = call <vscale x 8 x float> @llvm.cos.nxv8f32(<vscale x 8 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %20 = call <vscale x 16 x float> @llvm.cos.nxv16f32(<vscale x 16 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %21 = call double @llvm.cos.f64(double undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %22 = call <2 x double> @llvm.cos.v2f64(<2 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %23 = call <4 x double> @llvm.cos.v4f64(<4 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %24 = call <8 x double> @llvm.cos.v8f64(<8 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 191 for instruction: %25 = call <16 x double> @llvm.cos.v16f64(<16 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %26 = call <vscale x 1 x double> @llvm.cos.nxv1f64(<vscale x 1 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %27 = call <vscale x 2 x double> @llvm.cos.nxv2f64(<vscale x 2 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %28 = call <vscale x 4 x double> @llvm.cos.nxv4f64(<vscale x 4 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %29 = call <vscale x 8 x double> @llvm.cos.nxv8f64(<vscale x 8 x double> undef)
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
+  call bfloat @llvm.cos.bf16(bfloat undef)
+  call <2 x bfloat> @llvm.cos.v2bf16(<2 x bfloat> undef)
+  call <4 x bfloat> @llvm.cos.v4bf16(<4 x bfloat> undef)
+  call <8 x bfloat> @llvm.cos.v8bf16(<8 x bfloat> undef)
+  call <16 x bfloat> @llvm.cos.v16bf16(<16 x bfloat> undef)
+  call <vscale x 1 x bfloat> @llvm.cos.nvx1bf16(<vscale x 1 x bfloat> undef)
+  call <vscale x 2 x bfloat> @llvm.cos.nvx2bf16(<vscale x 2 x bfloat> undef)
+  call <vscale x 4 x bfloat> @llvm.cos.nvx4bf16(<vscale x 4 x bfloat> undef)
+  call <vscale x 8 x bfloat> @llvm.cos.nvx8bf16(<vscale x 8 x bfloat> undef)
+  call <vscale x 16 x bfloat> @llvm.cos.nvx16bf16(<vscale x 16 x bfloat> undef)
   call float @llvm.cos.f32(float undef)
   call <2 x float> @llvm.cos.v2f32(<2 x float> undef)
   call <4 x float> @llvm.cos.v4f32(<4 x float> undef)
@@ -91,29 +169,86 @@ define void @cos() {
   ret void
 }
 
+define void @cos_f16() {
+; ZVFH-LABEL: 'cos_f16'
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %1 = call half @llvm.cos.f16(half undef)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %2 = call <2 x half> @llvm.cos.v2f16(<2 x half> undef)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %3 = call <4 x half> @llvm.cos.v4f16(<4 x half> undef)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %4 = call <8 x half> @llvm.cos.v8f16(<8 x half> undef)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %5 = call <16 x half> @llvm.cos.v16f16(<16 x half> undef)
+; ZVFH-NEXT: Cost Model: Invalid cost for instruction: %6 = call <vscale x 1 x half> @llvm.cos.nxv1f16(<vscale x 1 x half> undef)
+; ZVFH-NEXT: Cost Model: Invalid cost for instruction: %7 = call <vscale x 2 x half> @llvm.cos.nxv2f16(<vscale x 2 x half> undef)
+; ZVFH-NEXT: Cost Model: Invalid cost for instruction: %8 = call <vscale x 4 x half> @llvm.cos.nxv4f16(<vscale x 4 x half> undef)
+; ZVFH-NEXT: Cost Model: Invalid cost for instruction: %9 = call <vscale x 8 x half> @llvm.cos.nxv8f16(<vscale x 8 x half> undef)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; ZVFHMIN-LABEL: 'cos_f16'
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %1 = call half @llvm.cos.f16(half undef)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %2 = call <2 x half> @llvm.cos.v2f16(<2 x half> undef)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %3 = call <4 x half> @llvm.cos.v4f16(<4 x half> undef)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %4 = call <8 x half> @llvm.cos.v8f16(<8 x half> undef)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 191 for instruction: %5 = call <16 x half> @llvm.cos.v16f16(<16 x half> undef)
+; ZVFHMIN-NEXT: Cost Model: Invalid cost for instruction: %6 = call <vscale x 1 x half> @llvm.cos.nxv1f16(<vscale x 1 x half> undef)
+; ZVFHMIN-NEXT: Cost Model: Invalid cost for instruction: %7 = call <vscale x 2 x half> @llvm.cos.nxv2f16(<vscale x 2 x half> undef)
+; ZVFHMIN-NEXT: Cost Model: Invalid cost for instruction: %8 = call <vscale x 4 x half> @llvm.cos.nxv4f16(<vscale x 4 x half> undef)
+; ZVFHMIN-NEXT: Cost Model: Invalid cost for instruction: %9 = call <vscale x 8 x half> @llvm.cos.nxv8f16(<vscale x 8 x half> undef)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  call half @llvm.cos.f16(half undef)
+  call <2 x half> @llvm.cos.v2f16(<2 x half> undef)
+  call <4 x half> @llvm.cos.v4f16(<4 x half> undef)
+  call <8 x half> @llvm.cos.v8f16(<8 x half> undef)
+  call <16 x half> @llvm.cos.v16f16(<16 x half> undef)
+  call <vscale x 1 x half> @llvm.cos.nvx1f16(<vscale x 1 x half> undef)
+  call <vscale x 2 x half> @llvm.cos.nvx2f16(<vscale x 2 x half> undef)
+  call <vscale x 4 x half> @llvm.cos.nvx4f16(<vscale x 4 x half> undef)
+  call <vscale x 8 x half> @llvm.cos.nvx8f16(<vscale x 8 x half> undef)
+  ret void
+}
+
 define void @exp() {
 ; CHECK-LABEL: 'exp'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %1 = call float @llvm.exp.f32(float undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %2 = call <2 x float> @llvm.exp.v2f32(<2 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %3 = call <4 x float> @llvm.exp.v4f32(<4 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %4 = call <8 x float> @llvm.exp.v8f32(<8 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 191 for instruction: %5 = call <16 x float> @llvm.exp.v16f32(<16 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %6 = call <vscale x 1 x float> @llvm.exp.nxv1f32(<vscale x 1 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %7 = call <vscale x 2 x float> @llvm.exp.nxv2f32(<vscale x 2 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %8 = call <vscale x 4 x float> @llvm.exp.nxv4f32(<vscale x 4 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %9 = call <vscale x 8 x float> @llvm.exp.nxv8f32(<vscale x 8 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %10 = call <vscale x 16 x float> @llvm.exp.nxv16f32(<vscale x 16 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %11 = call double @llvm.exp.f64(double undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %12 = call <2 x double> @llvm.exp.v2f64(<2 x double> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %13 = call <4 x double> @llvm.exp.v4f64(<4 x double> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %14 = call <8 x double> @llvm.exp.v8f64(<8 x double> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 191 for instruction: %15 = call <16 x double> @llvm.exp.v16f64(<16 x double> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %16 = call <vscale x 1 x double> @llvm.exp.nxv1f64(<vscale x 1 x double> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %17 = call <vscale x 2 x double> @llvm.exp.nxv2f64(<vscale x 2 x double> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %18 = call <vscale x 4 x double> @llvm.exp.nxv4f64(<vscale x 4 x double> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %19 = call <vscale x 8 x double> @llvm.exp.nxv8f64(<vscale x 8 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %1 = call bfloat @llvm.exp.bf16(bfloat undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %2 = call <2 x bfloat> @llvm.exp.v2bf16(<2 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %3 = call <4 x bfloat> @llvm.exp.v4bf16(<4 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %4 = call <8 x bfloat> @llvm.exp.v8bf16(<8 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 191 for instruction: %5 = call <16 x bfloat> @llvm.exp.v16bf16(<16 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %6 = call <vscale x 1 x bfloat> @llvm.exp.nxv1bf16(<vscale x 1 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %7 = call <vscale x 2 x bfloat> @llvm.exp.nxv2bf16(<vscale x 2 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %8 = call <vscale x 4 x bfloat> @llvm.exp.nxv4bf16(<vscale x 4 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %9 = call <vscale x 8 x bfloat> @llvm.exp.nxv8bf16(<vscale x 8 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %10 = call <vscale x 16 x bfloat> @llvm.exp.nxv16bf16(<vscale x 16 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %11 = call float @llvm.exp.f32(float undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %12 = call <2 x float> @llvm.exp.v2f32(<2 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %13 = call <4 x float> @llvm.exp.v4f32(<4 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %14 = call <8 x float> @llvm.exp.v8f32(<8 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 191 for instruction: %15 = call <16 x float> @llvm.exp.v16f32(<16 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %16 = call <vscale x 1 x float> @llvm.exp.nxv1f32(<vscale x 1 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %17 = call <vscale x 2 x float> @llvm.exp.nxv2f32(<vscale x 2 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %18 = call <vscale x 4 x float> @llvm.exp.nxv4f32(<vscale x 4 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %19 = call <vscale x 8 x float> @llvm.exp.nxv8f32(<vscale x 8 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %20 = call <vscale x 16 x float> @llvm.exp.nxv16f32(<vscale x 16 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %21 = call double @llvm.exp.f64(double undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %22 = call <2 x double> @llvm.exp.v2f64(<2 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %23 = call <4 x double> @llvm.exp.v4f64(<4 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %24 = call <8 x double> @llvm.exp.v8f64(<8 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 191 for instruction: %25 = call <16 x double> @llvm.exp.v16f64(<16 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %26 = call <vscale x 1 x double> @llvm.exp.nxv1f64(<vscale x 1 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %27 = call <vscale x 2 x double> @llvm.exp.nxv2f64(<vscale x 2 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %28 = call <vscale x 4 x double> @llvm.exp.nxv4f64(<vscale x 4 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %29 = call <vscale x 8 x double> @llvm.exp.nxv8f64(<vscale x 8 x double> undef)
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
+  call bfloat @llvm.exp.bf16(bfloat undef)
+  call <2 x bfloat> @llvm.exp.v2bf16(<2 x bfloat> undef)
+  call <4 x bfloat> @llvm.exp.v4bf16(<4 x bfloat> undef)
+  call <8 x bfloat> @llvm.exp.v8bf16(<8 x bfloat> undef)
+  call <16 x bfloat> @llvm.exp.v16bf16(<16 x bfloat> undef)
+  call <vscale x 1 x bfloat> @llvm.exp.nvx1bf16(<vscale x 1 x bfloat> undef)
+  call <vscale x 2 x bfloat> @llvm.exp.nvx2bf16(<vscale x 2 x bfloat> undef)
+  call <vscale x 4 x bfloat> @llvm.exp.nvx4bf16(<vscale x 4 x bfloat> undef)
+  call <vscale x 8 x bfloat> @llvm.exp.nvx8bf16(<vscale x 8 x bfloat> undef)
+  call <vscale x 16 x bfloat> @llvm.exp.nvx16bf16(<vscale x 16 x bfloat> undef)
   call float @llvm.exp.f32(float undef)
   call <2 x float> @llvm.exp.v2f32(<2 x float> undef)
   call <4 x float> @llvm.exp.v4f32(<4 x float> undef)
@@ -136,29 +271,86 @@ define void @exp() {
   ret void
 }
 
+define void @exp_f16() {
+; ZVFH-LABEL: 'exp_f16'
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %1 = call half @llvm.exp.f16(half undef)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %2 = call <2 x half> @llvm.exp.v2f16(<2 x half> undef)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %3 = call <4 x half> @llvm.exp.v4f16(<4 x half> undef)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %4 = call <8 x half> @llvm.exp.v8f16(<8 x half> undef)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %5 = call <16 x half> @llvm.exp.v16f16(<16 x half> undef)
+; ZVFH-NEXT: Cost Model: Invalid cost for instruction: %6 = call <vscale x 1 x half> @llvm.exp.nxv1f16(<vscale x 1 x half> undef)
+; ZVFH-NEXT: Cost Model: Invalid cost for instruction: %7 = call <vscale x 2 x half> @llvm.exp.nxv2f16(<vscale x 2 x half> undef)
+; ZVFH-NEXT: Cost Model: Invalid cost for instruction: %8 = call <vscale x 4 x half> @llvm.exp.nxv4f16(<vscale x 4 x half> undef)
+; ZVFH-NEXT: Cost Model: Invalid cost for instruction: %9 = call <vscale x 8 x half> @llvm.exp.nxv8f16(<vscale x 8 x half> undef)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; ZVFHMIN-LABEL: 'exp_f16'
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %1 = call half @llvm.exp.f16(half undef)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %2 = call <2 x half> @llvm.exp.v2f16(<2 x half> undef)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %3 = call <4 x half> @llvm.exp.v4f16(<4 x half> undef)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %4 = call <8 x half> @llvm.exp.v8f16(<8 x half> undef)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 191 for instruction: %5 = call <16 x half> @llvm.exp.v16f16(<16 x half> undef)
+; ZVFHMIN-NEXT: Cost Model: Invalid cost for instruction: %6 = call <vscale x 1 x half> @llvm.exp.nxv1f16(<vscale x 1 x half> undef)
+; ZVFHMIN-NEXT: Cost Model: Invalid cost for instruction: %7 = call <vscale x 2 x half> @llvm.exp.nxv2f16(<vscale x 2 x half> undef)
+; ZVFHMIN-NEXT: Cost Model: Invalid cost for instruction: %8 = call <vscale x 4 x half> @llvm.exp.nxv4f16(<vscale x 4 x half> undef)
+; ZVFHMIN-NEXT: Cost Model: Invalid cost for instruction: %9 = call <vscale x 8 x half> @llvm.exp.nxv8f16(<vscale x 8 x half> undef)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  call half @llvm.exp.f16(half undef)
+  call <2 x half> @llvm.exp.v2f16(<2 x half> undef)
+  call <4 x half> @llvm.exp.v4f16(<4 x half> undef)
+  call <8 x half> @llvm.exp.v8f16(<8 x half> undef)
+  call <16 x half> @llvm.exp.v16f16(<16 x half> undef)
+  call <vscale x 1 x half> @llvm.exp.nvx1f16(<vscale x 1 x half> undef)
+  call <vscale x 2 x half> @llvm.exp.nvx2f16(<vscale x 2 x half> undef)
+  call <vscale x 4 x half> @llvm.exp.nvx4f16(<vscale x 4 x half> undef)
+  call <vscale x 8 x half> @llvm.exp.nvx8f16(<vscale x 8 x half> undef)
+  ret void
+}
+
 define void @exp2() {
 ; CHECK-LABEL: 'exp2'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %1 = call float @llvm.exp2.f32(float undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %2 = call <2 x float> @llvm.exp2.v2f32(<2 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %3 = call <4 x float> @llvm.exp2.v4f32(<4 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %4 = call <8 x float> @llvm.exp2.v8f32(<8 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 191 for instruction: %5 = call <16 x float> @llvm.exp2.v16f32(<16 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %6 = call <vscale x 1 x float> @llvm.exp2.nxv1f32(<vscale x 1 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %7 = call <vscale x 2 x float> @llvm.exp2.nxv2f32(<vscale x 2 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %8 = call <vscale x 4 x float> @llvm.exp2.nxv4f32(<vscale x 4 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %9 = call <vscale x 8 x float> @llvm.exp2.nxv8f32(<vscale x 8 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %10 = call <vscale x 16 x float> @llvm.exp2.nxv16f32(<vscale x 16 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %11 = call double @llvm.exp2.f64(double undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %12 = call <2 x double> @llvm.exp2.v2f64(<2 x double> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %13 = call <4 x double> @llvm.exp2.v4f64(<4 x double> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %14 = call <8 x double> @llvm.exp2.v8f64(<8 x double> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 191 for instruction: %15 = call <16 x double> @llvm.exp2.v16f64(<16 x double> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %16 = call <vscale x 1 x double> @llvm.exp2.nxv1f64(<vscale x 1 x double> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %17 = call <vscale x 2 x double> @llvm.exp2.nxv2f64(<vscale x 2 x double> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %18 = call <vscale x 4 x double> @llvm.exp2.nxv4f64(<vscale x 4 x double> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %19 = call <vscale x 8 x double> @llvm.exp2.nxv8f64(<vscale x 8 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %1 = call bfloat @llvm.exp2.bf16(bfloat undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %2 = call <2 x bfloat> @llvm.exp2.v2bf16(<2 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %3 = call <4 x bfloat> @llvm.exp2.v4bf16(<4 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %4 = call <8 x bfloat> @llvm.exp2.v8bf16(<8 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 191 for instruction: %5 = call <16 x bfloat> @llvm.exp2.v16bf16(<16 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %6 = call <vscale x 1 x bfloat> @llvm.exp2.nxv1bf16(<vscale x 1 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %7 = call <vscale x 2 x bfloat> @llvm.exp2.nxv2bf16(<vscale x 2 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %8 = call <vscale x 4 x bfloat> @llvm.exp2.nxv4bf16(<vscale x 4 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %9 = call <vscale x 8 x bfloat> @llvm.exp2.nxv8bf16(<vscale x 8 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %10 = call <vscale x 16 x bfloat> @llvm.exp2.nxv16bf16(<vscale x 16 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %11 = call float @llvm.exp2.f32(float undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %12 = call <2 x float> @llvm.exp2.v2f32(<2 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %13 = call <4 x float> @llvm.exp2.v4f32(<4 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %14 = call <8 x float> @llvm.exp2.v8f32(<8 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 191 for instruction: %15 = call <16 x float> @llvm.exp2.v16f32(<16 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %16 = call <vscale x 1 x float> @llvm.exp2.nxv1f32(<vscale x 1 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %17 = call <vscale x 2 x float> @llvm.exp2.nxv2f32(<vscale x 2 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %18 = call <vscale x 4 x float> @llvm.exp2.nxv4f32(<vscale x 4 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %19 = call <vscale x 8 x float> @llvm.exp2.nxv8f32(<vscale x 8 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %20 = call <vscale x 16 x float> @llvm.exp2.nxv16f32(<vscale x 16 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %21 = call double @llvm.exp2.f64(double undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %22 = call <2 x double> @llvm.exp2.v2f64(<2 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %23 = call <4 x double> @llvm.exp2.v4f64(<4 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %24 = call <8 x double> @llvm.exp2.v8f64(<8 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 191 for instruction: %25 = call <16 x double> @llvm.exp2.v16f64(<16 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %26 = call <vscale x 1 x double> @llvm.exp2.nxv1f64(<vscale x 1 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %27 = call <vscale x 2 x double> @llvm.exp2.nxv2f64(<vscale x 2 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %28 = call <vscale x 4 x double> @llvm.exp2.nxv4f64(<vscale x 4 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %29 = call <vscale x 8 x double> @llvm.exp2.nxv8f64(<vscale x 8 x double> undef)
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
+  call bfloat @llvm.exp2.bf16(bfloat undef)
+  call <2 x bfloat> @llvm.exp2.v2bf16(<2 x bfloat> undef)
+  call <4 x bfloat> @llvm.exp2.v4bf16(<4 x bfloat> undef)
+  call <8 x bfloat> @llvm.exp2.v8bf16(<8 x bfloat> undef)
+  call <16 x bfloat> @llvm.exp2.v16bf16(<16 x bfloat> undef)
+  call <vscale x 1 x bfloat> @llvm.exp2.nvx1bf16(<vscale x 1 x bfloat> undef)
+  call <vscale x 2 x bfloat> @llvm.exp2.nvx2bf16(<vscale x 2 x bfloat> undef)
+  call <vscale x 4 x bfloat> @llvm.exp2.nvx4bf16(<vscale x 4 x bfloat> undef)
+  call <vscale x 8 x bfloat> @llvm.exp2.nvx8bf16(<vscale x 8 x bfloat> undef)
+  call <vscale x 16 x bfloat> @llvm.exp2.nvx16bf16(<vscale x 16 x bfloat> undef)
   call float @llvm.exp2.f32(float undef)
   call <2 x float> @llvm.exp2.v2f32(<2 x float> undef)
   call <4 x float> @llvm.exp2.v4f32(<4 x float> undef)
@@ -181,29 +373,86 @@ define void @exp2() {
   ret void
 }
 
+define void @exp2_f16() {
+; ZVFH-LABEL: 'exp2_f16'
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %1 = call half @llvm.exp2.f16(half undef)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %2 = call <2 x half> @llvm.exp2.v2f16(<2 x half> undef)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %3 = call <4 x half> @llvm.exp2.v4f16(<4 x half> undef)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %4 = call <8 x half> @llvm.exp2.v8f16(<8 x half> undef)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %5 = call <16 x half> @llvm.exp2.v16f16(<16 x half> undef)
+; ZVFH-NEXT: Cost Model: Invalid cost for instruction: %6 = call <vscale x 1 x half> @llvm.exp2.nxv1f16(<vscale x 1 x half> undef)
+; ZVFH-NEXT: Cost Model: Invalid cost for instruction: %7 = call <vscale x 2 x half> @llvm.exp2.nxv2f16(<vscale x 2 x half> undef)
+; ZVFH-NEXT: Cost Model: Invalid cost for instruction: %8 = call <vscale x 4 x half> @llvm.exp2.nxv4f16(<vscale x 4 x half> undef)
+; ZVFH-NEXT: Cost Model: Invalid cost for instruction: %9 = call <vscale x 8 x half> @llvm.exp2.nxv8f16(<vscale x 8 x half> undef)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; ZVFHMIN-LABEL: 'exp2_f16'
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %1 = call half @llvm.exp2.f16(half undef)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %2 = call <2 x half> @llvm.exp2.v2f16(<2 x half> undef)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %3 = call <4 x half> @llvm.exp2.v4f16(<4 x half> undef)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %4 = call <8 x half> @llvm.exp2.v8f16(<8 x half> undef)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 191 for instruction: %5 = call <16 x half> @llvm.exp2.v16f16(<16 x half> undef)
+; ZVFHMIN-NEXT: Cost Model: Invalid cost for instruction: %6 = call <vscale x 1 x half> @llvm.exp2.nxv1f16(<vscale x 1 x half> undef)
+; ZVFHMIN-NEXT: Cost Model: Invalid cost for instruction: %7 = call <vscale x 2 x half> @llvm.exp2.nxv2f16(<vscale x 2 x half> undef)
+; ZVFHMIN-NEXT: Cost Model: Invalid cost for instruction: %8 = call <vscale x 4 x half> @llvm.exp2.nxv4f16(<vscale x 4 x half> undef)
+; ZVFHMIN-NEXT: Cost Model: Invalid cost for instruction: %9 = call <vscale x 8 x half> @llvm.exp2.nxv8f16(<vscale x 8 x half> undef)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  call half @llvm.exp2.f16(half undef)
+  call <2 x half> @llvm.exp2.v2f16(<2 x half> undef)
+  call <4 x half> @llvm.exp2.v4f16(<4 x half> undef)
+  call <8 x half> @llvm.exp2.v8f16(<8 x half> undef)
+  call <16 x half> @llvm.exp2.v16f16(<16 x half> undef)
+  call <vscale x 1 x half> @llvm.exp2.nvx1f16(<vscale x 1 x half> undef)
+  call <vscale x 2 x half> @llvm.exp2.nvx2f16(<vscale x 2 x half> undef)
+  call <vscale x 4 x half> @llvm.exp2.nvx4f16(<vscale x 4 x half> undef)
+  call <vscale x 8 x half> @llvm.exp2.nvx8f16(<vscale x 8 x half> undef)
+  ret void
+}
+
 define void @log() {
 ; CHECK-LABEL: 'log'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %1 = call float @llvm.log.f32(float undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %2 = call <2 x float> @llvm.log.v2f32(<2 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %3 = call <4 x float> @llvm.log.v4f32(<4 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %4 = call <8 x float> @llvm.log.v8f32(<8 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 191 for instruction: %5 = call <16 x float> @llvm.log.v16f32(<16 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %6 = call <vscale x 1 x float> @llvm.log.nxv1f32(<vscale x 1 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %7 = call <vscale x 2 x float> @llvm.log.nxv2f32(<vscale x 2 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %8 = call <vscale x 4 x float> @llvm.log.nxv4f32(<vscale x 4 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %9 = call <vscale x 8 x float> @llvm.log.nxv8f32(<vscale x 8 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %10 = call <vscale x 16 x float> @llvm.log.nxv16f32(<vscale x 16 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %11 = call double @llvm.log.f64(double undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %12 = call <2 x double> @llvm.log.v2f64(<2 x double> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %13 = call <4 x double> @llvm.log.v4f64(<4 x double> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %14 = call <8 x double> @llvm.log.v8f64(<8 x double> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 191 for instruction: %15 = call <16 x double> @llvm.log.v16f64(<16 x double> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %16 = call <vscale x 1 x double> @llvm.log.nxv1f64(<vscale x 1 x double> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %17 = call <vscale x 2 x double> @llvm.log.nxv2f64(<vscale x 2 x double> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %18 = call <vscale x 4 x double> @llvm.log.nxv4f64(<vscale x 4 x double> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %19 = call <vscale x 8 x double> @llvm.log.nxv8f64(<vscale x 8 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %1 = call bfloat @llvm.log.bf16(bfloat undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %2 = call <2 x bfloat> @llvm.log.v2bf16(<2 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %3 = call <4 x bfloat> @llvm.log.v4bf16(<4 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %4 = call <8 x bfloat> @llvm.log.v8bf16(<8 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 191 for instruction: %5 = call <16 x bfloat> @llvm.log.v16bf16(<16 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %6 = call <vscale x 1 x bfloat> @llvm.log.nxv1bf16(<vscale x 1 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %7 = call <vscale x 2 x bfloat> @llvm.log.nxv2bf16(<vscale x 2 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %8 = call <vscale x 4 x bfloat> @llvm.log.nxv4bf16(<vscale x 4 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %9 = call <vscale x 8 x bfloat> @llvm.log.nxv8bf16(<vscale x 8 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %10 = call <vscale x 16 x bfloat> @llvm.log.nxv16bf16(<vscale x 16 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %11 = call float @llvm.log.f32(float undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %12 = call <2 x float> @llvm.log.v2f32(<2 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %13 = call <4 x float> @llvm.log.v4f32(<4 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %14 = call <8 x float> @llvm.log.v8f32(<8 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 191 for instruction: %15 = call <16 x float> @llvm.log.v16f32(<16 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %16 = call <vscale x 1 x float> @llvm.log.nxv1f32(<vscale x 1 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %17 = call <vscale x 2 x float> @llvm.log.nxv2f32(<vscale x 2 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %18 = call <vscale x 4 x float> @llvm.log.nxv4f32(<vscale x 4 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %19 = call <vscale x 8 x float> @llvm.log.nxv8f32(<vscale x 8 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %20 = call <vscale x 16 x float> @llvm.log.nxv16f32(<vscale x 16 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %21 = call double @llvm.log.f64(double undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %22 = call <2 x double> @llvm.log.v2f64(<2 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %23 = call <4 x double> @llvm.log.v4f64(<4 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %24 = call <8 x double> @llvm.log.v8f64(<8 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 191 for instruction: %25 = call <16 x double> @llvm.log.v16f64(<16 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %26 = call <vscale x 1 x double> @llvm.log.nxv1f64(<vscale x 1 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %27 = call <vscale x 2 x double> @llvm.log.nxv2f64(<vscale x 2 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %28 = call <vscale x 4 x double> @llvm.log.nxv4f64(<vscale x 4 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %29 = call <vscale x 8 x double> @llvm.log.nxv8f64(<vscale x 8 x double> undef)
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
+  call bfloat @llvm.log.bf16(bfloat undef)
+  call <2 x bfloat> @llvm.log.v2bf16(<2 x bfloat> undef)
+  call <4 x bfloat> @llvm.log.v4bf16(<4 x bfloat> undef)
+  call <8 x bfloat> @llvm.log.v8bf16(<8 x bfloat> undef)
+  call <16 x bfloat> @llvm.log.v16bf16(<16 x bfloat> undef)
+  call <vscale x 1 x bfloat> @llvm.log.nvx1bf16(<vscale x 1 x bfloat> undef)
+  call <vscale x 2 x bfloat> @llvm.log.nvx2bf16(<vscale x 2 x bfloat> undef)
+  call <vscale x 4 x bfloat> @llvm.log.nvx4bf16(<vscale x 4 x bfloat> undef)
+  call <vscale x 8 x bfloat> @llvm.log.nvx8bf16(<vscale x 8 x bfloat> undef)
+  call <vscale x 16 x bfloat> @llvm.log.nvx16bf16(<vscale x 16 x bfloat> undef)
   call float @llvm.log.f32(float undef)
   call <2 x float> @llvm.log.v2f32(<2 x float> undef)
   call <4 x float> @llvm.log.v4f32(<4 x float> undef)
@@ -226,29 +475,86 @@ define void @log() {
   ret void
 }
 
+define void @log_f16() {
+; ZVFH-LABEL: 'log_f16'
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %1 = call half @llvm.log.f16(half undef)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %2 = call <2 x half> @llvm.log.v2f16(<2 x half> undef)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %3 = call <4 x half> @llvm.log.v4f16(<4 x half> undef)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %4 = call <8 x half> @llvm.log.v8f16(<8 x half> undef)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %5 = call <16 x half> @llvm.log.v16f16(<16 x half> undef)
+; ZVFH-NEXT: Cost Model: Invalid cost for instruction: %6 = call <vscale x 1 x half> @llvm.log.nxv1f16(<vscale x 1 x half> undef)
+; ZVFH-NEXT: Cost Model: Invalid cost for instruction: %7 = call <vscale x 2 x half> @llvm.log.nxv2f16(<vscale x 2 x half> undef)
+; ZVFH-NEXT: Cost Model: Invalid cost for instruction: %8 = call <vscale x 4 x half> @llvm.log.nxv4f16(<vscale x 4 x half> undef)
+; ZVFH-NEXT: Cost Model: Invalid cost for instruction: %9 = call <vscale x 8 x half> @llvm.log.nxv8f16(<vscale x 8 x half> undef)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; ZVFHMIN-LABEL: 'log_f16'
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %1 = call half @llvm.log.f16(half undef)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %2 = call <2 x half> @llvm.log.v2f16(<2 x half> undef)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %3 = call <4 x half> @llvm.log.v4f16(<4 x half> undef)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %4 = call <8 x half> @llvm.log.v8f16(<8 x half> undef)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 191 for instruction: %5 = call <16 x half> @llvm.log.v16f16(<16 x half> undef)
+; ZVFHMIN-NEXT: Cost Model: Invalid cost for instruction: %6 = call <vscale x 1 x half> @llvm.log.nxv1f16(<vscale x 1 x half> undef)
+; ZVFHMIN-NEXT: Cost Model: Invalid cost for instruction: %7 = call <vscale x 2 x half> @llvm.log.nxv2f16(<vscale x 2 x half> undef)
+; ZVFHMIN-NEXT: Cost Model: Invalid cost for instruction: %8 = call <vscale x 4 x half> @llvm.log.nxv4f16(<vscale x 4 x half> undef)
+; ZVFHMIN-NEXT: Cost Model: Invalid cost for instruction: %9 = call <vscale x 8 x half> @llvm.log.nxv8f16(<vscale x 8 x half> undef)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  call half @llvm.log.f16(half undef)
+  call <2 x half> @llvm.log.v2f16(<2 x half> undef)
+  call <4 x half> @llvm.log.v4f16(<4 x half> undef)
+  call <8 x half> @llvm.log.v8f16(<8 x half> undef)
+  call <16 x half> @llvm.log.v16f16(<16 x half> undef)
+  call <vscale x 1 x half> @llvm.log.nvx1f16(<vscale x 1 x half> undef)
+  call <vscale x 2 x half> @llvm.log.nvx2f16(<vscale x 2 x half> undef)
+  call <vscale x 4 x half> @llvm.log.nvx4f16(<vscale x 4 x half> undef)
+  call <vscale x 8 x half> @llvm.log.nvx8f16(<vscale x 8 x half> undef)
+  ret void
+}
+
 define void @log10() {
 ; CHECK-LABEL: 'log10'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %1 = call float @llvm.log10.f32(float undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %2 = call <2 x float> @llvm.log10.v2f32(<2 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %3 = call <4 x float> @llvm.log10.v4f32(<4 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %4 = call <8 x float> @llvm.log10.v8f32(<8 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 191 for instruction: %5 = call <16 x float> @llvm.log10.v16f32(<16 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %6 = call <vscale x 1 x float> @llvm.log10.nxv1f32(<vscale x 1 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %7 = call <vscale x 2 x float> @llvm.log10.nxv2f32(<vscale x 2 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %8 = call <vscale x 4 x float> @llvm.log10.nxv4f32(<vscale x 4 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %9 = call <vscale x 8 x float> @llvm.log10.nxv8f32(<vscale x 8 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %10 = call <vscale x 16 x float> @llvm.log10.nxv16f32(<vscale x 16 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %11 = call double @llvm.log10.f64(double undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %12 = call <2 x double> @llvm.log10.v2f64(<2 x double> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %13 = call <4 x double> @llvm.log10.v4f64(<4 x double> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %14 = call <8 x double> @llvm.log10.v8f64(<8 x double> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 191 for instruction: %15 = call <16 x double> @llvm.log10.v16f64(<16 x double> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %16 = call <vscale x 1 x double> @llvm.log10.nxv1f64(<vscale x 1 x double> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %17 = call <vscale x 2 x double> @llvm.log10.nxv2f64(<vscale x 2 x double> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %18 = call <vscale x 4 x double> @llvm.log10.nxv4f64(<vscale x 4 x double> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %19 = call <vscale x 8 x double> @llvm.log10.nxv8f64(<vscale x 8 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %1 = call bfloat @llvm.log10.bf16(bfloat undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %2 = call <2 x bfloat> @llvm.log10.v2bf16(<2 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %3 = call <4 x bfloat> @llvm.log10.v4bf16(<4 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %4 = call <8 x bfloat> @llvm.log10.v8bf16(<8 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 191 for instruction: %5 = call <16 x bfloat> @llvm.log10.v16bf16(<16 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %6 = call <vscale x 1 x bfloat> @llvm.log10.nxv1bf16(<vscale x 1 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %7 = call <vscale x 2 x bfloat> @llvm.log10.nxv2bf16(<vscale x 2 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %8 = call <vscale x 4 x bfloat> @llvm.log10.nxv4bf16(<vscale x 4 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %9 = call <vscale x 8 x bfloat> @llvm.log10.nxv8bf16(<vscale x 8 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %10 = call <vscale x 16 x bfloat> @llvm.log10.nxv16bf16(<vscale x 16 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %11 = call float @llvm.log10.f32(float undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %12 = call <2 x float> @llvm.log10.v2f32(<2 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %13 = call <4 x float> @llvm.log10.v4f32(<4 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %14 = call <8 x float> @llvm.log10.v8f32(<8 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 191 for instruction: %15 = call <16 x float> @llvm.log10.v16f32(<16 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %16 = call <vscale x 1 x float> @llvm.log10.nxv1f32(<vscale x 1 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %17 = call <vscale x 2 x float> @llvm.log10.nxv2f32(<vscale x 2 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %18 = call <vscale x 4 x float> @llvm.log10.nxv4f32(<vscale x 4 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %19 = call <vscale x 8 x float> @llvm.log10.nxv8f32(<vscale x 8 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %20 = call <vscale x 16 x float> @llvm.log10.nxv16f32(<vscale x 16 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %21 = call double @llvm.log10.f64(double undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %22 = call <2 x double> @llvm.log10.v2f64(<2 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %23 = call <4 x double> @llvm.log10.v4f64(<4 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %24 = call <8 x double> @llvm.log10.v8f64(<8 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 191 for instruction: %25 = call <16 x double> @llvm.log10.v16f64(<16 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %26 = call <vscale x 1 x double> @llvm.log10.nxv1f64(<vscale x 1 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %27 = call <vscale x 2 x double> @llvm.log10.nxv2f64(<vscale x 2 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %28 = call <vscale x 4 x double> @llvm.log10.nxv4f64(<vscale x 4 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %29 = call <vscale x 8 x double> @llvm.log10.nxv8f64(<vscale x 8 x double> undef)
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
+  call bfloat @llvm.log10.bf16(bfloat undef)
+  call <2 x bfloat> @llvm.log10.v2bf16(<2 x bfloat> undef)
+  call <4 x bfloat> @llvm.log10.v4bf16(<4 x bfloat> undef)
+  call <8 x bfloat> @llvm.log10.v8bf16(<8 x bfloat> undef)
+  call <16 x bfloat> @llvm.log10.v16bf16(<16 x bfloat> undef)
+  call <vscale x 1 x bfloat> @llvm.log10.nvx1bf16(<vscale x 1 x bfloat> undef)
+  call <vscale x 2 x bfloat> @llvm.log10.nvx2bf16(<vscale x 2 x bfloat> undef)
+  call <vscale x 4 x bfloat> @llvm.log10.nvx4bf16(<vscale x 4 x bfloat> undef)
+  call <vscale x 8 x bfloat> @llvm.log10.nvx8bf16(<vscale x 8 x bfloat> undef)
+  call <vscale x 16 x bfloat> @llvm.log10.nvx16bf16(<vscale x 16 x bfloat> undef)
   call float @llvm.log10.f32(float undef)
   call <2 x float> @llvm.log10.v2f32(<2 x float> undef)
   call <4 x float> @llvm.log10.v4f32(<4 x float> undef)
@@ -271,29 +577,86 @@ define void @log10() {
   ret void
 }
 
+define void @log10_f16() {
+; ZVFH-LABEL: 'log10_f16'
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %1 = call half @llvm.log10.f16(half undef)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %2 = call <2 x half> @llvm.log10.v2f16(<2 x half> undef)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %3 = call <4 x half> @llvm.log10.v4f16(<4 x half> undef)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %4 = call <8 x half> @llvm.log10.v8f16(<8 x half> undef)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %5 = call <16 x half> @llvm.log10.v16f16(<16 x half> undef)
+; ZVFH-NEXT: Cost Model: Invalid cost for instruction: %6 = call <vscale x 1 x half> @llvm.log10.nxv1f16(<vscale x 1 x half> undef)
+; ZVFH-NEXT: Cost Model: Invalid cost for instruction: %7 = call <vscale x 2 x half> @llvm.log10.nxv2f16(<vscale x 2 x half> undef)
+; ZVFH-NEXT: Cost Model: Invalid cost for instruction: %8 = call <vscale x 4 x half> @llvm.log10.nxv4f16(<vscale x 4 x half> undef)
+; ZVFH-NEXT: Cost Model: Invalid cost for instruction: %9 = call <vscale x 8 x half> @llvm.log10.nxv8f16(<vscale x 8 x half> undef)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; ZVFHMIN-LABEL: 'log10_f16'
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %1 = call half @llvm.log10.f16(half undef)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %2 = call <2 x half> @llvm.log10.v2f16(<2 x half> undef)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %3 = call <4 x half> @llvm.log10.v4f16(<4 x half> undef)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %4 = call <8 x half> @llvm.log10.v8f16(<8 x half> undef)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 191 for instruction: %5 = call <16 x half> @llvm.log10.v16f16(<16 x half> undef)
+; ZVFHMIN-NEXT: Cost Model: Invalid cost for instruction: %6 = call <vscale x 1 x half> @llvm.log10.nxv1f16(<vscale x 1 x half> undef)
+; ZVFHMIN-NEXT: Cost Model: Invalid cost for instruction: %7 = call <vscale x 2 x half> @llvm.log10.nxv2f16(<vscale x 2 x half> undef)
+; ZVFHMIN-NEXT: Cost Model: Invalid cost for instruction: %8 = call <vscale x 4 x half> @llvm.log10.nxv4f16(<vscale x 4 x half> undef)
+; ZVFHMIN-NEXT: Cost Model: Invalid cost for instruction: %9 = call <vscale x 8 x half> @llvm.log10.nxv8f16(<vscale x 8 x half> undef)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  call half @llvm.log10.f16(half undef)
+  call <2 x half> @llvm.log10.v2f16(<2 x half> undef)
+  call <4 x half> @llvm.log10.v4f16(<4 x half> undef)
+  call <8 x half> @llvm.log10.v8f16(<8 x half> undef)
+  call <16 x half> @llvm.log10.v16f16(<16 x half> undef)
+  call <vscale x 1 x half> @llvm.log10.nvx1f16(<vscale x 1 x half> undef)
+  call <vscale x 2 x half> @llvm.log10.nvx2f16(<vscale x 2 x half> undef)
+  call <vscale x 4 x half> @llvm.log10.nvx4f16(<vscale x 4 x half> undef)
+  call <vscale x 8 x half> @llvm.log10.nvx8f16(<vscale x 8 x half> undef)
+  ret void
+}
+
 define void @log2() {
 ; CHECK-LABEL: 'log2'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %1 = call float @llvm.log2.f32(float undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %2 = call <2 x float> @llvm.log2.v2f32(<2 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %3 = call <4 x float> @llvm.log2.v4f32(<4 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %4 = call <8 x float> @llvm.log2.v8f32(<8 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 191 for instruction: %5 = call <16 x float> @llvm.log2.v16f32(<16 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %6 = call <vscale x 1 x float> @llvm.log2.nxv1f32(<vscale x 1 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %7 = call <vscale x 2 x float> @llvm.log2.nxv2f32(<vscale x 2 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %8 = call <vscale x 4 x float> @llvm.log2.nxv4f32(<vscale x 4 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %9 = call <vscale x 8 x float> @llvm.log2.nxv8f32(<vscale x 8 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %10 = call <vscale x 16 x float> @llvm.log2.nxv16f32(<vscale x 16 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %11 = call double @llvm.log2.f64(double undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %12 = call <2 x double> @llvm.log2.v2f64(<2 x double> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %13 = call <4 x double> @llvm.log2.v4f64(<4 x double> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %14 = call <8 x double> @llvm.log2.v8f64(<8 x double> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 191 for instruction: %15 = call <16 x double> @llvm.log2.v16f64(<16 x double> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %16 = call <vscale x 1 x double> @llvm.log2.nxv1f64(<vscale x 1 x double> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %17 = call <vscale x 2 x double> @llvm.log2.nxv2f64(<vscale x 2 x double> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %18 = call <vscale x 4 x double> @llvm.log2.nxv4f64(<vscale x 4 x double> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %19 = call <vscale x 8 x double> @llvm.log2.nxv8f64(<vscale x 8 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %1 = call bfloat @llvm.log2.bf16(bfloat undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %2 = call <2 x bfloat> @llvm.log2.v2bf16(<2 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %3 = call <4 x bfloat> @llvm.log2.v4bf16(<4 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %4 = call <8 x bfloat> @llvm.log2.v8bf16(<8 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 191 for instruction: %5 = call <16 x bfloat> @llvm.log2.v16bf16(<16 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %6 = call <vscale x 1 x bfloat> @llvm.log2.nxv1bf16(<vscale x 1 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %7 = call <vscale x 2 x bfloat> @llvm.log2.nxv2bf16(<vscale x 2 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %8 = call <vscale x 4 x bfloat> @llvm.log2.nxv4bf16(<vscale x 4 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %9 = call <vscale x 8 x bfloat> @llvm.log2.nxv8bf16(<vscale x 8 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %10 = call <vscale x 16 x bfloat> @llvm.log2.nxv16bf16(<vscale x 16 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %11 = call float @llvm.log2.f32(float undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %12 = call <2 x float> @llvm.log2.v2f32(<2 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %13 = call <4 x float> @llvm.log2.v4f32(<4 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %14 = call <8 x float> @llvm.log2.v8f32(<8 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 191 for instruction: %15 = call <16 x float> @llvm.log2.v16f32(<16 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %16 = call <vscale x 1 x float> @llvm.log2.nxv1f32(<vscale x 1 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %17 = call <vscale x 2 x float> @llvm.log2.nxv2f32(<vscale x 2 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %18 = call <vscale x 4 x float> @llvm.log2.nxv4f32(<vscale x 4 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %19 = call <vscale x 8 x float> @llvm.log2.nxv8f32(<vscale x 8 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %20 = call <vscale x 16 x float> @llvm.log2.nxv16f32(<vscale x 16 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %21 = call double @llvm.log2.f64(double undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %22 = call <2 x double> @llvm.log2.v2f64(<2 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %23 = call <4 x double> @llvm.log2.v4f64(<4 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %24 = call <8 x double> @llvm.log2.v8f64(<8 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 191 for instruction: %25 = call <16 x double> @llvm.log2.v16f64(<16 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %26 = call <vscale x 1 x double> @llvm.log2.nxv1f64(<vscale x 1 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %27 = call <vscale x 2 x double> @llvm.log2.nxv2f64(<vscale x 2 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %28 = call <vscale x 4 x double> @llvm.log2.nxv4f64(<vscale x 4 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %29 = call <vscale x 8 x double> @llvm.log2.nxv8f64(<vscale x 8 x double> undef)
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
+  call bfloat @llvm.log2.bf16(bfloat undef)
+  call <2 x bfloat> @llvm.log2.v2bf16(<2 x bfloat> undef)
+  call <4 x bfloat> @llvm.log2.v4bf16(<4 x bfloat> undef)
+  call <8 x bfloat> @llvm.log2.v8bf16(<8 x bfloat> undef)
+  call <16 x bfloat> @llvm.log2.v16bf16(<16 x bfloat> undef)
+  call <vscale x 1 x bfloat> @llvm.log2.nvx1bf16(<vscale x 1 x bfloat> undef)
+  call <vscale x 2 x bfloat> @llvm.log2.nvx2bf16(<vscale x 2 x bfloat> undef)
+  call <vscale x 4 x bfloat> @llvm.log2.nvx4bf16(<vscale x 4 x bfloat> undef)
+  call <vscale x 8 x bfloat> @llvm.log2.nvx8bf16(<vscale x 8 x bfloat> undef)
+  call <vscale x 16 x bfloat> @llvm.log2.nvx16bf16(<vscale x 16 x bfloat> undef)
   call float @llvm.log2.f32(float undef)
   call <2 x float> @llvm.log2.v2f32(<2 x float> undef)
   call <4 x float> @llvm.log2.v4f32(<4 x float> undef)
@@ -316,142 +679,40 @@ define void @log2() {
   ret void
 }
 
-declare float @llvm.sin.f32(float)
-declare <2 x float> @llvm.sin.v2f32(<2 x float>)
-declare <4 x float> @llvm.sin.v4f32(<4 x float>)
-declare <8 x float> @llvm.sin.v8f32(<8 x float>)
-declare <16 x float> @llvm.sin.v16f32(<16 x float>)
-declare <vscale x 1 x float> @llvm.sin.nvx1f32(<vscale x 1 x float>)
-declare <vscale x 2 x float> @llvm.sin.nvx2f32(<vscale x 2 x float>)
-declare <vscale x 4 x float> @llvm.sin.nvx4f32(<vscale x 4 x float>)
-declare <vscale x 8 x float> @llvm.sin.nvx8f32(<vscale x 8 x float>)
-declare <vscale x 16 x float> @llvm.sin.nvx16f32(<vscale x 16 x float>)
-declare double @llvm.sin.f64(double)
-declare <2 x double> @llvm.sin.v2f64(<2 x double>)
-declare <4 x double> @llvm.sin.v4f64(<4 x double>)
-declare <8 x double> @llvm.sin.v8f64(<8 x double>)
-declare <16 x double> @llvm.sin.v16f64(<16 x double>)
-declare <vscale x 1 x double> @llvm.sin.nvx1f64(<vscale x 1 x double>)
-declare <vscale x 2 x double> @llvm.sin.nvx2f64(<vscale x 2 x double>)
-declare <vscale x 4 x double> @llvm.sin.nvx4f64(<vscale x 4 x double>)
-declare <vscale x 8 x double> @llvm.sin.nvx8f64(<vscale x 8 x double>)
-
-declare float @llvm.cos.f32(float)
-declare <2 x float> @llvm.cos.v2f32(<2 x float>)
-declare <4 x float> @llvm.cos.v4f32(<4 x float>)
-declare <8 x float> @llvm.cos.v8f32(<8 x float>)
-declare <16 x float> @llvm.cos.v16f32(<16 x float>)
-declare <vscale x 1 x float> @llvm.cos.nvx1f32(<vscale x 1 x float>)
-declare <vscale x 2 x float> @llvm.cos.nvx2f32(<vscale x 2 x float>)
-declare <vscale x 4 x float> @llvm.cos.nvx4f32(<vscale x 4 x float>)
-declare <vscale x 8 x float> @llvm.cos.nvx8f32(<vscale x 8 x float>)
-declare <vscale x 16 x float> @llvm.cos.nvx16f32(<vscale x 16 x float>)
-declare double @llvm.cos.f64(double)
-declare <2 x double> @llvm.cos.v2f64(<2 x double>)
-declare <4 x double> @llvm.cos.v4f64(<4 x double>)
-declare <8 x double> @llvm.cos.v8f64(<8 x double>)
-declare <16 x double> @llvm.cos.v16f64(<16 x double>)
-declare <vscale x 1 x double> @llvm.cos.nvx1f64(<vscale x 1 x double>)
-declare <vscale x 2 x double> @llvm.cos.nvx2f64(<vscale x 2 x double>)
-declare <vscale x 4 x double> @llvm.cos.nvx4f64(<vscale x 4 x double>)
-declare <vscale x 8 x double> @llvm.cos.nvx8f64(<vscale x 8 x double>)
-
-declare float @llvm.exp.f32(float)
-declare <2 x float> @llvm.exp.v2f32(<2 x float>)
-declare <4 x float> @llvm.exp.v4f32(<4 x float>)
-declare <8 x float> @llvm.exp.v8f32(<8 x float>)
-declare <16 x float> @llvm.exp.v16f32(<16 x float>)
-declare <vscale x 1 x float> @llvm.exp.nvx1f32(<vscale x 1 x float>)
-declare <vscale x 2 x float> @llvm.exp.nvx2f32(<vscale x 2 x float>)
-declare <vscale x 4 x float> @llvm.exp.nvx4f32(<vscale x 4 x float>)
-declare <vscale x 8 x float> @llvm.exp.nvx8f32(<vscale x 8 x float>)
-declare <vscale x 16 x float> @llvm.exp.nvx16f32(<vscale x 16 x float>)
-declare double @llvm.exp.f64(double)
-declare <2 x double> @llvm.exp.v2f64(<2 x double>)
-declare <4 x double> @llvm.exp.v4f64(<4 x double>)
-declare <8 x double> @llvm.exp.v8f64(<8 x double>)
-declare <16 x double> @llvm.exp.v16f64(<16 x double>)
-declare <vscale x 1 x double> @llvm.exp.nvx1f64(<vscale x 1 x double>)
-declare <vscale x 2 x double> @llvm.exp.nvx2f64(<vscale x 2 x double>)
-declare <vscale x 4 x double> @llvm.exp.nvx4f64(<vscale x 4 x double>)
-declare <vscale x 8 x double> @llvm.exp.nvx8f64(<vscale x 8 x double>)
-
-declare float @llvm.exp2.f32(float)
-declare <2 x float> @llvm.exp2.v2f32(<2 x float>)
-declare <4 x float> @llvm.exp2.v4f32(<4 x float>)
-declare <8 x float> @llvm.exp2.v8f32(<8 x float>)
-declare <16 x float> @llvm.exp2.v16f32(<16 x float>)
-declare <vscale x 1 x float> @llvm.exp2.nvx1f32(<vscale x 1 x float>)
-declare <vscale x 2 x float> @llvm.exp2.nvx2f32(<vscale x 2 x float>)
-declare <vscale x 4 x float> @llvm.exp2.nvx4f32(<vscale x 4 x float>)
-declare <vscale x 8 x float> @llvm.exp2.nvx8f32(<vscale x 8 x float>)
-declare <vscale x 16 x float> @llvm.exp2.nvx16f32(<vscale x 16 x float>)
-declare double @llvm.exp2.f64(double)
-declare <2 x double> @llvm.exp2.v2f64(<2 x double>)
-declare <4 x double> @llvm.exp2.v4f64(<4 x double>)
-declare <8 x double> @llvm.exp2.v8f64(<8 x double>)
-declare <16 x double> @llvm.exp2.v16f64(<16 x double>)
-declare <vscale x 1 x double> @llvm.exp2.nvx1f64(<vscale x 1 x double>)
-declare <vscale x 2 x double> @llvm.exp2.nvx2f64(<vscale x 2 x double>)
-declare <vscale x 4 x double> @llvm.exp2.nvx4f64(<vscale x 4 x double>)
-declare <vscale x 8 x double> @llvm.exp2.nvx8f64(<vscale x 8 x double>)
-
-declare float @llvm.log.f32(float)
-declare <2 x float> @llvm.log.v2f32(<2 x float>)
-declare <4 x float> @llvm.log.v4f32(<4 x float>)
-declare <8 x float> @llvm.log.v8f32(<8 x float>)
-declare <16 x float> @llvm.log.v16f32(<16 x float>)
-declare <vscale x 1 x float> @llvm.log.nvx1f32(<vscale x 1 x float>)
-declare <vscale x 2 x float> @llvm.log.nvx2f32(<vscale x 2 x float>)
-declare <vscale x 4 x float> @llvm.log.nvx4f32(<vscale x 4 x float>)
-declare <vscale x 8 x float> @llvm.log.nvx8f32(<vscale x 8 x float>)
-declare <vscale x 16 x float> @llvm.log.nvx16f32(<vscale x 16 x float>)
-declare double @llvm.log.f64(double)
-declare <2 x double> @llvm.log.v2f64(<2 x double>)
-declare <4 x double> @llvm.log.v4f64(<4 x double>)
-declare <8 x double> @llvm.log.v8f64(<8 x double>)
-declare <16 x double> @llvm.log.v16f64(<16 x double>)
-declare <vscale x 1 x double> @llvm.log.nvx1f64(<vscale x 1 x double>)
-declare <vscale x 2 x double> @llvm.log.nvx2f64(<vscale x 2 x double>)
-declare <vscale x 4 x double> @llvm.log.nvx4f64(<vscale x 4 x double>)
-declare <vscale x 8 x double> @llvm.log.nvx8f64(<vscale x 8 x double>)
-
-declare float @llvm.log10.f32(float)
-declare <2 x float> @llvm.log10.v2f32(<2 x float>)
-declare <4 x float> @llvm.log10.v4f32(<4 x float>)
-declare <8 x float> @llvm.log10.v8f32(<8 x float>)
-declare <16 x float> @llvm.log10.v16f32(<16 x float>)
-declare <vscale x 1 x float> @llvm.log10.nvx1f32(<vscale x 1 x float>)
-declare <vscale x 2 x float> @llvm.log10.nvx2f32(<vscale x 2 x float>)
-declare <vscale x 4 x float> @llvm.log10.nvx4f32(<vscale x 4 x float>)
-declare <vscale x 8 x float> @llvm.log10.nvx8f32(<vscale x 8 x float>)
-declare <vscale x 16 x float> @llvm.log10.nvx16f32(<vscale x 16 x float>)
-declare double @llvm.log10.f64(double)
-declare <2 x double> @llvm.log10.v2f64(<2 x double>)
-declare <4 x double> @llvm.log10.v4f64(<4 x double>)
-declare <8 x double> @llvm.log10.v8f64(<8 x double>)
-declare <16 x double> @llvm.log10.v16f64(<16 x double>)
-declare <vscale x 1 x double> @llvm.log10.nvx1f64(<vscale x 1 x double>)
-declare <vscale x 2 x double> @llvm.log10.nvx2f64(<vscale x 2 x double>)
-declare <vscale x 4 x double> @llvm.log10.nvx4f64(<vscale x 4 x double>)
-declare <vscale x 8 x double> @llvm.log10.nvx8f64(<vscale x 8 x double>)
+define void @log2_f16() {
+; ZVFH-LABEL: 'log2_f16'
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %1 = call half @llvm.log2.f16(half undef)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %2 = call <2 x half> @llvm.log2.v2f16(<2 x half> undef)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %3 = call <4 x half> @llvm.log2.v4f16(<4 x half> undef)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %4 = call <8 x half> @llvm.log2.v8f16(<8 x half> undef)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %5 = call <16 x half> @llvm.log2.v16f16(<16 x half> undef)
+; ZVFH-NEXT: Cost Model: Invalid cost for instruction: %6 = call <vscale x 1 x half> @llvm.log2.nxv1f16(<vscale x 1 x half> undef)
+; ZVFH-NEXT: Cost Model: Invalid cost for instruction: %7 = call <vscale x 2 x half> @llvm.log2.nxv2f16(<vscale x 2 x half> undef)
+; ZVFH-NEXT: Cost Model: Invalid cost for instruction: %8 = call <vscale x 4 x half> @llvm.log2.nxv4f16(<vscale x 4 x half> undef)
+; ZVFH-NEXT: Cost Model: Invalid cost for instruction: %9 = call <vscale x 8 x half> @llvm.log2.nxv8f16(<vscale x 8 x half> undef)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; ZVFHMIN-LABEL: 'log2_f16'
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %1 = call half @llvm.log2.f16(half undef)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %2 = call <2 x half> @llvm.log2.v2f16(<2 x half> undef)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %3 = call <4 x half> @llvm.log2.v4f16(<4 x half> undef)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %4 = call <8 x half> @llvm.log2.v8f16(<8 x half> undef)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 191 for instruction: %5 = call <16 x half> @llvm.log2.v16f16(<16 x half> undef)
+; ZVFHMIN-NEXT: Cost Model: Invalid cost for instruction: %6 = call <vscale x 1 x half> @llvm.log2.nxv1f16(<vscale x 1 x half> undef)
+; ZVFHMIN-NEXT: Cost Model: Invalid cost for instruction: %7 = call <vscale x 2 x half> @llvm.log2.nxv2f16(<vscale x 2 x half> undef)
+; ZVFHMIN-NEXT: Cost Model: Invalid cost for instruction: %8 = call <vscale x 4 x half> @llvm.log2.nxv4f16(<vscale x 4 x half> undef)
+; ZVFHMIN-NEXT: Cost Model: Invalid cost for instruction: %9 = call <vscale x 8 x half> @llvm.log2.nxv8f16(<vscale x 8 x half> undef)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  call half @llvm.log2.f16(half undef)
+  call <2 x half> @llvm.log2.v2f16(<2 x half> undef)
+  call <4 x half> @llvm.log2.v4f16(<4 x half> undef)
+  call <8 x half> @llvm.log2.v8f16(<8 x half> undef)
+  call <16 x half> @llvm.log2.v16f16(<16 x half> undef)
+  call <vscale x 1 x half> @llvm.log2.nvx1f16(<vscale x 1 x half> undef)
+  call <vscale x 2 x half> @llvm.log2.nvx2f16(<vscale x 2 x half> undef)
+  call <vscale x 4 x half> @llvm.log2.nvx4f16(<vscale x 4 x half> undef)
+  call <vscale x 8 x half> @llvm.log2.nvx8f16(<vscale x 8 x half> undef)
+  ret void
+}
 
-declare float @llvm.log2.f32(float)
-declare <2 x float> @llvm.log2.v2f32(<2 x float>)
-declare <4 x float> @llvm.log2.v4f32(<4 x float>)
-declare <8 x float> @llvm.log2.v8f32(<8 x float>)
-declare <16 x float> @llvm.log2.v16f32(<16 x float>)
-declare <vscale x 1 x float> @llvm.log2.nvx1f32(<vscale x 1 x float>)
-declare <vscale x 2 x float> @llvm.log2.nvx2f32(<vscale x 2 x float>)
-declare <vscale x 4 x float> @llvm.log2.nvx4f32(<vscale x 4 x float>)
-declare <vscale x 8 x float> @llvm.log2.nvx8f32(<vscale x 8 x float>)
-declare <vscale x 16 x float> @llvm.log2.nvx16f32(<vscale x 16 x float>)
-declare double @llvm.log2.f64(double)
-declare <2 x double> @llvm.log2.v2f64(<2 x double>)
-declare <4 x double> @llvm.log2.v4f64(<4 x double>)
-declare <8 x double> @llvm.log2.v8f64(<8 x double>)
-declare <16 x double> @llvm.log2.v16f64(<16 x double>)
-declare <vscale x 1 x double> @llvm.log2.nvx1f64(<vscale x 1 x double>)
-declare <vscale x 2 x double> @llvm.log2.nvx2f64(<vscale x 2 x double>)
-declare <vscale x 4 x double> @llvm.log2.nvx4f64(<vscale x 4 x double>)
-declare <vscale x 8 x double> @llvm.log2.nvx8f64(<vscale x 8 x double>)
diff --git a/llvm/test/Analysis/CostModel/RISCV/reduce-fmul.ll b/llvm/test/Analysis/CostModel/RISCV/reduce-fmul.ll
index 4a6e719fef3d6f9ac23f4c23018d91927d6b7ead..a50b3bfed62072505f8edb7c8ca3fd093b16fc5f 100644
--- a/llvm/test/Analysis/CostModel/RISCV/reduce-fmul.ll
+++ b/llvm/test/Analysis/CostModel/RISCV/reduce-fmul.ll
@@ -6,11 +6,11 @@ define void @reduce_fmul_bfloat() {
 ; FP-REDUCE-LABEL: 'reduce_fmul_bfloat'
 ; FP-REDUCE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call fast bfloat @llvm.vector.reduce.fmul.v1bf16(bfloat 0xR0000, <1 x bfloat> undef)
-; FP-REDUCE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2 = call fast bfloat @llvm.vector.reduce.fmul.v2bf16(bfloat 0xR0000, <2 x bfloat> undef)
-; FP-REDUCE-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4 = call fast bfloat @llvm.vector.reduce.fmul.v4bf16(bfloat 0xR0000, <4 x bfloat> undef)
-; FP-REDUCE-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8 = call fast bfloat @llvm.vector.reduce.fmul.v8bf16(bfloat 0xR0000, <8 x bfloat> undef)
-; FP-REDUCE-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V16 = call fast bfloat @llvm.vector.reduce.fmul.v16bf16(bfloat 0xR0000, <16 x bfloat> undef)
-; FP-REDUCE-NEXT: Cost Model: Found an estimated cost of 151 for instruction: %v32 = call fast bfloat @llvm.vector.reduce.fmul.v32bf16(bfloat 0xR0000, <32 x bfloat> undef)
+; FP-REDUCE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2 = call fast bfloat @llvm.vector.reduce.fmul.v2bf16(bfloat 0xR0000, <2 x bfloat> undef)
+; FP-REDUCE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4 = call fast bfloat @llvm.vector.reduce.fmul.v4bf16(bfloat 0xR0000, <4 x bfloat> undef)
+; FP-REDUCE-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V8 = call fast bfloat @llvm.vector.reduce.fmul.v8bf16(bfloat 0xR0000, <8 x bfloat> undef)
+; FP-REDUCE-NEXT: Cost Model: Found an estimated cost of 97 for instruction: %V16 = call fast bfloat @llvm.vector.reduce.fmul.v16bf16(bfloat 0xR0000, <16 x bfloat> undef)
+; FP-REDUCE-NEXT: Cost Model: Found an estimated cost of 351 for instruction: %v32 = call fast bfloat @llvm.vector.reduce.fmul.v32bf16(bfloat 0xR0000, <32 x bfloat> undef)
 ; FP-REDUCE-NEXT: Cost Model: Found an estimated cost of 541 for instruction: %V64 = call fast bfloat @llvm.vector.reduce.fmul.v64bf16(bfloat 0xR0000, <64 x bfloat> undef)
 ; FP-REDUCE-NEXT: Cost Model: Found an estimated cost of 573 for instruction: %V128 = call fast bfloat @llvm.vector.reduce.fmul.v128bf16(bfloat 0xR0000, <128 x bfloat> undef)
 ; FP-REDUCE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
diff --git a/llvm/test/Analysis/CostModel/RISCV/rvv-select.ll b/llvm/test/Analysis/CostModel/RISCV/rvv-select.ll
index fdccb0f64527048aa47d042d611891474c39c3eb..512cbf6334c1192477a20978faa03db7bcf48787 100644
--- a/llvm/test/Analysis/CostModel/RISCV/rvv-select.ll
+++ b/llvm/test/Analysis/CostModel/RISCV/rvv-select.ll
@@ -414,4 +414,184 @@ define void @select_of_constants() {
   ret void
 }
 
+define void @vp_merge() {
+; CHECK-LABEL: 'vp_merge'
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %1 = call <1 x i1> @llvm.vp.merge.v1i1(<1 x i1> undef, <1 x i1> undef, <1 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %2 = call <2 x i1> @llvm.vp.merge.v2i1(<2 x i1> undef, <2 x i1> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %3 = call <4 x i1> @llvm.vp.merge.v4i1(<4 x i1> undef, <4 x i1> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %4 = call <8 x i1> @llvm.vp.merge.v8i1(<8 x i1> undef, <8 x i1> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %5 = call <16 x i1> @llvm.vp.merge.v16i1(<16 x i1> undef, <16 x i1> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %6 = call <32 x i1> @llvm.vp.merge.v32i1(<32 x i1> undef, <32 x i1> undef, <32 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %7 = call <vscale x 1 x i1> @llvm.vp.merge.nxv1i1(<vscale x 1 x i1> undef, <vscale x 1 x i1> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %8 = call <vscale x 2 x i1> @llvm.vp.merge.nxv2i1(<vscale x 2 x i1> undef, <vscale x 2 x i1> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %9 = call <vscale x 4 x i1> @llvm.vp.merge.nxv4i1(<vscale x 4 x i1> undef, <vscale x 4 x i1> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %10 = call <vscale x 8 x i1> @llvm.vp.merge.nxv8i1(<vscale x 8 x i1> undef, <vscale x 8 x i1> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %11 = call <vscale x 16 x i1> @llvm.vp.merge.nxv16i1(<vscale x 16 x i1> undef, <vscale x 16 x i1> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %12 = call <vscale x 32 x i1> @llvm.vp.merge.nxv32i1(<vscale x 32 x i1> undef, <vscale x 32 x i1> undef, <vscale x 32 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %13 = call <1 x i8> @llvm.vp.merge.v1i8(<1 x i1> undef, <1 x i8> undef, <1 x i8> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %14 = call <2 x i8> @llvm.vp.merge.v2i8(<2 x i1> undef, <2 x i8> undef, <2 x i8> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %15 = call <4 x i8> @llvm.vp.merge.v4i8(<4 x i1> undef, <4 x i8> undef, <4 x i8> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %16 = call <8 x i8> @llvm.vp.merge.v8i8(<8 x i1> undef, <8 x i8> undef, <8 x i8> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %17 = call <16 x i8> @llvm.vp.merge.v16i8(<16 x i1> undef, <16 x i8> undef, <16 x i8> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %18 = call <32 x i8> @llvm.vp.merge.v32i8(<32 x i1> undef, <32 x i8> undef, <32 x i8> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %19 = call <vscale x 1 x i8> @llvm.vp.merge.nxv1i8(<vscale x 1 x i1> undef, <vscale x 1 x i8> undef, <vscale x 1 x i8> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %20 = call <vscale x 2 x i8> @llvm.vp.merge.nxv2i8(<vscale x 2 x i1> undef, <vscale x 2 x i8> undef, <vscale x 2 x i8> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %21 = call <vscale x 4 x i8> @llvm.vp.merge.nxv4i8(<vscale x 4 x i1> undef, <vscale x 4 x i8> undef, <vscale x 4 x i8> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %22 = call <vscale x 8 x i8> @llvm.vp.merge.nxv8i8(<vscale x 8 x i1> undef, <vscale x 8 x i8> undef, <vscale x 8 x i8> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %23 = call <vscale x 16 x i8> @llvm.vp.merge.nxv16i8(<vscale x 16 x i1> undef, <vscale x 16 x i8> undef, <vscale x 16 x i8> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %24 = call <vscale x 32 x i8> @llvm.vp.merge.nxv32i8(<vscale x 32 x i1> undef, <vscale x 32 x i8> undef, <vscale x 32 x i8> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %25 = call <1 x i16> @llvm.vp.merge.v1i16(<1 x i1> undef, <1 x i16> undef, <1 x i16> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %26 = call <2 x i16> @llvm.vp.merge.v2i16(<2 x i1> undef, <2 x i16> undef, <2 x i16> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %27 = call <4 x i16> @llvm.vp.merge.v4i16(<4 x i1> undef, <4 x i16> undef, <4 x i16> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %28 = call <8 x i16> @llvm.vp.merge.v8i16(<8 x i1> undef, <8 x i16> undef, <8 x i16> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %29 = call <16 x i16> @llvm.vp.merge.v16i16(<16 x i1> undef, <16 x i16> undef, <16 x i16> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %30 = call <32 x i16> @llvm.vp.merge.v32i16(<32 x i1> undef, <32 x i16> undef, <32 x i16> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %31 = call <vscale x 1 x i16> @llvm.vp.merge.nxv1i16(<vscale x 1 x i1> undef, <vscale x 1 x i16> undef, <vscale x 1 x i16> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %32 = call <vscale x 2 x i16> @llvm.vp.merge.nxv2i16(<vscale x 2 x i1> undef, <vscale x 2 x i16> undef, <vscale x 2 x i16> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %33 = call <vscale x 4 x i16> @llvm.vp.merge.nxv4i16(<vscale x 4 x i1> undef, <vscale x 4 x i16> undef, <vscale x 4 x i16> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %34 = call <vscale x 8 x i16> @llvm.vp.merge.nxv8i16(<vscale x 8 x i1> undef, <vscale x 8 x i16> undef, <vscale x 8 x i16> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %35 = call <vscale x 16 x i16> @llvm.vp.merge.nxv16i16(<vscale x 16 x i1> undef, <vscale x 16 x i16> undef, <vscale x 16 x i16> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %36 = call <vscale x 32 x i16> @llvm.vp.merge.nxv32i16(<vscale x 32 x i1> undef, <vscale x 32 x i16> undef, <vscale x 32 x i16> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %37 = call <1 x i32> @llvm.vp.merge.v1i32(<1 x i1> undef, <1 x i32> undef, <1 x i32> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %38 = call <2 x i32> @llvm.vp.merge.v2i32(<2 x i1> undef, <2 x i32> undef, <2 x i32> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %39 = call <4 x i32> @llvm.vp.merge.v4i32(<4 x i1> undef, <4 x i32> undef, <4 x i32> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %40 = call <8 x i32> @llvm.vp.merge.v8i32(<8 x i1> undef, <8 x i32> undef, <8 x i32> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %41 = call <16 x i32> @llvm.vp.merge.v16i32(<16 x i1> undef, <16 x i32> undef, <16 x i32> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %42 = call <32 x i32> @llvm.vp.merge.v32i32(<32 x i1> undef, <32 x i32> undef, <32 x i32> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %43 = call <vscale x 1 x i32> @llvm.vp.merge.nxv1i32(<vscale x 1 x i1> undef, <vscale x 1 x i32> undef, <vscale x 1 x i32> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %44 = call <vscale x 2 x i32> @llvm.vp.merge.nxv2i32(<vscale x 2 x i1> undef, <vscale x 2 x i32> undef, <vscale x 2 x i32> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %45 = call <vscale x 4 x i32> @llvm.vp.merge.nxv4i32(<vscale x 4 x i1> undef, <vscale x 4 x i32> undef, <vscale x 4 x i32> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %46 = call <vscale x 8 x i32> @llvm.vp.merge.nxv8i32(<vscale x 8 x i1> undef, <vscale x 8 x i32> undef, <vscale x 8 x i32> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %47 = call <vscale x 16 x i32> @llvm.vp.merge.nxv16i32(<vscale x 16 x i1> undef, <vscale x 16 x i32> undef, <vscale x 16 x i32> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %48 = call <vscale x 32 x i32> @llvm.vp.merge.nxv32i32(<vscale x 32 x i1> undef, <vscale x 32 x i32> undef, <vscale x 32 x i32> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %49 = call <1 x i64> @llvm.vp.merge.v1i64(<1 x i1> undef, <1 x i64> undef, <1 x i64> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %50 = call <2 x i64> @llvm.vp.merge.v2i64(<2 x i1> undef, <2 x i64> undef, <2 x i64> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %51 = call <4 x i64> @llvm.vp.merge.v4i64(<4 x i1> undef, <4 x i64> undef, <4 x i64> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %52 = call <8 x i64> @llvm.vp.merge.v8i64(<8 x i1> undef, <8 x i64> undef, <8 x i64> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %53 = call <16 x i64> @llvm.vp.merge.v16i64(<16 x i1> undef, <16 x i64> undef, <16 x i64> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %54 = call <32 x i64> @llvm.vp.merge.v32i64(<32 x i1> undef, <32 x i64> undef, <32 x i64> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %55 = call <vscale x 1 x i64> @llvm.vp.merge.nxv1i64(<vscale x 1 x i1> undef, <vscale x 1 x i64> undef, <vscale x 1 x i64> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %56 = call <vscale x 2 x i64> @llvm.vp.merge.nxv2i64(<vscale x 2 x i1> undef, <vscale x 2 x i64> undef, <vscale x 2 x i64> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %57 = call <vscale x 4 x i64> @llvm.vp.merge.nxv4i64(<vscale x 4 x i1> undef, <vscale x 4 x i64> undef, <vscale x 4 x i64> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %58 = call <vscale x 8 x i64> @llvm.vp.merge.nxv8i64(<vscale x 8 x i1> undef, <vscale x 8 x i64> undef, <vscale x 8 x i64> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %59 = call <vscale x 16 x i64> @llvm.vp.merge.nxv16i64(<vscale x 16 x i1> undef, <vscale x 16 x i64> undef, <vscale x 16 x i64> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %60 = call <vscale x 32 x i64> @llvm.vp.merge.nxv32i64(<vscale x 32 x i1> undef, <vscale x 32 x i64> undef, <vscale x 32 x i64> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %61 = call <1 x float> @llvm.vp.merge.v1f32(<1 x i1> undef, <1 x float> undef, <1 x float> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %62 = call <2 x float> @llvm.vp.merge.v2f32(<2 x i1> undef, <2 x float> undef, <2 x float> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %63 = call <4 x float> @llvm.vp.merge.v4f32(<4 x i1> undef, <4 x float> undef, <4 x float> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %64 = call <8 x float> @llvm.vp.merge.v8f32(<8 x i1> undef, <8 x float> undef, <8 x float> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %65 = call <16 x float> @llvm.vp.merge.v16f32(<16 x i1> undef, <16 x float> undef, <16 x float> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %66 = call <32 x float> @llvm.vp.merge.v32f32(<32 x i1> undef, <32 x float> undef, <32 x float> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %67 = call <vscale x 1 x float> @llvm.vp.merge.nxv1f32(<vscale x 1 x i1> undef, <vscale x 1 x float> undef, <vscale x 1 x float> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %68 = call <vscale x 2 x float> @llvm.vp.merge.nxv2f32(<vscale x 2 x i1> undef, <vscale x 2 x float> undef, <vscale x 2 x float> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %69 = call <vscale x 4 x float> @llvm.vp.merge.nxv4f32(<vscale x 4 x i1> undef, <vscale x 4 x float> undef, <vscale x 4 x float> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %70 = call <vscale x 8 x float> @llvm.vp.merge.nxv8f32(<vscale x 8 x i1> undef, <vscale x 8 x float> undef, <vscale x 8 x float> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %71 = call <vscale x 16 x float> @llvm.vp.merge.nxv16f32(<vscale x 16 x i1> undef, <vscale x 16 x float> undef, <vscale x 16 x float> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %72 = call <vscale x 32 x float> @llvm.vp.merge.nxv32f32(<vscale x 32 x i1> undef, <vscale x 32 x float> undef, <vscale x 32 x float> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %73 = call <1 x double> @llvm.vp.merge.v1f64(<1 x i1> undef, <1 x double> undef, <1 x double> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %74 = call <2 x double> @llvm.vp.merge.v2f64(<2 x i1> undef, <2 x double> undef, <2 x double> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %75 = call <4 x double> @llvm.vp.merge.v4f64(<4 x i1> undef, <4 x double> undef, <4 x double> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %76 = call <8 x double> @llvm.vp.merge.v8f64(<8 x i1> undef, <8 x double> undef, <8 x double> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %77 = call <16 x double> @llvm.vp.merge.v16f64(<16 x i1> undef, <16 x double> undef, <16 x double> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %78 = call <32 x double> @llvm.vp.merge.v32f64(<32 x i1> undef, <32 x double> undef, <32 x double> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %79 = call <vscale x 1 x double> @llvm.vp.merge.nxv1f64(<vscale x 1 x i1> undef, <vscale x 1 x double> undef, <vscale x 1 x double> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %80 = call <vscale x 2 x double> @llvm.vp.merge.nxv2f64(<vscale x 2 x i1> undef, <vscale x 2 x double> undef, <vscale x 2 x double> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %81 = call <vscale x 4 x double> @llvm.vp.merge.nxv4f64(<vscale x 4 x i1> undef, <vscale x 4 x double> undef, <vscale x 4 x double> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %82 = call <vscale x 8 x double> @llvm.vp.merge.nxv8f64(<vscale x 8 x i1> undef, <vscale x 8 x double> undef, <vscale x 8 x double> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %83 = call <vscale x 16 x double> @llvm.vp.merge.nxv16f64(<vscale x 16 x i1> undef, <vscale x 16 x double> undef, <vscale x 16 x double> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %84 = call <vscale x 32 x double> @llvm.vp.merge.nxv32f64(<vscale x 32 x i1> undef, <vscale x 32 x double> undef, <vscale x 32 x double> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  call <1 x i1> @llvm.vp.merge.v1i1(<1 x i1> undef, <1 x i1> undef, <1 x i1> undef, i32 undef)
+  call <2 x i1> @llvm.vp.merge.v2i1(<2 x i1> undef, <2 x i1> undef, <2 x i1> undef, i32 undef)
+  call <4 x i1> @llvm.vp.merge.v4i1(<4 x i1> undef, <4 x i1> undef, <4 x i1> undef, i32 undef)
+  call <8 x i1> @llvm.vp.merge.v8i1(<8 x i1> undef, <8 x i1> undef, <8 x i1> undef, i32 undef)
+  call <16 x i1> @llvm.vp.merge.v16i1(<16 x i1> undef, <16 x i1> undef, <16 x i1> undef, i32 undef)
+  call <32 x i1> @llvm.vp.merge.v32i1(<32 x i1> undef, <32 x i1> undef, <32 x i1> undef, i32 undef)
+  call <vscale x 1 x i1> @llvm.vp.merge.nxv1i1(<vscale x 1 x i1> undef, <vscale x 1 x i1> undef, <vscale x 1 x i1> undef, i32 undef)
+  call <vscale x 2 x i1> @llvm.vp.merge.nxv2i1(<vscale x 2 x i1> undef, <vscale x 2 x i1> undef, <vscale x 2 x i1> undef, i32 undef)
+  call <vscale x 4 x i1> @llvm.vp.merge.nxv4i1(<vscale x 4 x i1> undef, <vscale x 4 x i1> undef, <vscale x 4 x i1> undef, i32 undef)
+  call <vscale x 8 x i1> @llvm.vp.merge.nxv8i1(<vscale x 8 x i1> undef, <vscale x 8 x i1> undef, <vscale x 8 x i1> undef, i32 undef)
+  call <vscale x 16 x i1> @llvm.vp.merge.nxv16i1(<vscale x 16 x i1> undef, <vscale x 16 x i1> undef, <vscale x 16 x i1> undef, i32 undef)
+  call <vscale x 32 x i1> @llvm.vp.merge.nxv32i1(<vscale x 32 x i1> undef, <vscale x 32 x i1> undef, <vscale x 32 x i1> undef, i32 undef)
+
+  call <1 x i8> @llvm.vp.merge.v1i8(<1 x i1> undef, <1 x i8> undef, <1 x i8> undef, i32 undef)
+  call <2 x i8> @llvm.vp.merge.v2i8(<2 x i1> undef, <2 x i8> undef, <2 x i8> undef, i32 undef)
+  call <4 x i8> @llvm.vp.merge.v4i8(<4 x i1> undef, <4 x i8> undef, <4 x i8> undef, i32 undef)
+  call <8 x i8> @llvm.vp.merge.v8i8(<8 x i1> undef, <8 x i8> undef, <8 x i8> undef, i32 undef)
+  call <16 x i8> @llvm.vp.merge.v16i8(<16 x i1> undef, <16 x i8> undef, <16 x i8> undef, i32 undef)
+  call <32 x i8> @llvm.vp.merge.v32i8(<32 x i1> undef, <32 x i8>
undef, <32 x i8> undef, i32 undef) + call @llvm.vp.merge.nxv1i8( undef, undef, undef, i32 undef) + call @llvm.vp.merge.nxv2i8( undef, undef, undef, i32 undef) + call @llvm.vp.merge.nxv4i8( undef, undef, undef, i32 undef) + call @llvm.vp.merge.nxv8i8( undef, undef, undef, i32 undef) + call @llvm.vp.merge.nxv16i8( undef, undef, undef, i32 undef) + call @llvm.vp.merge.nxv32i8( undef, undef, undef, i32 undef) + + call <1 x i16> @llvm.vp.merge.v1i16(<1 x i1> undef, <1 x i16> undef, <1 x i16> undef, i32 undef) + call <2 x i16> @llvm.vp.merge.v2i16(<2 x i1> undef, <2 x i16> undef, <2 x i16> undef, i32 undef) + call <4 x i16> @llvm.vp.merge.v4i16(<4 x i1> undef, <4 x i16> undef, <4 x i16> undef, i32 undef) + call <8 x i16> @llvm.vp.merge.v8i16(<8 x i1> undef, <8 x i16> undef, <8 x i16> undef, i32 undef) + call <16 x i16> @llvm.vp.merge.v16i16(<16 x i1> undef, <16 x i16> undef, <16 x i16> undef, i32 undef) + call <32 x i16> @llvm.vp.merge.v32i16(<32 x i1> undef, <32 x i16> undef, <32 x i16> undef, i32 undef) + call @llvm.vp.merge.nxv1i16( undef, undef, undef, i32 undef) + call @llvm.vp.merge.nxv2i16( undef, undef, undef, i32 undef) + call @llvm.vp.merge.nxv4i16( undef, undef, undef, i32 undef) + call @llvm.vp.merge.nxv8i16( undef, undef, undef, i32 undef) + call @llvm.vp.merge.nxv16i16( undef, undef, undef, i32 undef) + call @llvm.vp.merge.nxv32i16( undef, undef, undef, i32 undef) + + call <1 x i32> @llvm.vp.merge.v1i32(<1 x i1> undef, <1 x i32> undef, <1 x i32> undef, i32 undef) + call <2 x i32> @llvm.vp.merge.v2i32(<2 x i1> undef, <2 x i32> undef, <2 x i32> undef, i32 undef) + call <4 x i32> @llvm.vp.merge.v4i32(<4 x i1> undef, <4 x i32> undef, <4 x i32> undef, i32 undef) + call <8 x i32> @llvm.vp.merge.v8i32(<8 x i1> undef, <8 x i32> undef, <8 x i32> undef, i32 undef) + call <16 x i32> @llvm.vp.merge.v16i32(<16 x i1> undef, <16 x i32> undef, <16 x i32> undef, i32 undef) + call <32 x i32> @llvm.vp.merge.v32i32(<32 x i1> undef, <32 x i32> undef, <32 x i32> undef, i32 undef) + call @llvm.vp.merge.nxv1i32( undef, undef, undef, i32 undef) + call @llvm.vp.merge.nxv2i32( undef, undef, undef, i32 undef) + call @llvm.vp.merge.nxv4i32( undef, undef, undef, i32 undef) + call @llvm.vp.merge.nxv8i32( undef, undef, undef, i32 undef) + call @llvm.vp.merge.nxv16i32( undef, undef, undef, i32 undef) + call @llvm.vp.merge.nxv32i32( undef, undef, undef, i32 undef) + call <1 x i64> @llvm.vp.merge.v1i64(<1 x i1> undef, <1 x i64> undef, <1 x i64> undef, i32 undef) + call <2 x i64> @llvm.vp.merge.v2i64(<2 x i1> undef, <2 x i64> undef, <2 x i64> undef, i32 undef) + call <4 x i64> @llvm.vp.merge.v4i64(<4 x i1> undef, <4 x i64> undef, <4 x i64> undef, i32 undef) + call <8 x i64> @llvm.vp.merge.v8i64(<8 x i1> undef, <8 x i64> undef, <8 x i64> undef, i32 undef) + call <16 x i64> @llvm.vp.merge.v16i64(<16 x i1> undef, <16 x i64> undef, <16 x i64> undef, i32 undef) + call <32 x i64> @llvm.vp.merge.v32i64(<32 x i1> undef, <32 x i64> undef, <32 x i64> undef, i32 undef) + call @llvm.vp.merge.nxv1i64( undef, undef, undef, i32 undef) + call @llvm.vp.merge.nxv2i64( undef, undef, undef, i32 undef) + call @llvm.vp.merge.nxv4i64( undef, undef, undef, i32 undef) + call @llvm.vp.merge.nxv8i64( undef, undef, undef, i32 undef) + call @llvm.vp.merge.nxv16i64( undef, undef, undef, i32 undef) + call @llvm.vp.merge.nxv32i64( undef, undef, undef, i32 undef) + + call <1 x float> @llvm.vp.merge.v1f32(<1 x i1> undef, <1 x float> undef, <1 x float> undef, i32 undef) + call <2 x float> @llvm.vp.merge.v2f32(<2 x i1> undef, <2 x float> undef, <2 x 
float> undef, i32 undef) + call <4 x float> @llvm.vp.merge.v4f32(<4 x i1> undef, <4 x float> undef, <4 x float> undef, i32 undef) + call <8 x float> @llvm.vp.merge.v8f32(<8 x i1> undef, <8 x float> undef, <8 x float> undef, i32 undef) + call <16 x float> @llvm.vp.merge.v16f32(<16 x i1> undef, <16 x float> undef, <16 x float> undef, i32 undef) + call <32 x float> @llvm.vp.merge.v32f32(<32 x i1> undef, <32 x float> undef, <32 x float> undef, i32 undef) + call @llvm.vp.merge.nxv1f32( undef, undef, undef, i32 undef) + call @llvm.vp.merge.nxv2f32( undef, undef, undef, i32 undef) + call @llvm.vp.merge.nxv4f32( undef, undef, undef, i32 undef) + call @llvm.vp.merge.nxv8f32( undef, undef, undef, i32 undef) + call @llvm.vp.merge.nxv16f32( undef, undef, undef, i32 undef) + call @llvm.vp.merge.nxv32f32( undef, undef, undef, i32 undef) + + call <1 x double> @llvm.vp.merge.v1f64(<1 x i1> undef, <1 x double> undef, <1 x double> undef, i32 undef) + call <2 x double> @llvm.vp.merge.v2f64(<2 x i1> undef, <2 x double> undef, <2 x double> undef, i32 undef) + call <4 x double> @llvm.vp.merge.v4f64(<4 x i1> undef, <4 x double> undef, <4 x double> undef, i32 undef) + call <8 x double> @llvm.vp.merge.v8f64(<8 x i1> undef, <8 x double> undef, <8 x double> undef, i32 undef) + call <16 x double> @llvm.vp.merge.v16f64(<16 x i1> undef, <16 x double> undef, <16 x double> undef, i32 undef) + call <32 x double> @llvm.vp.merge.v32f64(<32 x i1> undef, <32 x double> undef, <32 x double> undef, i32 undef) + call @llvm.vp.merge.nxv1f64( undef, undef, undef, i32 undef) + call @llvm.vp.merge.nxv2f64( undef, undef, undef, i32 undef) + call @llvm.vp.merge.nxv4f64( undef, undef, undef, i32 undef) + call @llvm.vp.merge.nxv8f64( undef, undef, undef, i32 undef) + call @llvm.vp.merge.nxv16f64( undef, undef, undef, i32 undef) + call @llvm.vp.merge.nxv32f64( undef, undef, undef, i32 undef) + + ret void +} diff --git a/llvm/test/Analysis/CostModel/RISCV/splice.ll b/llvm/test/Analysis/CostModel/RISCV/splice.ll index 8d7d1576a532dabffa22c7a3c3c79f37617aac9b..ddfaa8c13d425f8866156cf19636b9f80f13b5bf 100644 --- a/llvm/test/Analysis/CostModel/RISCV/splice.ll +++ b/llvm/test/Analysis/CostModel/RISCV/splice.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py -; RUN: opt < %s -passes="print" 2>&1 -disable-output -S -mtriple=riscv64 -mattr=+v,+f,+d,+zfh,+zvfh | FileCheck %s -; RUN: opt < %s -passes="print" 2>&1 -disable-output -S -mtriple=riscv64 -mattr=+v,+f,+d,+zfh,+zvfhmin | FileCheck %s +; RUN: opt < %s -passes="print" 2>&1 -disable-output -S -mtriple=riscv64 -mattr=+v,+f,+d,+zfh,+zvfh,+zvfbfmin | FileCheck %s +; RUN: opt < %s -passes="print" 2>&1 -disable-output -S -mtriple=riscv64 -mattr=+v,+f,+d,+zfh,+zvfhmin,+zvfbfmin | FileCheck %s ; RUN: opt < %s -passes="print" -cost-kind=code-size 2>&1 -disable-output -S -mtriple=riscv64 -mattr=+v,+f,+d,+zfh,+zvfh | FileCheck %s --check-prefix=SIZE ; RUN: opt < %s -passes="print" -cost-kind=code-size 2>&1 -disable-output -S -mtriple=riscv64 -mattr=+v,+f,+d,+zfh,+zvfhmin | FileCheck %s --check-prefix=SIZE @@ -34,6 +34,13 @@ define void @vector_splice() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %splice.nxv16i64 = call @llvm.vector.splice.nxv16i64( zeroinitializer, zeroinitializer, i32 -1) ; CHECK-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %splice.nxv32i64 = call @llvm.vector.splice.nxv32i64( zeroinitializer, zeroinitializer, i32 -1) ; CHECK-NEXT: Cost Model: Found an estimated cost of 128 for 
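[Editor's note] The vp.merge costs above follow a simple register-pressure pattern: once a type no longer fits in one vector register the cost doubles with each LMUL step, and scalable types that would need more than eight registers report Invalid (the i1 mask cases carry an extra expansion penalty, which is why the nxv*i1 entries sit at 3). Below is a minimal sketch of that arithmetic, assuming the RVV conventions vscale = VLEN/64 and a 128-bit minimum VLEN for fixed-width vectors; it is illustrative only, not the actual RISCVTTIImpl code.

    // Illustrative model of the vp.merge (vmerge.vvm) costs in the test above.
    #include <cstdio>
    #include <optional>

    std::optional<unsigned> vpMergeCost(unsigned NumElts, unsigned EltBits,
                                        bool Scalable) {
      unsigned Bits = NumElts * EltBits;
      // Registers occupied: scalable types are measured against the 64-bit
      // per-vscale chunk, fixed ones against the 128-bit minimum VLEN.
      unsigned Regs = Scalable ? (Bits + 63) / 64 : (Bits + 127) / 128;
      if (Regs == 0)
        Regs = 1;
      if (Scalable && Regs > 8)
        return std::nullopt; // no legal LMUL: reported as "Invalid cost"
      return Regs;
    }

    int main() {
      std::printf("nxv8i16: %u\n", *vpMergeCost(8, 16, true));   // 2, as above
      std::printf("v16i32:  %u\n", *vpMergeCost(16, 32, false)); // 4, as above
      if (!vpMergeCost(32, 32, true))
        std::puts("nxv32i32: Invalid"); // would exceed LMUL=8
    }

Note that v32i64 still gets a finite cost (16) while nxv16i64 is Invalid: fixed-width vectors can be split into legal halves, whereas a scalable type beyond LMUL=8 has no legal lowering at all.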
diff --git a/llvm/test/Analysis/CostModel/RISCV/splice.ll b/llvm/test/Analysis/CostModel/RISCV/splice.ll
index 8d7d1576a532dabffa22c7a3c3c79f37617aac9b..ddfaa8c13d425f8866156cf19636b9f80f13b5bf 100644
--- a/llvm/test/Analysis/CostModel/RISCV/splice.ll
+++ b/llvm/test/Analysis/CostModel/RISCV/splice.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
-; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -S -mtriple=riscv64 -mattr=+v,+f,+d,+zfh,+zvfh | FileCheck %s
-; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -S -mtriple=riscv64 -mattr=+v,+f,+d,+zfh,+zvfhmin | FileCheck %s
+; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -S -mtriple=riscv64 -mattr=+v,+f,+d,+zfh,+zvfh,+zvfbfmin | FileCheck %s
+; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -S -mtriple=riscv64 -mattr=+v,+f,+d,+zfh,+zvfhmin,+zvfbfmin | FileCheck %s
 ; RUN: opt < %s -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -S -mtriple=riscv64 -mattr=+v,+f,+d,+zfh,+zvfh | FileCheck %s --check-prefix=SIZE
 ; RUN: opt < %s -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -S -mtriple=riscv64 -mattr=+v,+f,+d,+zfh,+zvfhmin | FileCheck %s --check-prefix=SIZE
@@ -34,6 +34,13 @@ define void @vector_splice() {
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %splice.nxv16i64 = call <vscale x 16 x i64> @llvm.vector.splice.nxv16i64(<vscale x 16 x i64> zeroinitializer, <vscale x 16 x i64> zeroinitializer, i32 -1)
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %splice.nxv32i64 = call <vscale x 32 x i64> @llvm.vector.splice.nxv32i64(<vscale x 32 x i64> zeroinitializer, <vscale x 32 x i64> zeroinitializer, i32 -1)
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %splice.nxv64i64 = call <vscale x 64 x i64> @llvm.vector.splice.nxv64i64(<vscale x 64 x i64> zeroinitializer, <vscale x 64 x i64> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv1bf16 = call <vscale x 1 x bfloat> @llvm.vector.splice.nxv1bf16(<vscale x 1 x bfloat> zeroinitializer, <vscale x 1 x bfloat> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv2bf16 = call <vscale x 2 x bfloat> @llvm.vector.splice.nxv2bf16(<vscale x 2 x bfloat> zeroinitializer, <vscale x 2 x bfloat> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv4bf16 = call <vscale x 4 x bfloat> @llvm.vector.splice.nxv4bf16(<vscale x 4 x bfloat> zeroinitializer, <vscale x 4 x bfloat> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %splice.nxv8bf16 = call <vscale x 8 x bfloat> @llvm.vector.splice.nxv8bf16(<vscale x 8 x bfloat> zeroinitializer, <vscale x 8 x bfloat> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %splice.nxv16bf16 = call <vscale x 16 x bfloat> @llvm.vector.splice.nxv16bf16(<vscale x 16 x bfloat> zeroinitializer, <vscale x 16 x bfloat> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %splice.nxv32bf16 = call <vscale x 32 x bfloat> @llvm.vector.splice.nxv32bf16(<vscale x 32 x bfloat> zeroinitializer, <vscale x 32 x bfloat> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %splice.nxv64bf16 = call <vscale x 64 x bfloat> @llvm.vector.splice.nxv64bf16(<vscale x 64 x bfloat> zeroinitializer, <vscale x 64 x bfloat> zeroinitializer, i32 -1)
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv1f16 = call <vscale x 1 x half> @llvm.vector.splice.nxv1f16(<vscale x 1 x half> zeroinitializer, <vscale x 1 x half> zeroinitializer, i32 -1)
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv2f16 = call <vscale x 2 x half> @llvm.vector.splice.nxv2f16(<vscale x 2 x half> zeroinitializer, <vscale x 2 x half> zeroinitializer, i32 -1)
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv4f16 = call <vscale x 4 x half> @llvm.vector.splice.nxv4f16(<vscale x 4 x half> zeroinitializer, <vscale x 4 x half> zeroinitializer, i32 -1)
@@ -86,6 +93,13 @@ define void @vector_splice() {
 ; SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %splice.nxv16i64 = call <vscale x 16 x i64> @llvm.vector.splice.nxv16i64(<vscale x 16 x i64> zeroinitializer, <vscale x 16 x i64> zeroinitializer, i32 -1)
 ; SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %splice.nxv32i64 = call <vscale x 32 x i64> @llvm.vector.splice.nxv32i64(<vscale x 32 x i64> zeroinitializer, <vscale x 32 x i64> zeroinitializer, i32 -1)
 ; SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %splice.nxv64i64 = call <vscale x 64 x i64> @llvm.vector.splice.nxv64i64(<vscale x 64 x i64> zeroinitializer, <vscale x 64 x i64> zeroinitializer, i32 -1)
+; SIZE-NEXT: Cost Model: Invalid cost for instruction: %splice.nxv1bf16 = call <vscale x 1 x bfloat> @llvm.vector.splice.nxv1bf16(<vscale x 1 x bfloat> zeroinitializer, <vscale x 1 x bfloat> zeroinitializer, i32 -1)
+; SIZE-NEXT: Cost Model: Invalid cost for instruction: %splice.nxv2bf16 = call <vscale x 2 x bfloat> @llvm.vector.splice.nxv2bf16(<vscale x 2 x bfloat> zeroinitializer, <vscale x 2 x bfloat> zeroinitializer, i32 -1)
+; SIZE-NEXT: Cost Model: Invalid cost for instruction: %splice.nxv4bf16 = call <vscale x 4 x bfloat> @llvm.vector.splice.nxv4bf16(<vscale x 4 x bfloat> zeroinitializer, <vscale x 4 x bfloat> zeroinitializer, i32 -1)
+; SIZE-NEXT: Cost Model: Invalid cost for instruction: %splice.nxv8bf16 = call <vscale x 8 x bfloat> @llvm.vector.splice.nxv8bf16(<vscale x 8 x bfloat> zeroinitializer, <vscale x 8 x bfloat> zeroinitializer, i32 -1)
+; SIZE-NEXT: Cost Model: Invalid cost for instruction: %splice.nxv16bf16 = call <vscale x 16 x bfloat> @llvm.vector.splice.nxv16bf16(<vscale x 16 x bfloat> zeroinitializer, <vscale x 16 x bfloat> zeroinitializer, i32 -1)
+; SIZE-NEXT: Cost Model: Invalid cost for instruction: %splice.nxv32bf16 = call <vscale x 32 x bfloat> @llvm.vector.splice.nxv32bf16(<vscale x 32 x bfloat> zeroinitializer, <vscale x 32 x bfloat> zeroinitializer, i32 -1)
+; SIZE-NEXT: Cost Model: Invalid cost for instruction: %splice.nxv64bf16 = call <vscale x 64 x bfloat> @llvm.vector.splice.nxv64bf16(<vscale x 64 x bfloat> zeroinitializer, <vscale x 64 x bfloat> zeroinitializer, i32 -1)
 ; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv1f16 = call <vscale x 1 x half> @llvm.vector.splice.nxv1f16(<vscale x 1 x half> zeroinitializer, <vscale x 1 x half> zeroinitializer, i32 -1)
 ; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv2f16 = call <vscale x 2 x half> @llvm.vector.splice.nxv2f16(<vscale x 2 x half> zeroinitializer, <vscale x 2 x half> zeroinitializer, i32 -1)
 ; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv4f16 = call <vscale x 4 x half> @llvm.vector.splice.nxv4f16(<vscale x 4 x half> zeroinitializer, <vscale x 4 x half> zeroinitializer, i32 -1)
@@ -141,6 +155,14 @@ define void @vector_splice() {
   %splice.nxv32i64 = call <vscale x 32 x i64> @llvm.vector.splice.nxv32i64(<vscale x 32 x i64> zeroinitializer, <vscale x 32 x i64> zeroinitializer, i32 -1)
   %splice.nxv64i64 = call <vscale x 64 x i64> @llvm.vector.splice.nxv64i64(<vscale x 64 x i64> zeroinitializer, <vscale x 64 x i64> zeroinitializer, i32 -1)
+  %splice.nxv1bf16 = call <vscale x 1 x bfloat> @llvm.vector.splice.nxv1bf16(<vscale x 1 x bfloat> zeroinitializer, <vscale x 1 x bfloat> zeroinitializer, i32 -1)
+  %splice.nxv2bf16 = call <vscale x 2 x bfloat> @llvm.vector.splice.nxv2bf16(<vscale x 2 x bfloat> zeroinitializer, <vscale x 2 x bfloat> zeroinitializer, i32 -1)
+  %splice.nxv4bf16 = call <vscale x 4 x bfloat> @llvm.vector.splice.nxv4bf16(<vscale x 4 x bfloat> zeroinitializer, <vscale x 4 x bfloat> zeroinitializer, i32 -1)
+  %splice.nxv8bf16 = call <vscale x 8 x bfloat> @llvm.vector.splice.nxv8bf16(<vscale x 8 x bfloat> zeroinitializer, <vscale x 8 x bfloat> zeroinitializer, i32 -1)
+  %splice.nxv16bf16 = call <vscale x 16 x bfloat> @llvm.vector.splice.nxv16bf16(<vscale x 16 x bfloat> zeroinitializer, <vscale x 16 x bfloat> zeroinitializer, i32 -1)
+  %splice.nxv32bf16 = call <vscale x 32 x bfloat> @llvm.vector.splice.nxv32bf16(<vscale x 32 x bfloat> zeroinitializer, <vscale x 32 x bfloat> zeroinitializer, i32 -1)
+  %splice.nxv64bf16 = call <vscale x 64 x bfloat> @llvm.vector.splice.nxv64bf16(<vscale x 64 x bfloat> zeroinitializer, <vscale x 64 x bfloat> zeroinitializer, i32 -1)
+
   %splice.nxv1f16 = call <vscale x 1 x half> @llvm.vector.splice.nxv1f16(<vscale x 1 x half> zeroinitializer, <vscale x 1 x half> zeroinitializer, i32 -1)
   %splice.nxv2f16 = call <vscale x 2 x half> @llvm.vector.splice.nxv2f16(<vscale x 2 x half> zeroinitializer, <vscale x 2 x half> zeroinitializer, i32 -1)
   %splice.nxv4f16 = call <vscale x 4 x half> @llvm.vector.splice.nxv4f16(<vscale x 4 x half> zeroinitializer, <vscale x 4 x half> zeroinitializer, i32 -1)
diff --git a/llvm/test/Analysis/ValueTracking/non-negative-phi-bits.ll b/llvm/test/Analysis/ValueTracking/non-negative-phi-bits.ll
index 410320665e369907f8b06aaa97e42cf14bcca6fc..97f6bf927be2137466239f3e18c49ee3320b0373 100644
--- a/llvm/test/Analysis/ValueTracking/non-negative-phi-bits.ll
+++ b/llvm/test/Analysis/ValueTracking/non-negative-phi-bits.ll
@@ -8,7 +8,7 @@ define void @test() #0 {
 ; CHECK: for.body:
 ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
 ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ult i64 [[INDVARS_IV]], 39
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp samesign ult i64 [[INDVARS_IV]], 39
 ; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]]
 ; CHECK: for.end:
 ; CHECK-NEXT: ret void
diff --git a/llvm/test/Assembler/fp-intrinsics-attr.ll b/llvm/test/Assembler/fp-intrinsics-attr.ll
index da6507f051766cc13d9fe4bb4303d6b825945c6b..5b9a44710763e4a823a2a1a2b19cad65e955f1eb 100644
--- a/llvm/test/Assembler/fp-intrinsics-attr.ll
+++ b/llvm/test/Assembler/fp-intrinsics-attr.ll
@@ -105,6 +105,11 @@ define void @func(double %a, double %b, double %c, i32 %i) strictfp {
                                                metadata !"round.dynamic",
                                                metadata !"fpexcept.strict")
+  %atan2 = call double @llvm.experimental.constrained.atan2.f64(
+                                               double %a, double %b,
+                                               metadata !"round.dynamic",
+                                               metadata !"fpexcept.strict")
+
   %cosh = call double @llvm.experimental.constrained.cosh.f64(
                                                double %a,
                                                metadata !"round.dynamic",
                                                metadata !"fpexcept.strict")
@@ -291,6 +296,9 @@ declare double @llvm.experimental.constrained.acos.f64(double, metadata, metadat
 
 declare double @llvm.experimental.constrained.atan.f64(double, metadata, metadata)
 ; CHECK: @llvm.experimental.constrained.atan.f64({{.*}}) #[[ATTR1]]
 
+declare double @llvm.experimental.constrained.atan2.f64(double, double, metadata, metadata)
+; CHECK: @llvm.experimental.constrained.atan2.f64({{.*}}) #[[ATTR1]]
+
 declare double @llvm.experimental.constrained.sinh.f64(double, metadata, metadata)
 ; CHECK: @llvm.experimental.constrained.sinh.f64({{.*}}) #[[ATTR1]]
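[Editor's note] For context on the atan2 change above: constrained FP intrinsics carry the rounding mode and exception behavior as metadata so that strict-FP code cannot be folded or reordered across changes to the FP environment. The C++ below is a hedged illustration of the kind of source that would reach this lowering when built with a strict FP model (e.g. clang -ffp-model=strict); whether a given frontend actually emits the constrained form for atan2 depends on its strict-FP support, so treat this as a sketch, not a guarantee.

    // Sketch: under strict FP semantics, the atan2 call must stay ordered
    // relative to the FE_* queries around it, which is what the constrained
    // intrinsic form encodes at the IR level.
    #include <cfenv>
    #include <cmath>
    #include <cstdio>

    double polar_angle(double y, double x) {
      std::feclearexcept(FE_ALL_EXCEPT);
      double a = std::atan2(y, x); // candidate for the constrained intrinsic
      if (std::fetestexcept(FE_INVALID))
        std::puts("atan2 raised FE_INVALID");
      return a;
    }

    int main() { std::printf("%.6f\n", polar_angle(1.0, 1.0)); }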
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir
index a21b786a2bae9744bba80681809a2c75cb705e13..146d1177f469a4102e86644a67e0220831c30097 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir
@@ -152,12 +152,12 @@
 #
 # DEBUG-NEXT: G_INTRINSIC_TRUNC (opcode {{[0-9]+}}): 1 type index, 0 imm indices
 # DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}}
-# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
-# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
+# DEBUG-NEXT: .. the first uncovered type index: 1, OK
+# DEBUG-NEXT: .. the first uncovered imm index: 0, OK
 # DEBUG-NEXT: G_INTRINSIC_ROUND (opcode {{[0-9]+}}): 1 type index, 0 imm indices
 # DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}}
-# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
-# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
+# DEBUG-NEXT: .. the first uncovered type index: 1, OK
+# DEBUG-NEXT: .. the first uncovered imm index: 0, OK
 # DEBUG-NEXT: G_INTRINSIC_LRINT (opcode {{[0-9]+}}): 2 type indices, 0 imm indices
 # DEBUG-NEXT: .. the first uncovered type index: 2, OK
 # DEBUG-NEXT: .. the first uncovered imm index: 0, OK
@@ -167,8 +167,8 @@
 # DEBUG-NEXT: .. the first uncovered imm index: 0, OK
 # DEBUG-NEXT: G_INTRINSIC_ROUNDEVEN (opcode {{[0-9]+}}): 1 type index, 0 imm indices
 # DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}}
-# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
-# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
+# DEBUG-NEXT: .. the first uncovered type index: 1, OK
+# DEBUG-NEXT: .. the first uncovered imm index: 0, OK
 # DEBUG-NEXT: G_READCYCLECOUNTER (opcode {{[0-9]+}}): 1 type index, 0 imm indices
 # DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined
 # DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined
@@ -205,34 +205,34 @@
 # DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
 # DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
 # DEBUG-NEXT: G_ATOMIC_CMPXCHG (opcode {{[0-9]+}}): 2 type indices, 0 imm indices
-# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
-# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
+# DEBUG-NEXT: .. the first uncovered type index: 2, OK
+# DEBUG-NEXT: .. the first uncovered imm index: 0, OK
 # DEBUG-NEXT: G_ATOMICRMW_XCHG (opcode {{[0-9]+}}): 2 type indices, 0 imm indices
-# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
-# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
+# DEBUG-NEXT: .. the first uncovered type index: 2, OK
+# DEBUG-NEXT: .. the first uncovered imm index: 0, OK
 # DEBUG-NEXT: G_ATOMICRMW_ADD (opcode {{[0-9]+}}): 2 type indices, 0 imm indices
 # DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}}
-# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
-# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
+# DEBUG-NEXT: .. the first uncovered type index: 2, OK
+# DEBUG-NEXT: .. the first uncovered imm index: 0, OK
 # DEBUG-NEXT: G_ATOMICRMW_SUB (opcode {{[0-9]+}}): 2 type indices, 0 imm indices
 # DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}}
-# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
-# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
+# DEBUG-NEXT: .. the first uncovered type index: 2, OK
+# DEBUG-NEXT: .. the first uncovered imm index: 0, OK
 # DEBUG-NEXT: G_ATOMICRMW_AND (opcode {{[0-9]+}}): 2 type indices, 0 imm indices
 # DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}}
-# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
-# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
+# DEBUG-NEXT: .. the first uncovered type index: 2, OK
+# DEBUG-NEXT: .. the first uncovered imm index: 0, OK
 # DEBUG-NEXT: G_ATOMICRMW_NAND (opcode {{[0-9]+}}): 2 type indices, 0 imm indices
 # DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined
 # DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined
 # DEBUG-NEXT: G_ATOMICRMW_OR (opcode {{[0-9]+}}): 2 type indices, 0 imm indices
 # DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}}
-# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
-# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
+# DEBUG-NEXT: .. the first uncovered type index: 2, OK
+# DEBUG-NEXT: .. the first uncovered imm index: 0, OK
 # DEBUG-NEXT: G_ATOMICRMW_XOR (opcode {{[0-9]+}}): 2 type indices, 0 imm indices
 # DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}}
-# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
-# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
+# DEBUG-NEXT: .. the first uncovered type index: 2, OK
+# DEBUG-NEXT: .. the first uncovered imm index: 0, OK
 # DEBUG-NEXT: G_ATOMICRMW_MAX (opcode {{[0-9]+}}): 2 type indices, 0 imm indices
 # DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}}
 # DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
@@ -310,8 +310,8 @@
 # DEBUG-NEXT: .. the first uncovered type index: 1, OK
 # DEBUG-NEXT: .. the first uncovered imm index: 0, OK
 # DEBUG-NEXT: G_FCONSTANT (opcode {{[0-9]+}}): 1 type index, 0 imm indices
-# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
-# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
+# DEBUG-NEXT: .. the first uncovered type index: 1, OK
+# DEBUG-NEXT: .. the first uncovered imm index: 0, OK
 # DEBUG-NEXT: G_VASTART (opcode {{[0-9]+}}): 1 type index, 0 imm indices
 # DEBUG-NEXT: .. the first uncovered type index: 1, OK
 # DEBUG-NEXT: .. the first uncovered imm index: 0, OK
@@ -459,27 +459,27 @@
 # DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined
 # DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined
 # DEBUG-NEXT: G_FADD (opcode {{[0-9]+}}): 1 type index, 0 imm indices
-# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
-# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
+# DEBUG-NEXT: .. the first uncovered type index: 1, OK
+# DEBUG-NEXT: .. the first uncovered imm index: 0, OK
 # DEBUG-NEXT: G_FSUB (opcode {{[0-9]+}}): 1 type index, 0 imm indices
 # DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}}
-# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
-# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
+# DEBUG-NEXT: .. the first uncovered type index: 1, OK
+# DEBUG-NEXT: .. the first uncovered imm index: 0, OK
 # DEBUG-NEXT: G_FMUL (opcode {{[0-9]+}}): 1 type index, 0 imm indices
 # DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}}
-# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
-# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
+# DEBUG-NEXT: .. the first uncovered type index: 1, OK
+# DEBUG-NEXT: .. the first uncovered imm index: 0, OK
 # DEBUG-NEXT: G_FMA (opcode {{[0-9]+}}): 1 type index, 0 imm indices
 # DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}}
-# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
-# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
+# DEBUG-NEXT: .. the first uncovered type index: 1, OK
+# DEBUG-NEXT: .. the first uncovered imm index: 0, OK
 # DEBUG-NEXT: G_FMAD (opcode {{[0-9]+}}): 1 type index, 0 imm indices
 # DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
 # DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
 # DEBUG-NEXT: G_FDIV (opcode {{[0-9]+}}): 1 type index, 0 imm indices
 # DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}}
-# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
-# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
+# DEBUG-NEXT: .. the first uncovered type index: 1, OK
+# DEBUG-NEXT: .. the first uncovered imm index: 0, OK
 # DEBUG-NEXT: G_FREM (opcode {{[0-9]+}}): 1 type index, 0 imm indices
 # DEBUG-NEXT: .. the first uncovered type index: 1, OK
 # DEBUG-NEXT: .. the first uncovered imm index: 0, OK
@@ -565,12 +565,12 @@
 # DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined
 # DEBUG-NEXT: G_FMINNUM (opcode {{[0-9]+}}): 1 type index
 # DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}}
-# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
-# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
+# DEBUG-NEXT: .. the first uncovered type index: 1, OK
+# DEBUG-NEXT: .. the first uncovered imm index: 0, OK
 # DEBUG-NEXT: G_FMAXNUM (opcode {{[0-9]+}}): 1 type index
 # DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}}
-# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
-# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
+# DEBUG-NEXT: .. the first uncovered type index: 1, OK
+# DEBUG-NEXT: .. the first uncovered imm index: 0, OK
 # DEBUG-NEXT: G_FMINNUM_IEEE (opcode {{[0-9]+}}): 1 type index
 # DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined
 # DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined
@@ -579,12 +579,12 @@
 # DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined
 # DEBUG-NEXT: G_FMINIMUM (opcode {{[0-9]+}}): 1 type index
 # DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}}
-# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
-# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
+# DEBUG-NEXT: .. the first uncovered type index: 1, OK
+# DEBUG-NEXT: .. the first uncovered imm index: 0, OK
 # DEBUG-NEXT: G_FMAXIMUM (opcode {{[0-9]+}}): 1 type index
 # DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}}
-# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
-# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
+# DEBUG-NEXT: .. the first uncovered type index: 1, OK
+# DEBUG-NEXT: .. the first uncovered imm index: 0, OK
 # DEBUG-NEXT: G_GET_FPENV (opcode {{[0-9]+}}): 1 type index, 0 imm indices
 # DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
 # DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
@@ -692,8 +692,8 @@
 # DEBUG-NEXT: .. the first uncovered imm index: 0, OK
 # DEBUG-NEXT: G_FCEIL (opcode {{[0-9]+}}): 1 type index, 0 imm indices
 # DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}}
-# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
-# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
+# DEBUG-NEXT: .. the first uncovered type index: 1, OK
+# DEBUG-NEXT: .. the first uncovered imm index: 0, OK
 # DEBUG-NEXT: G_FCOS (opcode {{[0-9]+}}): 1 type index, 0 imm indices
 # DEBUG-NEXT: .. the first uncovered type index: 1, OK
 # DEBUG-NEXT: .. the first uncovered imm index: 0, OK
@@ -734,20 +734,20 @@
 # DEBUG-NEXT: .. the first uncovered imm index: 0, OK
 # DEBUG-NEXT: G_FSQRT (opcode {{[0-9]+}}): 1 type index, 0 imm indices
 # DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}}
-# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
-# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
+# DEBUG-NEXT: .. the first uncovered type index: 1, OK
+# DEBUG-NEXT: .. the first uncovered imm index: 0, OK
 # DEBUG-NEXT: G_FFLOOR (opcode {{[0-9]+}}): 1 type index, 0 imm indices
 # DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}}
-# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
-# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
+# DEBUG-NEXT: .. the first uncovered type index: 1, OK
+# DEBUG-NEXT: .. the first uncovered imm index: 0, OK
 # DEBUG-NEXT: G_FRINT (opcode {{[0-9]+}}): 1 type index, 0 imm indices
 # DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}}
-# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
-# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
+# DEBUG-NEXT: .. the first uncovered type index: 1, OK
+# DEBUG-NEXT: .. the first uncovered imm index: 0, OK
 # DEBUG-NEXT: G_FNEARBYINT (opcode {{[0-9]+}}): 1 type index, 0 imm indices
 # DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}}
-# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
-# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
+# DEBUG-NEXT: .. the first uncovered type index: 1, OK
+# DEBUG-NEXT: .. the first uncovered imm index: 0, OK
 # DEBUG-NEXT: G_ADDRSPACE_CAST (opcode {{[0-9]+}}): 2 type indices, 0 imm indices
 # DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined
 # DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined
diff --git a/llvm/test/CodeGen/AArch64/aarch64-scalarize-vec-load-ext.ll b/llvm/test/CodeGen/AArch64/aarch64-scalarize-vec-load-ext.ll
new file mode 100644
index 0000000000000000000000000000000000000000..30ce0cb09fc084c8df9c1838e0df7c49247d92d3
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/aarch64-scalarize-vec-load-ext.ll
@@ -0,0 +1,35 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s
+
+; FIXME: Currently, we avoid narrowing this v4i32 load, in the
+; hopes of being able to fold the shift, despite it requiring stack
+; storage + loads. Ideally, we should narrow here and load the i32
+; directly from the variable offset e.g:
+;
+; add x8, x0, x1, lsl #4
+; and x9, x2, #0x3
+; ldr w0, [x8, x9, lsl #2]
+;
+; The AArch64TargetLowering::shouldReduceLoadWidth heuristic should
+; probably be updated to choose load-narrowing instead of folding the
+; lsl in larger vector cases.
+;
+define i32 @narrow_load_v4_i32_single_ele_variable_idx(ptr %ptr, i64 %off, i32 %ele) {
+; CHECK-LABEL: narrow_load_v4_i32_single_ele_variable_idx:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    sub sp, sp, #16
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    ldr q0, [x0, x1, lsl #4]
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    // kill: def $w2 killed $w2 def $x2
+; CHECK-NEXT:    bfi x8, x2, #2, #2
+; CHECK-NEXT:    str q0, [sp]
+; CHECK-NEXT:    ldr w0, [x8]
+; CHECK-NEXT:    add sp, sp, #16
+; CHECK-NEXT:    ret
+entry:
+  %idx = getelementptr inbounds <4 x i32>, ptr %ptr, i64 %off
+  %x = load <4 x i32>, ptr %idx, align 8
+  %res = extractelement <4 x i32> %x, i32 %ele
+  ret i32 %res
+}
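[Editor's note] The FIXME in the new test above is the interesting part: the wide load is kept in the hope of folding the lsl into the addressing mode, but when the only consumer is a single extractelement with a variable index, that choice costs a stack spill plus reload. Below is a runnable model of the profitability question the comment raises. The struct and the heuristic are invented here for illustration; they are not the in-tree AArch64TargetLowering::shouldReduceLoadWidth logic.

    // Standalone sketch of "should we narrow this vector load?".
    #include <cstdio>

    struct WideLoad {
      unsigned VectorBits;           // e.g. 128 for <4 x i32>
      unsigned ElementBits;          // e.g. 32
      bool SingleExtractIsOnlyUser;  // load feeds exactly one extractelement
      bool IndexIsConstant;          // constant lane already loads from a fixed offset
    };

    // Narrow when the whole vector is loaded just to pull out one lane at a
    // variable index: the wide load forces a stack round-trip, while a scalar
    // load can use a register-offset addressing mode directly.
    bool preferNarrowLoad(const WideLoad &L) {
      return L.SingleExtractIsOnlyUser && !L.IndexIsConstant &&
             L.VectorBits > L.ElementBits;
    }

    int main() {
      WideLoad Case{128, 32, true, false}; // the v4i32 case in the test above
      std::printf("narrow? %s\n", preferNarrowLoad(Case) ? "yes" : "no");
    }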
diff --git a/llvm/test/CodeGen/AArch64/aarch64-sve-asm.ll b/llvm/test/CodeGen/AArch64/aarch64-sve-asm.ll
index 4ca2fb8815797f02275713f742cfe168ccc2b792..068e194779c153df5a59eebc0f5e6d56f6ca232e 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-sve-asm.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-sve-asm.ll
@@ -1,84 +1,121 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc < %s -mtriple aarch64-none-linux-gnu -mattr=+sve -stop-after=finalize-isel | FileCheck %s --check-prefix=CHECK
+; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple aarch64-none-linux-gnu -mattr=+sve2p1 -stop-after=finalize-isel | FileCheck %s --check-prefix=CHECK
 target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
 target triple = "aarch64-none-linux-gnu"
 
-; Function Attrs: nounwind readnone
-; CHECK: [[ARG1:%[0-9]+]]:zpr = COPY $z1
-; CHECK: [[ARG2:%[0-9]+]]:zpr = COPY $z0
-; CHECK: [[ARG3:%[0-9]+]]:zpr = COPY [[ARG2]]
-; CHECK: [[ARG4:%[0-9]+]]:zpr_3b = COPY [[ARG1]]
-; CHECK: INLINEASM {{.*}} [[ARG4]]
 define <vscale x 16 x i8> @test_svadd_i8(<vscale x 16 x i8> %Zn, <vscale x 16 x i8> %Zm) {
+  ; CHECK-LABEL: name: test_svadd_i8
+  ; CHECK: bb.0 (%ir-block.0):
+  ; CHECK-NEXT:   liveins: $z0, $z1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:zpr = COPY $z1
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:zpr = COPY $z0
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:zpr = COPY [[COPY1]]
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:zpr_3b = COPY [[COPY]]
+  ; CHECK-NEXT:   INLINEASM &"add $0.b, $1.b, $2.b", 0 /* attdialect */, 5046282 /* regdef:ZPR */, def %2, 5046281 /* reguse:ZPR */, [[COPY2]], 5373961 /* reguse:ZPR_3b */, [[COPY3]]
+  ; CHECK-NEXT:   $z0 = COPY %2
+  ; CHECK-NEXT:   RET_ReallyLR implicit $z0
   %1 = tail call <vscale x 16 x i8> asm "add $0.b, $1.b, $2.b", "=w,w,y"(<vscale x 16 x i8> %Zn, <vscale x 16 x i8> %Zm)
   ret <vscale x 16 x i8> %1
 }
 
-; Function Attrs: nounwind readnone
-; CHECK: [[ARG1:%[0-9]+]]:zpr = COPY $z1
-; CHECK: [[ARG2:%[0-9]+]]:zpr = COPY $z0
-; CHECK: [[ARG3:%[0-9]+]]:zpr = COPY [[ARG2]]
-; CHECK: [[ARG4:%[0-9]+]]:zpr_4b = COPY [[ARG1]]
-; CHECK: INLINEASM {{.*}} [[ARG4]]
 define <vscale x 2 x i64> @test_svsub_i64(<vscale x 2 x i64> %Zn, <vscale x 2 x i64> %Zm) {
+  ; CHECK-LABEL: name: test_svsub_i64
+  ; CHECK: bb.0 (%ir-block.0):
+  ; CHECK-NEXT:   liveins: $z0, $z1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:zpr = COPY $z1
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:zpr = COPY $z0
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:zpr = COPY [[COPY1]]
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:zpr_4b = COPY [[COPY]]
+  ; CHECK-NEXT:   INLINEASM &"sub $0.d, $1.d, $2.d", 0 /* attdialect */, 5046282 /* regdef:ZPR */, def %2, 5046281 /* reguse:ZPR */, [[COPY2]], 5242889 /* reguse:ZPR_4b */, [[COPY3]]
+  ; CHECK-NEXT:   $z0 = COPY %2
+  ; CHECK-NEXT:   RET_ReallyLR implicit $z0
   %1 = tail call <vscale x 2 x i64> asm "sub $0.d, $1.d, $2.d", "=w,w,x"(<vscale x 2 x i64> %Zn, <vscale x 2 x i64> %Zm)
   ret <vscale x 2 x i64> %1
 }
 
-; Function Attrs: nounwind readnone
-; CHECK: [[ARG1:%[0-9]+]]:zpr = COPY $z1
-; CHECK: [[ARG2:%[0-9]+]]:zpr = COPY $z0
-; CHECK: [[ARG3:%[0-9]+]]:zpr = COPY [[ARG2]]
-; CHECK: [[ARG4:%[0-9]+]]:zpr_3b = COPY [[ARG1]]
-; CHECK: INLINEASM {{.*}} [[ARG4]]
 define <vscale x 8 x half> @test_svfmul_f16(<vscale x 8 x half> %Zn, <vscale x 8 x half> %Zm) {
+  ; CHECK-LABEL: name: test_svfmul_f16
+  ; CHECK: bb.0 (%ir-block.0):
+  ; CHECK-NEXT:   liveins: $z0, $z1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:zpr = COPY $z1
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:zpr = COPY $z0
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:zpr = COPY [[COPY1]]
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:zpr_3b = COPY [[COPY]]
+  ; CHECK-NEXT:   INLINEASM &"fmul $0.h, $1.h, $2.h", 0 /* attdialect */, 5046282 /* regdef:ZPR */, def %2, 5046281 /* reguse:ZPR */, [[COPY2]], 5373961 /* reguse:ZPR_3b */, [[COPY3]]
+  ; CHECK-NEXT:   $z0 = COPY %2
+  ; CHECK-NEXT:   RET_ReallyLR implicit $z0
   %1 = tail call <vscale x 8 x half> asm "fmul $0.h, $1.h, $2.h", "=w,w,y"(<vscale x 8 x half> %Zn, <vscale x 8 x half> %Zm)
   ret <vscale x 8 x half> %1
 }
 
-; Function Attrs: nounwind readnone
-; CHECK: [[ARG1:%[0-9]+]]:zpr = COPY $z1
-; CHECK: [[ARG2:%[0-9]+]]:zpr = COPY $z0
-; CHECK: [[ARG3:%[0-9]+]]:zpr = COPY [[ARG2]]
-; CHECK: [[ARG4:%[0-9]+]]:zpr_4b = COPY [[ARG1]]
-; CHECK: INLINEASM {{.*}} [[ARG4]]
 define <vscale x 4 x float> @test_svfmul_f(<vscale x 4 x float> %Zn, <vscale x 4 x float> %Zm) {
+  ; CHECK-LABEL: name: test_svfmul_f
+  ; CHECK: bb.0 (%ir-block.0):
+  ; CHECK-NEXT:   liveins: $z0, $z1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:zpr = COPY $z1
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:zpr = COPY $z0
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:zpr = COPY [[COPY1]]
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:zpr_4b = COPY [[COPY]]
+  ; CHECK-NEXT:   INLINEASM &"fmul $0.s, $1.s, $2.s", 0 /* attdialect */, 5046282 /* regdef:ZPR */, def %2, 5046281 /* reguse:ZPR */, [[COPY2]], 5242889 /* reguse:ZPR_4b */, [[COPY3]]
+  ; CHECK-NEXT:   $z0 = COPY %2
+  ; CHECK-NEXT:   RET_ReallyLR implicit $z0
   %1 = tail call <vscale x 4 x float> asm "fmul $0.s, $1.s, $2.s", "=w,w,x"(<vscale x 4 x float> %Zn, <vscale x 4 x float> %Zm)
   ret <vscale x 4 x float> %1
 }
 
-; Function Attrs: nounwind readnone
-; CHECK: [[ARG1:%[0-9]+]]:zpr = COPY $z1
-; CHECK: [[ARG2:%[0-9]+]]:zpr = COPY $z0
-; CHECK: [[ARG3:%[0-9]+]]:ppr = COPY $p0
-; CHECK: [[ARG4:%[0-9]+]]:ppr_3b = COPY [[ARG3]]
-; CHECK: INLINEASM {{.*}} [[ARG4]]
 define <vscale x 8 x half> @test_svfadd_f16(<vscale x 8 x i1> %Pg, <vscale x 8 x half> %Zn, <vscale x 8 x half> %Zm) {
+  ; CHECK-LABEL: name: test_svfadd_f16
+  ; CHECK: bb.0 (%ir-block.0):
+  ; CHECK-NEXT:   liveins: $p0, $z0, $z1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:zpr = COPY $z1
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:zpr = COPY $z0
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:ppr = COPY $p0
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:ppr_3b = COPY [[COPY2]]
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:zpr = COPY [[COPY1]]
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:zpr = COPY [[COPY]]
+  ; CHECK-NEXT:   INLINEASM &"fadd $0.h, $1/m, $2.h, $3.h", 0 /* attdialect */, 5046282 /* regdef:ZPR */, def %3, 589833 /* reguse:PPR_3b */, [[COPY3]], 5046281 /* reguse:ZPR */, [[COPY4]], 5046281 /* reguse:ZPR */, [[COPY5]]
+  ; CHECK-NEXT:   $z0 = COPY %3
+  ; CHECK-NEXT:   RET_ReallyLR implicit $z0
   %1 = tail call <vscale x 8 x half> asm "fadd $0.h, $1/m, $2.h, $3.h", "=w,@3Upl,w,w"(<vscale x 8 x i1> %Pg, <vscale x 8 x half> %Zn, <vscale x 8 x half> %Zm)
   ret <vscale x 8 x half> %1
 }
 
-; Function Attrs: nounwind readnone
-; CHECK: [[ARG1:%[0-9]+]]:zpr = COPY $z0
-; CHECK: [[ARG2:%[0-9]+]]:ppr = COPY $p0
-; CHECK: [[ARG3:%[0-9]+]]:ppr = COPY [[ARG2]]
-; CHECK: [[ARG4:%[0-9]+]]:zpr = COPY [[ARG1]]
-; CHECK: INLINEASM {{.*}} [[ARG3]]
 define <vscale x 4 x i32> @test_incp(<vscale x 16 x i1> %Pg, <vscale x 4 x i32> %Zn) {
+  ; CHECK-LABEL: name: test_incp
+  ; CHECK: bb.0 (%ir-block.0):
+  ; CHECK-NEXT:   liveins: $p0, $z0
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:zpr = COPY $z0
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:ppr = COPY $p0
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:ppr = COPY [[COPY1]]
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:zpr = COPY [[COPY]]
+  ; CHECK-NEXT:   INLINEASM &"incp $0.s, $1", 0 /* attdialect */, 5046282 /* regdef:ZPR */, def %2, 393225 /* reguse:PPR */, [[COPY2]], 2147483657 /* reguse tiedto:$0 */, [[COPY3]](tied-def 3)
+  ; CHECK-NEXT:   $z0 = COPY %2
+  ; CHECK-NEXT:   RET_ReallyLR implicit $z0
   %1 = tail call <vscale x 4 x i32> asm "incp $0.s, $1", "=w,@3Upa,0"(<vscale x 16 x i1> %Pg, <vscale x 4 x i32> %Zn)
   ret <vscale x 4 x i32> %1
 }
 
-; Function Attrs: nounwind readnone
-; CHECK: [[ARG1:%[0-9]+]]:zpr = COPY $z1
-; CHECK: [[ARG2:%[0-9]+]]:zpr = COPY $z0
-; CHECK: [[ARG3:%[0-9]+]]:ppr = COPY $p0
-; CHECK: [[ARG4:%[0-9]+]]:ppr_p8to15 = COPY [[ARG3]]
-; CHECK: INLINEASM {{.*}} [[ARG4]]
 define <vscale x 8 x half> @test_svfadd_f16_Uph_constraint(<vscale x 8 x i1> %Pg, <vscale x 8 x half> %Zn, <vscale x 8 x half> %Zm) {
+  ; CHECK-LABEL: name: test_svfadd_f16_Uph_constraint
+  ; CHECK: bb.0 (%ir-block.0):
+  ; CHECK-NEXT:   liveins: $p0, $z0, $z1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:zpr = COPY $z1
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:zpr = COPY $z0
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:ppr = COPY $p0
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:ppr_p8to15 = COPY [[COPY2]]
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:zpr = COPY [[COPY1]]
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:zpr = COPY [[COPY]]
+  ; CHECK-NEXT:   INLINEASM &"fadd $0.h, $1/m, $2.h, $3.h", 0 /* attdialect */, 5046282 /* regdef:ZPR */, def %3, 655369 /* reguse:PPR_p8to15 */, [[COPY3]], 5046281 /* reguse:ZPR */, [[COPY4]], 5046281 /* reguse:ZPR */, [[COPY5]]
+  ; CHECK-NEXT:   $z0 = COPY %3
+  ; CHECK-NEXT:   RET_ReallyLR implicit $z0
   %1 = tail call <vscale x 8 x half> asm "fadd $0.h, $1/m, $2.h, $3.h", "=w,@3Uph,w,w"(<vscale x 8 x i1> %Pg, <vscale x 8 x half> %Zn, <vscale x 8 x half> %Zm)
   ret <vscale x 8 x half> %1
 }
-;; NOTE: These prefixes are unused and the list is autogenerated.
Do not add tests below this line: -; CHECK: {{.*}} diff --git a/llvm/test/CodeGen/AArch64/early-ifcvt-likely-predictable.mir b/llvm/test/CodeGen/AArch64/early-ifcvt-likely-predictable.mir index 425a23214871d68a8128570bf69371139d015931..ab5e320725d5bd8d5261984bdfc0aa11fbb036df 100644 --- a/llvm/test/CodeGen/AArch64/early-ifcvt-likely-predictable.mir +++ b/llvm/test/CodeGen/AArch64/early-ifcvt-likely-predictable.mir @@ -1,5 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=arm64-apple-ios -mcpu=apple-m1 -run-pass=early-ifcvt -o - %s | FileCheck %s +# RUN: llc -mtriple=arm64-apple-ios -mcpu=apple-m1 -passes=early-ifcvt -o - %s | FileCheck %s --- | define void @test_cond_is_load_with_invariant_ops() { diff --git a/llvm/test/CodeGen/AArch64/early-ifcvt-regclass-mismatch.mir b/llvm/test/CodeGen/AArch64/early-ifcvt-regclass-mismatch.mir index 318bdceeaef41dadab8ffe004c6b196bf394b5f2..a7f67f8b682c3c2ef7623fd18343e88d8105f213 100644 --- a/llvm/test/CodeGen/AArch64/early-ifcvt-regclass-mismatch.mir +++ b/llvm/test/CodeGen/AArch64/early-ifcvt-regclass-mismatch.mir @@ -1,4 +1,5 @@ # RUN: llc -mtriple=aarch64-unknown-unknown -run-pass=early-ifcvt -verify-machineinstrs %s -o - | FileCheck %s +# RUN: llc -mtriple=aarch64-unknown-unknown -passes=early-ifcvt -verify-each %s -o - | FileCheck %s --- | target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" target triple = "arm64-apple-ios13.3.0" diff --git a/llvm/test/CodeGen/AArch64/early-ifcvt-same-value.mir b/llvm/test/CodeGen/AArch64/early-ifcvt-same-value.mir index b9298608e192fc442b0338601af344e85ab66158..16d5dfc78f564b156a2479f4eed696c849083f7c 100644 --- a/llvm/test/CodeGen/AArch64/early-ifcvt-same-value.mir +++ b/llvm/test/CodeGen/AArch64/early-ifcvt-same-value.mir @@ -1,5 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=aarch64-- -run-pass=early-ifcvt -stress-early-ifcvt -verify-machineinstrs %s -o - | FileCheck %s +# RUN: llc -mtriple=aarch64-- -passes=early-ifcvt -stress-early-ifcvt %s -o - | FileCheck %s --- name: fmov0 diff --git a/llvm/test/CodeGen/AArch64/naked-fn-with-frame-pointer.ll b/llvm/test/CodeGen/AArch64/naked-fn-with-frame-pointer.ll new file mode 100644 index 0000000000000000000000000000000000000000..fb559867a2d47b8c252b01b19853e611c2aabb66 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/naked-fn-with-frame-pointer.ll @@ -0,0 +1,39 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple aarch64 | FileCheck %s -check-prefixes=CHECK-LE +; RUN: llc < %s -mtriple aarch64_be | FileCheck %s -check-prefixes=CHECK-BE + +declare dso_local void @main() + +define dso_local void @naked() naked "frame-pointer"="all" { +; CHECK-LE-LABEL: naked: +; CHECK-LE: // %bb.0: +; CHECK-LE-NEXT: bl main +; +; CHECK-BE-LABEL: naked: +; CHECK-BE: // %bb.0: +; CHECK-BE-NEXT: bl main + call void @main() + unreachable +} + +define dso_local void @normal() "frame-pointer"="all" { +; CHECK-LE-LABEL: normal: +; CHECK-LE: // %bb.0: +; CHECK-LE-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill +; CHECK-LE-NEXT: mov x29, sp +; CHECK-LE-NEXT: .cfi_def_cfa w29, 16 +; CHECK-LE-NEXT: .cfi_offset w30, -8 +; CHECK-LE-NEXT: .cfi_offset w29, -16 +; CHECK-LE-NEXT: bl main +; +; CHECK-BE-LABEL: normal: +; CHECK-BE: // %bb.0: +; CHECK-BE-NEXT: stp x29, x30, [sp, #-16]! 
// 16-byte Folded Spill +; CHECK-BE-NEXT: mov x29, sp +; CHECK-BE-NEXT: .cfi_def_cfa w29, 16 +; CHECK-BE-NEXT: .cfi_offset w30, -8 +; CHECK-BE-NEXT: .cfi_offset w29, -16 +; CHECK-BE-NEXT: bl main + call void @main() + unreachable +} diff --git a/llvm/test/CodeGen/AArch64/ptrauth-type-info-vptr-discr.ll b/llvm/test/CodeGen/AArch64/ptrauth-type-info-vptr-discr.ll new file mode 100644 index 0000000000000000000000000000000000000000..fbd777911aecb20d945205690cc7381b995ac462 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/ptrauth-type-info-vptr-discr.ll @@ -0,0 +1,21 @@ +; RUN: llc -mtriple aarch64-linux-gnu -mattr=+pauth -filetype=asm -o - %s | FileCheck --check-prefix=ELF %s +; RUN: llc -mtriple aarch64-apple-darwin -mattr=+pauth -filetype=asm -o - %s | FileCheck --check-prefix=MACHO %s + +; ELF-LABEL: _ZTI10Disc: +; ELF-NEXT: .xword (_ZTVN10__cxxabiv117__class_type_infoE+16)@AUTH(da,45546,addr) +; ELF-LABEL: _ZTI10NoDisc: +; ELF-NEXT: .xword (_ZTVN10__cxxabiv117__class_type_infoE+16)@AUTH(da,45546) + +; MACHO-LABEL: __ZTI10Disc: +; MACHO-NEXT: .quad (__ZTVN10__cxxabiv117__class_type_infoE+16)@AUTH(da,45546,addr) +; MACHO-LABEL: __ZTI10NoDisc: +; MACHO-NEXT: .quad (__ZTVN10__cxxabiv117__class_type_infoE+16)@AUTH(da,45546) + + +@_ZTI10Disc = constant { ptr, ptr } { ptr ptrauth (ptr getelementptr inbounds (ptr, ptr @_ZTVN10__cxxabiv117__class_type_infoE, i64 2), i32 2, i64 45546, ptr @_ZTI10Disc), ptr @_ZTS10Disc }, align 8 +@_ZTS10Disc = constant [4 x i8] c"Disc", align 1 + +@_ZTI10NoDisc = constant { ptr, ptr } { ptr ptrauth (ptr getelementptr inbounds (ptr, ptr @_ZTVN10__cxxabiv117__class_type_infoE, i64 2), i32 2, i64 45546), ptr @_ZTS10NoDisc }, align 8 +@_ZTS10NoDisc = constant [6 x i8] c"NoDisc", align 1 + +@_ZTVN10__cxxabiv117__class_type_infoE = external global [0 x ptr] diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-fp-int-fp.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-fp-int-fp.ll new file mode 100644 index 0000000000000000000000000000000000000000..9aadf3133ba19706004d136d9b559856f436ee4d --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-fp-int-fp.ll @@ -0,0 +1,121 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -force-streaming-compatible < %s | FileCheck %s +; RUN: llc < %s | FileCheck %s --check-prefix=NON-STREAMING + +target triple = "aarch64-unknown-linux-gnu" + +define double @t1(double %x) { +; CHECK-LABEL: t1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fcvtzs x8, d0 +; CHECK-NEXT: scvtf d0, x8 +; CHECK-NEXT: ret +; +; NON-STREAMING-LABEL: t1: +; NON-STREAMING: // %bb.0: // %entry +; NON-STREAMING-NEXT: fcvtzs d0, d0 +; NON-STREAMING-NEXT: scvtf d0, d0 +; NON-STREAMING-NEXT: ret +entry: + %conv = fptosi double %x to i64 + %conv1 = sitofp i64 %conv to double + ret double %conv1 +} + +define float @t2(float %x) { +; CHECK-LABEL: t2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fcvtzs w8, s0 +; CHECK-NEXT: scvtf s0, w8 +; CHECK-NEXT: ret +; +; NON-STREAMING-LABEL: t2: +; NON-STREAMING: // %bb.0: // %entry +; NON-STREAMING-NEXT: fcvtzs s0, s0 +; NON-STREAMING-NEXT: scvtf s0, s0 +; NON-STREAMING-NEXT: ret +entry: + %conv = fptosi float %x to i32 + %conv1 = sitofp i32 %conv to float + ret float %conv1 +} + +define half @t3(half %x) { +; CHECK-LABEL: t3: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fcvt s0, h0 +; CHECK-NEXT: fcvtzs w8, s0 +; CHECK-NEXT: scvtf s0, w8 +; CHECK-NEXT: fcvt h0, s0 +; CHECK-NEXT: ret +; +; NON-STREAMING-LABEL: t3: +; NON-STREAMING: // %bb.0: // %entry +; 
NON-STREAMING-NEXT: fcvt s0, h0 +; NON-STREAMING-NEXT: fcvtzs s0, s0 +; NON-STREAMING-NEXT: scvtf s0, s0 +; NON-STREAMING-NEXT: fcvt h0, s0 +; NON-STREAMING-NEXT: ret +entry: + %conv = fptosi half %x to i32 + %conv1 = sitofp i32 %conv to half + ret half %conv1 +} + +define double @t4(double %x) { +; CHECK-LABEL: t4: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fcvtzu x8, d0 +; CHECK-NEXT: ucvtf d0, x8 +; CHECK-NEXT: ret +; +; NON-STREAMING-LABEL: t4: +; NON-STREAMING: // %bb.0: // %entry +; NON-STREAMING-NEXT: fcvtzu d0, d0 +; NON-STREAMING-NEXT: ucvtf d0, d0 +; NON-STREAMING-NEXT: ret +entry: + %conv = fptoui double %x to i64 + %conv1 = uitofp i64 %conv to double + ret double %conv1 +} + +define float @t5(float %x) { +; CHECK-LABEL: t5: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fcvtzu w8, s0 +; CHECK-NEXT: ucvtf s0, w8 +; CHECK-NEXT: ret +; +; NON-STREAMING-LABEL: t5: +; NON-STREAMING: // %bb.0: // %entry +; NON-STREAMING-NEXT: fcvtzu s0, s0 +; NON-STREAMING-NEXT: ucvtf s0, s0 +; NON-STREAMING-NEXT: ret +entry: + %conv = fptoui float %x to i32 + %conv1 = uitofp i32 %conv to float + ret float %conv1 +} + +define half @t6(half %x) { +; CHECK-LABEL: t6: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fcvt s0, h0 +; CHECK-NEXT: fcvtzu w8, s0 +; CHECK-NEXT: ucvtf s0, w8 +; CHECK-NEXT: fcvt h0, s0 +; CHECK-NEXT: ret +; +; NON-STREAMING-LABEL: t6: +; NON-STREAMING: // %bb.0: // %entry +; NON-STREAMING-NEXT: fcvt s0, h0 +; NON-STREAMING-NEXT: fcvtzu s0, s0 +; NON-STREAMING-NEXT: ucvtf s0, s0 +; NON-STREAMING-NEXT: fcvt h0, s0 +; NON-STREAMING-NEXT: ret +entry: + %conv = fptoui half %x to i32 + %conv1 = uitofp i32 %conv to half + ret half %conv1 +} diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll index afd3bb7161c155cecd6733114c0b51129100b83e..0c712a15d4de2f15da73051c73ef8d54bcd33f3d 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll @@ -21,20 +21,20 @@ define <4 x half> @ucvtf_v4i16_v4f16(<4 x i16> %op1) { ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! ; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 -; NONEON-NOSVE-NEXT: ldr h0, [sp, #6] -; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #6] +; NONEON-NOSVE-NEXT: ucvtf s0, w8 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #4] ; NONEON-NOSVE-NEXT: fcvt h0, s0 ; NONEON-NOSVE-NEXT: str h0, [sp, #14] -; NONEON-NOSVE-NEXT: ldr h0, [sp, #4] -; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: ucvtf s0, w8 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #2] ; NONEON-NOSVE-NEXT: fcvt h0, s0 ; NONEON-NOSVE-NEXT: str h0, [sp, #12] -; NONEON-NOSVE-NEXT: ldr h0, [sp, #2] -; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: ucvtf s0, w8 +; NONEON-NOSVE-NEXT: ldrh w8, [sp] ; NONEON-NOSVE-NEXT: fcvt h0, s0 ; NONEON-NOSVE-NEXT: str h0, [sp, #10] -; NONEON-NOSVE-NEXT: ldr h0, [sp] -; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: ucvtf s0, w8 ; NONEON-NOSVE-NEXT: fcvt h0, s0 ; NONEON-NOSVE-NEXT: str h0, [sp, #8] ; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] @@ -58,36 +58,36 @@ define void @ucvtf_v8i16_v8f16(ptr %a, ptr %b) { ; NONEON-NOSVE-NEXT: ldr q0, [x0] ; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! 
; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 -; NONEON-NOSVE-NEXT: ldr h0, [sp, #14] -; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #14] +; NONEON-NOSVE-NEXT: ucvtf s0, w8 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #12] ; NONEON-NOSVE-NEXT: fcvt h0, s0 ; NONEON-NOSVE-NEXT: str h0, [sp, #30] -; NONEON-NOSVE-NEXT: ldr h0, [sp, #12] -; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: ucvtf s0, w8 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #10] ; NONEON-NOSVE-NEXT: fcvt h0, s0 ; NONEON-NOSVE-NEXT: str h0, [sp, #28] -; NONEON-NOSVE-NEXT: ldr h0, [sp, #10] -; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: ucvtf s0, w8 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #8] ; NONEON-NOSVE-NEXT: fcvt h0, s0 ; NONEON-NOSVE-NEXT: str h0, [sp, #26] -; NONEON-NOSVE-NEXT: ldr h0, [sp, #8] -; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: ucvtf s0, w8 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #6] ; NONEON-NOSVE-NEXT: fcvt h0, s0 ; NONEON-NOSVE-NEXT: str h0, [sp, #24] -; NONEON-NOSVE-NEXT: ldr h0, [sp, #6] -; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: ucvtf s0, w8 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #4] ; NONEON-NOSVE-NEXT: fcvt h0, s0 ; NONEON-NOSVE-NEXT: str h0, [sp, #22] -; NONEON-NOSVE-NEXT: ldr h0, [sp, #4] -; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: ucvtf s0, w8 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #2] ; NONEON-NOSVE-NEXT: fcvt h0, s0 ; NONEON-NOSVE-NEXT: str h0, [sp, #20] -; NONEON-NOSVE-NEXT: ldr h0, [sp, #2] -; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: ucvtf s0, w8 +; NONEON-NOSVE-NEXT: ldrh w8, [sp] ; NONEON-NOSVE-NEXT: fcvt h0, s0 ; NONEON-NOSVE-NEXT: str h0, [sp, #18] -; NONEON-NOSVE-NEXT: ldr h0, [sp] -; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: ucvtf s0, w8 ; NONEON-NOSVE-NEXT: fcvt h0, s0 ; NONEON-NOSVE-NEXT: str h0, [sp, #16] ; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] @@ -115,68 +115,68 @@ define void @ucvtf_v16i16_v16f16(ptr %a, ptr %b) { ; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] ; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! 
; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 -; NONEON-NOSVE-NEXT: ldr h0, [sp, #30] -; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #30] +; NONEON-NOSVE-NEXT: ucvtf s0, w8 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #28] ; NONEON-NOSVE-NEXT: fcvt h0, s0 ; NONEON-NOSVE-NEXT: str h0, [sp, #62] -; NONEON-NOSVE-NEXT: ldr h0, [sp, #28] -; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: ucvtf s0, w8 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #26] ; NONEON-NOSVE-NEXT: fcvt h0, s0 ; NONEON-NOSVE-NEXT: str h0, [sp, #60] -; NONEON-NOSVE-NEXT: ldr h0, [sp, #26] -; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: ucvtf s0, w8 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #24] ; NONEON-NOSVE-NEXT: fcvt h0, s0 ; NONEON-NOSVE-NEXT: str h0, [sp, #58] -; NONEON-NOSVE-NEXT: ldr h0, [sp, #24] -; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: ucvtf s0, w8 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #22] ; NONEON-NOSVE-NEXT: fcvt h0, s0 ; NONEON-NOSVE-NEXT: str h0, [sp, #56] -; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] -; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: ucvtf s0, w8 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20] ; NONEON-NOSVE-NEXT: fcvt h0, s0 ; NONEON-NOSVE-NEXT: str h0, [sp, #54] -; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] -; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: ucvtf s0, w8 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] ; NONEON-NOSVE-NEXT: fcvt h0, s0 ; NONEON-NOSVE-NEXT: str h0, [sp, #52] -; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] -; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: ucvtf s0, w8 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16] ; NONEON-NOSVE-NEXT: fcvt h0, s0 ; NONEON-NOSVE-NEXT: str h0, [sp, #50] -; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] -; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: ucvtf s0, w8 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #14] ; NONEON-NOSVE-NEXT: fcvt h0, s0 ; NONEON-NOSVE-NEXT: str h0, [sp, #48] -; NONEON-NOSVE-NEXT: ldr h0, [sp, #14] -; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: ucvtf s0, w8 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #12] ; NONEON-NOSVE-NEXT: fcvt h0, s0 ; NONEON-NOSVE-NEXT: str h0, [sp, #46] -; NONEON-NOSVE-NEXT: ldr h0, [sp, #12] -; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: ucvtf s0, w8 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #10] ; NONEON-NOSVE-NEXT: fcvt h0, s0 ; NONEON-NOSVE-NEXT: str h0, [sp, #44] -; NONEON-NOSVE-NEXT: ldr h0, [sp, #10] -; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: ucvtf s0, w8 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #8] ; NONEON-NOSVE-NEXT: fcvt h0, s0 ; NONEON-NOSVE-NEXT: str h0, [sp, #42] -; NONEON-NOSVE-NEXT: ldr h0, [sp, #8] -; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: ucvtf s0, w8 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #6] ; NONEON-NOSVE-NEXT: fcvt h0, s0 ; NONEON-NOSVE-NEXT: str h0, [sp, #40] -; NONEON-NOSVE-NEXT: ldr h0, [sp, #6] -; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: ucvtf s0, w8 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #4] ; NONEON-NOSVE-NEXT: fcvt h0, s0 ; NONEON-NOSVE-NEXT: str h0, [sp, #38] -; NONEON-NOSVE-NEXT: ldr h0, [sp, #4] -; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: ucvtf s0, w8 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #2] ; NONEON-NOSVE-NEXT: fcvt h0, s0 ; NONEON-NOSVE-NEXT: str h0, [sp, #36] -; NONEON-NOSVE-NEXT: ldr h0, [sp, #2] -; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: ucvtf s0, w8 +; NONEON-NOSVE-NEXT: ldrh w8, [sp] ; NONEON-NOSVE-NEXT: fcvt h0, s0 ; NONEON-NOSVE-NEXT: str h0, [sp, #34] -; NONEON-NOSVE-NEXT: ldr h0, [sp] -; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: ucvtf s0, w8 ; NONEON-NOSVE-NEXT: fcvt h0, s0 
; NONEON-NOSVE-NEXT: str h0, [sp, #32] ; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] @@ -207,11 +207,11 @@ define <2 x float> @ucvtf_v2i16_v2f32(<2 x i16> %op1) { ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! ; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 -; NONEON-NOSVE-NEXT: ldr h0, [sp, #4] -; NONEON-NOSVE-NEXT: ucvtf s1, s0 -; NONEON-NOSVE-NEXT: ldr h0, [sp] -; NONEON-NOSVE-NEXT: ucvtf s0, s0 -; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #8] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #4] +; NONEON-NOSVE-NEXT: ldrh w9, [sp] +; NONEON-NOSVE-NEXT: ucvtf s0, w8 +; NONEON-NOSVE-NEXT: ucvtf s1, w9 +; NONEON-NOSVE-NEXT: stp s1, s0, [sp, #8] ; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] ; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret @@ -234,15 +234,15 @@ define <4 x float> @ucvtf_v4i16_v4f32(<4 x i16> %op1) { ; NONEON-NOSVE-NEXT: sub sp, sp, #32 ; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 ; NONEON-NOSVE-NEXT: str d0, [sp, #8] -; NONEON-NOSVE-NEXT: ldr h0, [sp, #14] -; NONEON-NOSVE-NEXT: ucvtf s1, s0 -; NONEON-NOSVE-NEXT: ldr h0, [sp, #12] -; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #14] +; NONEON-NOSVE-NEXT: ucvtf s1, w8 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #12] +; NONEON-NOSVE-NEXT: ucvtf s0, w8 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #10] ; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #24] -; NONEON-NOSVE-NEXT: ldr h0, [sp, #10] -; NONEON-NOSVE-NEXT: ucvtf s1, s0 -; NONEON-NOSVE-NEXT: ldr h0, [sp, #8] -; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: ucvtf s1, w8 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #8] +; NONEON-NOSVE-NEXT: ucvtf s0, w8 ; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #16] ; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] ; NONEON-NOSVE-NEXT: add sp, sp, #32 @@ -271,25 +271,25 @@ define void @ucvtf_v8i16_v8f32(ptr %a, ptr %b) { ; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 ; NONEON-NOSVE-NEXT: ldp d1, d0, [sp] ; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #16] -; NONEON-NOSVE-NEXT: ldr h0, [sp, #30] -; NONEON-NOSVE-NEXT: ucvtf s1, s0 -; NONEON-NOSVE-NEXT: ldr h0, [sp, #28] -; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #30] +; NONEON-NOSVE-NEXT: ucvtf s1, w8 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #28] +; NONEON-NOSVE-NEXT: ucvtf s0, w8 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #26] ; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #56] -; NONEON-NOSVE-NEXT: ldr h0, [sp, #26] -; NONEON-NOSVE-NEXT: ucvtf s1, s0 -; NONEON-NOSVE-NEXT: ldr h0, [sp, #24] -; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: ucvtf s1, w8 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #24] +; NONEON-NOSVE-NEXT: ucvtf s0, w8 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #22] ; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #48] -; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] -; NONEON-NOSVE-NEXT: ucvtf s1, s0 -; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] -; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: ucvtf s1, w8 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20] +; NONEON-NOSVE-NEXT: ucvtf s0, w8 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] ; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #40] -; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] -; NONEON-NOSVE-NEXT: ucvtf s1, s0 -; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] -; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: ucvtf s1, w8 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16] +; NONEON-NOSVE-NEXT: ucvtf s0, w8 ; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #32] ; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] ; NONEON-NOSVE-NEXT: stp q1, q0, [x1] @@ -328,47 +328,47 @@ define void @ucvtf_v16i16_v16f32(ptr %a, ptr %b) { ; NONEON-NOSVE-NEXT: ldp d1, d0, [sp] ; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #32] ; NONEON-NOSVE-NEXT: ldp d1, d0, 
[sp, #16] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #46] ; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #48] -; NONEON-NOSVE-NEXT: ldr h0, [sp, #46] -; NONEON-NOSVE-NEXT: ucvtf s1, s0 -; NONEON-NOSVE-NEXT: ldr h0, [sp, #44] -; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: ucvtf s1, w8 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #44] +; NONEON-NOSVE-NEXT: ucvtf s0, w8 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #42] ; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #88] -; NONEON-NOSVE-NEXT: ldr h0, [sp, #42] -; NONEON-NOSVE-NEXT: ucvtf s1, s0 -; NONEON-NOSVE-NEXT: ldr h0, [sp, #40] -; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: ucvtf s1, w8 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #40] +; NONEON-NOSVE-NEXT: ucvtf s0, w8 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #38] ; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #80] -; NONEON-NOSVE-NEXT: ldr h0, [sp, #38] -; NONEON-NOSVE-NEXT: ucvtf s1, s0 -; NONEON-NOSVE-NEXT: ldr h0, [sp, #36] -; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: ucvtf s1, w8 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #36] +; NONEON-NOSVE-NEXT: ucvtf s0, w8 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #34] ; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #72] -; NONEON-NOSVE-NEXT: ldr h0, [sp, #34] -; NONEON-NOSVE-NEXT: ucvtf s1, s0 -; NONEON-NOSVE-NEXT: ldr h0, [sp, #32] -; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: ucvtf s1, w8 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #32] +; NONEON-NOSVE-NEXT: ucvtf s0, w8 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #62] ; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #64] -; NONEON-NOSVE-NEXT: ldr h0, [sp, #62] +; NONEON-NOSVE-NEXT: ucvtf s1, w8 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #60] ; NONEON-NOSVE-NEXT: ldp q3, q2, [sp, #64] -; NONEON-NOSVE-NEXT: ucvtf s1, s0 -; NONEON-NOSVE-NEXT: ldr h0, [sp, #60] -; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: ucvtf s0, w8 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #58] ; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #120] -; NONEON-NOSVE-NEXT: ldr h0, [sp, #58] -; NONEON-NOSVE-NEXT: ucvtf s1, s0 -; NONEON-NOSVE-NEXT: ldr h0, [sp, #56] -; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: ucvtf s1, w8 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #56] +; NONEON-NOSVE-NEXT: ucvtf s0, w8 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #54] ; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #112] -; NONEON-NOSVE-NEXT: ldr h0, [sp, #54] -; NONEON-NOSVE-NEXT: ucvtf s1, s0 -; NONEON-NOSVE-NEXT: ldr h0, [sp, #52] -; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: ucvtf s1, w8 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #52] +; NONEON-NOSVE-NEXT: ucvtf s0, w8 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #50] ; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #104] -; NONEON-NOSVE-NEXT: ldr h0, [sp, #50] -; NONEON-NOSVE-NEXT: ucvtf s1, s0 -; NONEON-NOSVE-NEXT: ldr h0, [sp, #48] -; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: ucvtf s1, w8 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #48] +; NONEON-NOSVE-NEXT: ucvtf s0, w8 ; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #96] ; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #96] ; NONEON-NOSVE-NEXT: stp q2, q3, [x1] @@ -399,8 +399,8 @@ define <1 x double> @ucvtf_v1i16_v1f64(<1 x i16> %op1) { ; NONEON-NOSVE-NEXT: sub sp, sp, #16 ; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 ; NONEON-NOSVE-NEXT: str d0, [sp, #8] -; NONEON-NOSVE-NEXT: ldr h0, [sp, #8] -; NONEON-NOSVE-NEXT: ucvtf d0, d0 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #8] +; NONEON-NOSVE-NEXT: ucvtf d0, w8 ; NONEON-NOSVE-NEXT: str d0, [sp] ; NONEON-NOSVE-NEXT: ldr d0, [sp], #16 ; NONEON-NOSVE-NEXT: ret @@ -424,11 +424,11 @@ define <2 x double> @ucvtf_v2i16_v2f64(<2 x i16> %op1) { ; NONEON-NOSVE-NEXT: sub sp, sp, #32 ; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 ; 
NONEON-NOSVE-NEXT: str d0, [sp, #8] -; NONEON-NOSVE-NEXT: ldr h0, [sp, #12] -; NONEON-NOSVE-NEXT: ucvtf d1, d0 -; NONEON-NOSVE-NEXT: ldr h0, [sp, #8] -; NONEON-NOSVE-NEXT: ucvtf d0, d0 -; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #16] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #12] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #8] +; NONEON-NOSVE-NEXT: ucvtf d0, w8 +; NONEON-NOSVE-NEXT: ucvtf d1, w9 +; NONEON-NOSVE-NEXT: stp d1, d0, [sp, #16] ; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] ; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret @@ -464,15 +464,13 @@ define void @ucvtf_v4i16_v4f64(ptr %a, ptr %b) { ; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #16] ; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #16] ; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #32] -; NONEON-NOSVE-NEXT: ldr s0, [sp, #44] -; NONEON-NOSVE-NEXT: ucvtf d1, d0 -; NONEON-NOSVE-NEXT: ldr s0, [sp, #40] -; NONEON-NOSVE-NEXT: ucvtf d0, d0 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #40] +; NONEON-NOSVE-NEXT: ucvtf d1, w9 +; NONEON-NOSVE-NEXT: ucvtf d0, w8 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #32] ; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #64] -; NONEON-NOSVE-NEXT: ldr s0, [sp, #36] -; NONEON-NOSVE-NEXT: ucvtf d1, d0 -; NONEON-NOSVE-NEXT: ldr s0, [sp, #32] -; NONEON-NOSVE-NEXT: ucvtf d0, d0 +; NONEON-NOSVE-NEXT: ucvtf d1, w9 +; NONEON-NOSVE-NEXT: ucvtf d0, w8 ; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #48] ; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #48] ; NONEON-NOSVE-NEXT: stp q1, q0, [x1] @@ -529,27 +527,23 @@ define void @ucvtf_v8i16_v8f64(ptr %a, ptr %b) { ; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #32] ; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #80] ; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #32] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #88] ; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #64] -; NONEON-NOSVE-NEXT: ldr s0, [sp, #92] -; NONEON-NOSVE-NEXT: ucvtf d1, d0 -; NONEON-NOSVE-NEXT: ldr s0, [sp, #88] -; NONEON-NOSVE-NEXT: ucvtf d0, d0 +; NONEON-NOSVE-NEXT: ucvtf d1, w9 +; NONEON-NOSVE-NEXT: ucvtf d0, w8 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #80] ; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #144] -; NONEON-NOSVE-NEXT: ldr s0, [sp, #84] -; NONEON-NOSVE-NEXT: ucvtf d1, d0 -; NONEON-NOSVE-NEXT: ldr s0, [sp, #80] -; NONEON-NOSVE-NEXT: ucvtf d0, d0 +; NONEON-NOSVE-NEXT: ucvtf d1, w9 +; NONEON-NOSVE-NEXT: ucvtf d0, w8 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #72] ; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #128] -; NONEON-NOSVE-NEXT: ldr s0, [sp, #76] +; NONEON-NOSVE-NEXT: ucvtf d1, w9 +; NONEON-NOSVE-NEXT: ucvtf d0, w8 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #64] ; NONEON-NOSVE-NEXT: ldp q3, q2, [sp, #128] -; NONEON-NOSVE-NEXT: ucvtf d1, d0 -; NONEON-NOSVE-NEXT: ldr s0, [sp, #72] -; NONEON-NOSVE-NEXT: ucvtf d0, d0 ; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #112] -; NONEON-NOSVE-NEXT: ldr s0, [sp, #68] -; NONEON-NOSVE-NEXT: ucvtf d1, d0 -; NONEON-NOSVE-NEXT: ldr s0, [sp, #64] -; NONEON-NOSVE-NEXT: ucvtf d0, d0 +; NONEON-NOSVE-NEXT: ucvtf d1, w9 +; NONEON-NOSVE-NEXT: ucvtf d0, w8 ; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #96] ; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #96] ; NONEON-NOSVE-NEXT: stp q2, q3, [x1] @@ -649,49 +643,42 @@ define void @ucvtf_v16i16_v16f64(ptr %a, ptr %b) { ; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #104] ; NONEON-NOSVE-NEXT: str d1, [sp, #328] ; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #104] -; NONEON-NOSVE-NEXT: str d0, [sp, #168] -; NONEON-NOSVE-NEXT: ldr s0, [sp, #164] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #160] ; NONEON-NOSVE-NEXT: stp d1, d2, [sp, #176] -; NONEON-NOSVE-NEXT: ucvtf d1, d0 -; NONEON-NOSVE-NEXT: ldr s0, [sp, #160] -; NONEON-NOSVE-NEXT: ucvtf d0, d0 +; NONEON-NOSVE-NEXT: str d0, [sp, #168] +; 
NONEON-NOSVE-NEXT: ucvtf d1, w9 +; NONEON-NOSVE-NEXT: ucvtf d0, w8 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #152] ; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #240] -; NONEON-NOSVE-NEXT: ldr s0, [sp, #156] -; NONEON-NOSVE-NEXT: ucvtf d1, d0 -; NONEON-NOSVE-NEXT: ldr s0, [sp, #152] -; NONEON-NOSVE-NEXT: ucvtf d0, d0 +; NONEON-NOSVE-NEXT: ucvtf d1, w9 +; NONEON-NOSVE-NEXT: ucvtf d0, w8 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #144] ; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #224] -; NONEON-NOSVE-NEXT: ldr s0, [sp, #148] -; NONEON-NOSVE-NEXT: ucvtf d1, d0 -; NONEON-NOSVE-NEXT: ldr s0, [sp, #144] -; NONEON-NOSVE-NEXT: ucvtf d0, d0 +; NONEON-NOSVE-NEXT: ucvtf d1, w9 +; NONEON-NOSVE-NEXT: ucvtf d0, w8 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #136] ; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #208] -; NONEON-NOSVE-NEXT: ldr s0, [sp, #140] -; NONEON-NOSVE-NEXT: ucvtf d1, d0 -; NONEON-NOSVE-NEXT: ldr s0, [sp, #136] -; NONEON-NOSVE-NEXT: ucvtf d0, d0 +; NONEON-NOSVE-NEXT: ucvtf d1, w9 +; NONEON-NOSVE-NEXT: ucvtf d0, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #332] ; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #192] -; NONEON-NOSVE-NEXT: ldr s0, [sp, #332] +; NONEON-NOSVE-NEXT: ucvtf d1, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #328] ; NONEON-NOSVE-NEXT: ldp q4, q3, [sp, #192] -; NONEON-NOSVE-NEXT: ucvtf d1, d0 -; NONEON-NOSVE-NEXT: ldr s0, [sp, #328] -; NONEON-NOSVE-NEXT: ucvtf d0, d0 +; NONEON-NOSVE-NEXT: ucvtf d0, w8 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #184] ; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #304] -; NONEON-NOSVE-NEXT: ldr s0, [sp, #188] -; NONEON-NOSVE-NEXT: ucvtf d1, d0 -; NONEON-NOSVE-NEXT: ldr s0, [sp, #184] -; NONEON-NOSVE-NEXT: ucvtf d0, d0 +; NONEON-NOSVE-NEXT: ucvtf d1, w9 +; NONEON-NOSVE-NEXT: ucvtf d0, w8 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #176] ; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #288] -; NONEON-NOSVE-NEXT: ldr s0, [sp, #180] +; NONEON-NOSVE-NEXT: ucvtf d1, w9 +; NONEON-NOSVE-NEXT: ucvtf d0, w8 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #168] ; NONEON-NOSVE-NEXT: ldp q7, q6, [sp, #288] -; NONEON-NOSVE-NEXT: ucvtf d1, d0 -; NONEON-NOSVE-NEXT: ldr s0, [sp, #176] -; NONEON-NOSVE-NEXT: ucvtf d0, d0 ; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #272] -; NONEON-NOSVE-NEXT: ldr s0, [sp, #172] -; NONEON-NOSVE-NEXT: ucvtf d1, d0 -; NONEON-NOSVE-NEXT: ldr s0, [sp, #168] -; NONEON-NOSVE-NEXT: ucvtf d0, d0 +; NONEON-NOSVE-NEXT: ucvtf d1, w9 +; NONEON-NOSVE-NEXT: ucvtf d0, w8 ; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #256] ; NONEON-NOSVE-NEXT: ldp q1, q0, [sp, #224] ; NONEON-NOSVE-NEXT: ldp q2, q5, [sp, #256] @@ -1041,10 +1028,9 @@ define <2 x double> @ucvtf_v2i32_v2f64(<2 x i32> %op1) { ; NONEON-NOSVE-NEXT: sub sp, sp, #32 ; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 ; NONEON-NOSVE-NEXT: str d0, [sp, #8] -; NONEON-NOSVE-NEXT: ldr s0, [sp, #12] -; NONEON-NOSVE-NEXT: ucvtf d1, d0 -; NONEON-NOSVE-NEXT: ldr s0, [sp, #8] -; NONEON-NOSVE-NEXT: ucvtf d0, d0 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #8] +; NONEON-NOSVE-NEXT: ucvtf d1, w9 +; NONEON-NOSVE-NEXT: ucvtf d0, w8 ; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #16] ; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] ; NONEON-NOSVE-NEXT: add sp, sp, #32 @@ -1073,15 +1059,13 @@ define void @ucvtf_v4i32_v4f64(ptr %a, ptr %b) { ; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 ; NONEON-NOSVE-NEXT: ldp d1, d0, [sp] ; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #16] -; NONEON-NOSVE-NEXT: ldr s0, [sp, #28] -; NONEON-NOSVE-NEXT: ucvtf d1, d0 -; NONEON-NOSVE-NEXT: ldr s0, [sp, #24] -; NONEON-NOSVE-NEXT: ucvtf d0, d0 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #24] +; NONEON-NOSVE-NEXT: ucvtf d1, w9 +; NONEON-NOSVE-NEXT: ucvtf d0, w8 +; 
NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #16] ; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #48] -; NONEON-NOSVE-NEXT: ldr s0, [sp, #20] -; NONEON-NOSVE-NEXT: ucvtf d1, d0 -; NONEON-NOSVE-NEXT: ldr s0, [sp, #16] -; NONEON-NOSVE-NEXT: ucvtf d0, d0 +; NONEON-NOSVE-NEXT: ucvtf d1, w9 +; NONEON-NOSVE-NEXT: ucvtf d0, w8 ; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #32] ; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] ; NONEON-NOSVE-NEXT: stp q1, q0, [x1] @@ -1120,27 +1104,23 @@ define void @ucvtf_v8i32_v8f64(ptr %a, ptr %b) { ; NONEON-NOSVE-NEXT: ldp d1, d0, [sp] ; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #32] ; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #16] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #40] ; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #48] -; NONEON-NOSVE-NEXT: ldr s0, [sp, #44] -; NONEON-NOSVE-NEXT: ucvtf d1, d0 -; NONEON-NOSVE-NEXT: ldr s0, [sp, #40] -; NONEON-NOSVE-NEXT: ucvtf d0, d0 +; NONEON-NOSVE-NEXT: ucvtf d1, w9 +; NONEON-NOSVE-NEXT: ucvtf d0, w8 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #32] ; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #80] -; NONEON-NOSVE-NEXT: ldr s0, [sp, #36] -; NONEON-NOSVE-NEXT: ucvtf d1, d0 -; NONEON-NOSVE-NEXT: ldr s0, [sp, #32] -; NONEON-NOSVE-NEXT: ucvtf d0, d0 +; NONEON-NOSVE-NEXT: ucvtf d1, w9 +; NONEON-NOSVE-NEXT: ucvtf d0, w8 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #56] ; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #64] -; NONEON-NOSVE-NEXT: ldr s0, [sp, #60] +; NONEON-NOSVE-NEXT: ucvtf d1, w9 +; NONEON-NOSVE-NEXT: ucvtf d0, w8 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #48] ; NONEON-NOSVE-NEXT: ldp q3, q2, [sp, #64] -; NONEON-NOSVE-NEXT: ucvtf d1, d0 -; NONEON-NOSVE-NEXT: ldr s0, [sp, #56] -; NONEON-NOSVE-NEXT: ucvtf d0, d0 ; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #112] -; NONEON-NOSVE-NEXT: ldr s0, [sp, #52] -; NONEON-NOSVE-NEXT: ucvtf d1, d0 -; NONEON-NOSVE-NEXT: ldr s0, [sp, #48] -; NONEON-NOSVE-NEXT: ucvtf d0, d0 +; NONEON-NOSVE-NEXT: ucvtf d1, w9 +; NONEON-NOSVE-NEXT: ucvtf d0, w8 ; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #96] ; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #96] ; NONEON-NOSVE-NEXT: stp q2, q3, [x1] @@ -2984,8 +2964,8 @@ define half @ucvtf_i16_f16(ptr %0) { ; ; NONEON-NOSVE-LABEL: ucvtf_i16_f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldr h0, [x0] -; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: ldrh w8, [x0] +; NONEON-NOSVE-NEXT: ucvtf s0, w8 ; NONEON-NOSVE-NEXT: fcvt h0, s0 ; NONEON-NOSVE-NEXT: ret %2 = load i16, ptr %0, align 64 @@ -2996,14 +2976,14 @@ define half @ucvtf_i16_f16(ptr %0) { define float @ucvtf_i16_f32(ptr %0) { ; CHECK-LABEL: ucvtf_i16_f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr h0, [x0] -; CHECK-NEXT: ucvtf s0, s0 +; CHECK-NEXT: ldrh w8, [x0] +; CHECK-NEXT: ucvtf s0, w8 ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: ucvtf_i16_f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldr h0, [x0] -; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: ldrh w8, [x0] +; NONEON-NOSVE-NEXT: ucvtf s0, w8 ; NONEON-NOSVE-NEXT: ret %2 = load i16, ptr %0, align 64 %3 = uitofp i16 %2 to float @@ -3013,14 +2993,14 @@ define float @ucvtf_i16_f32(ptr %0) { define double @ucvtf_i16_f64(ptr %0) { ; CHECK-LABEL: ucvtf_i16_f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr h0, [x0] -; CHECK-NEXT: ucvtf d0, d0 +; CHECK-NEXT: ldrh w8, [x0] +; CHECK-NEXT: ucvtf d0, w8 ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: ucvtf_i16_f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldr h0, [x0] -; NONEON-NOSVE-NEXT: ucvtf d0, d0 +; NONEON-NOSVE-NEXT: ldrh w8, [x0] +; NONEON-NOSVE-NEXT: ucvtf d0, w8 ; NONEON-NOSVE-NEXT: ret %2 = load i16, ptr %0, align 64 %3 = uitofp i16 %2 to double @@ -3065,14 +3045,14 @@ define 
float @ucvtf_i32_f32(ptr %0) { define double @ucvtf_i32_f64(ptr %0) { ; CHECK-LABEL: ucvtf_i32_f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr s0, [x0] -; CHECK-NEXT: ucvtf d0, d0 +; CHECK-NEXT: ldr w8, [x0] +; CHECK-NEXT: ucvtf d0, w8 ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: ucvtf_i32_f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldr s0, [x0] -; NONEON-NOSVE-NEXT: ucvtf d0, d0 +; NONEON-NOSVE-NEXT: ldr w8, [x0] +; NONEON-NOSVE-NEXT: ucvtf d0, w8 ; NONEON-NOSVE-NEXT: ret %2 = load i32, ptr %0, align 64 %3 = uitofp i32 %2 to double diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/artifact-combiner-asserts.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/artifact-combiner-asserts.ll index 6dce6c1852af9b1722393fec09c9e4b4b73dca2d..6e4fb2678b382d92a0c70125774e36b401098d3f 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/artifact-combiner-asserts.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/artifact-combiner-asserts.ll @@ -27,10 +27,8 @@ define hidden <2 x i64> @icmp_v2i32_zext_to_v2i64(<2 x i32> %arg) { ; CHECK-NEXT: v_mov_b32_e32 v3, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; CHECK-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; CHECK-NEXT: v_and_b32_e32 v0, 1, v0 -; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo -; CHECK-NEXT: v_and_b32_e32 v2, 1, v1 ; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo ; CHECK-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq <2 x i32> %arg, zeroinitializer %sext = zext <2 x i1> %cmp to <2 x i64> diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-add.s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-add.s16.mir index 11411c691c39015b8625a91fd8e14c687d7a54f4..1971cd80d5686bf5c80e39de54e2fd9b1969b249 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-add.s16.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-add.s16.mir @@ -2,7 +2,7 @@ # RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX6 %s # RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX6 %s # RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX10 %s -# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX10 %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX11 %s # Note: 16-bit instructions generally produce a 0 result in the high 16-bits on GFX8 and GFX9 and preserve high 16 bits on GFX10+ @@ -23,6 +23,7 @@ body: | ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX6-NEXT: [[V_ADD_U16_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U16_e64 [[COPY]], [[COPY1]], 0, implicit $exec ; GFX6-NEXT: S_ENDPGM 0, implicit [[V_ADD_U16_e64_]] + ; ; GFX10-LABEL: name: add_s16 ; GFX10: liveins: $vgpr0, $vgpr1 ; GFX10-NEXT: {{ $}} @@ -30,6 +31,14 @@ body: | ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX10-NEXT: [[V_ADD_NC_U16_e64_:%[0-9]+]]:vgpr_32 = V_ADD_NC_U16_e64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $exec ; GFX10-NEXT: S_ENDPGM 0, implicit [[V_ADD_NC_U16_e64_]] + ; + ; GFX11-LABEL: name: add_s16 + ; GFX11: liveins: $vgpr0, $vgpr1 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX11-NEXT: [[V_ADD_NC_U16_fake16_e64_:%[0-9]+]]:vgpr_32 = V_ADD_NC_U16_fake16_e64 0, [[COPY]], 0, [[COPY1]], 0, 0, 
implicit $exec + ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_ADD_NC_U16_fake16_e64_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 %2:vgpr(s16) = G_TRUNC %0 @@ -56,6 +65,7 @@ body: | ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX6-NEXT: [[V_ADD_U16_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U16_e64 [[COPY]], [[COPY1]], 0, implicit $exec ; GFX6-NEXT: S_ENDPGM 0, implicit [[V_ADD_U16_e64_]] + ; ; GFX10-LABEL: name: add_s16_zext_to_s32 ; GFX10: liveins: $vgpr0, $vgpr1 ; GFX10-NEXT: {{ $}} @@ -65,6 +75,16 @@ body: | ; GFX10-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535 ; GFX10-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[S_MOV_B32_]], [[V_ADD_NC_U16_e64_]], implicit $exec ; GFX10-NEXT: S_ENDPGM 0, implicit [[V_AND_B32_e64_]] + ; + ; GFX11-LABEL: name: add_s16_zext_to_s32 + ; GFX11: liveins: $vgpr0, $vgpr1 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX11-NEXT: [[V_ADD_NC_U16_fake16_e64_:%[0-9]+]]:vgpr_32 = V_ADD_NC_U16_fake16_e64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $exec + ; GFX11-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535 + ; GFX11-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[S_MOV_B32_]], [[V_ADD_NC_U16_fake16_e64_]], implicit $exec + ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_AND_B32_e64_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 %2:vgpr(s16) = G_TRUNC %0 @@ -91,12 +111,20 @@ body: | ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX6-NEXT: [[V_SUB_U16_e64_:%[0-9]+]]:vgpr_32 = V_SUB_U16_e64 [[COPY]], 64, 0, implicit $exec ; GFX6-NEXT: S_ENDPGM 0, implicit [[V_SUB_U16_e64_]] + ; ; GFX10-LABEL: name: add_s16_neg_inline_const_64 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX10-NEXT: [[V_SUB_NC_U16_e64_:%[0-9]+]]:vgpr_32 = V_SUB_NC_U16_e64 0, [[COPY]], 0, 64, 0, 0, implicit $exec ; GFX10-NEXT: S_ENDPGM 0, implicit [[V_SUB_NC_U16_e64_]] + ; + ; GFX11-LABEL: name: add_s16_neg_inline_const_64 + ; GFX11: liveins: $vgpr0 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-NEXT: [[V_SUB_NC_U16_fake16_e64_:%[0-9]+]]:vgpr_32 = V_SUB_NC_U16_fake16_e64 0, [[COPY]], 0, 64, 0, 0, implicit $exec + ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_SUB_NC_U16_fake16_e64_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s16) = G_TRUNC %0 %2:vgpr(s16) = G_CONSTANT i16 -64 @@ -121,6 +149,7 @@ body: | ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX6-NEXT: [[V_SUB_U16_e64_:%[0-9]+]]:vgpr_32 = V_SUB_U16_e64 [[COPY]], 64, 0, implicit $exec ; GFX6-NEXT: S_ENDPGM 0, implicit [[V_SUB_U16_e64_]] + ; ; GFX10-LABEL: name: add_s16_neg_inline_const_64_zext_to_s32 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} @@ -129,6 +158,15 @@ body: | ; GFX10-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535 ; GFX10-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[S_MOV_B32_]], [[V_SUB_NC_U16_e64_]], implicit $exec ; GFX10-NEXT: S_ENDPGM 0, implicit [[V_AND_B32_e64_]] + ; + ; GFX11-LABEL: name: add_s16_neg_inline_const_64_zext_to_s32 + ; GFX11: liveins: $vgpr0 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-NEXT: [[V_SUB_NC_U16_fake16_e64_:%[0-9]+]]:vgpr_32 = V_SUB_NC_U16_fake16_e64 0, [[COPY]], 0, 64, 0, 0, implicit $exec + ; GFX11-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535 + ; GFX11-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[S_MOV_B32_]], [[V_SUB_NC_U16_fake16_e64_]], implicit $exec + ; GFX11-NEXT: S_ENDPGM 
0, implicit [[V_AND_B32_e64_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s16) = G_TRUNC %0 %2:vgpr(s16) = G_CONSTANT i16 -64 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.fcmp.constants.w32.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.fcmp.constants.w32.mir index 17cdab46c3b93625e8ee2fa8b8e3b4f74c4f9931..b5f91b6b860831dd5fcabad3874c9f12e70dbd64 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.fcmp.constants.w32.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.fcmp.constants.w32.mir @@ -1,5 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -run-pass=instruction-select -verify-machineinstrs -global-isel %s -o - | FileCheck %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -run-pass=instruction-select -verify-machineinstrs -global-isel %s -o - | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -run-pass=instruction-select -verify-machineinstrs -global-isel %s -o - | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s --- name: fcmp_false_f16 @@ -10,15 +11,27 @@ tracksRegLiveness: true body: | bb.0: liveins: $vgpr0, $vgpr1 - ; CHECK-LABEL: name: fcmp_false_f16 - ; CHECK: liveins: $vgpr0, $vgpr1 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; CHECK-NEXT: [[V_CVT_F16_F32_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_fake16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: [[V_CVT_F16_F32_fake16_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_fake16_e64 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: [[V_CMP_F_F16_t16_e64_:%[0-9]+]]:sreg_32 = V_CMP_F_F16_t16_e64 0, [[V_CVT_F16_F32_fake16_e64_]], 0, [[V_CVT_F16_F32_fake16_e64_1]], 0, implicit $mode, implicit $exec - ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_CMP_F_F16_t16_e64_]] + ; GFX11-TRUE16-LABEL: name: fcmp_false_f16 + ; GFX11-TRUE16: liveins: $vgpr0, $vgpr1 + ; GFX11-TRUE16-NEXT: {{ $}} + ; GFX11-TRUE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-TRUE16-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX11-TRUE16-NEXT: [[V_CVT_F16_F32_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_CVT_F16_F32_t16_e64 0, [[COPY]], 0, 0, 0, implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_CVT_F16_F32_t16_e64_]] + ; GFX11-TRUE16-NEXT: [[V_CVT_F16_F32_t16_e64_1:%[0-9]+]]:vgpr_16 = nofpexcept V_CVT_F16_F32_t16_e64 0, [[COPY1]], 0, 0, 0, implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_CVT_F16_F32_t16_e64_1]] + ; GFX11-TRUE16-NEXT: [[V_CMP_F_F16_t16_e64_:%[0-9]+]]:sreg_32 = V_CMP_F_F16_t16_e64 0, [[COPY2]], 0, [[COPY3]], 0, implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: S_ENDPGM 0, implicit [[V_CMP_F_F16_t16_e64_]] + ; + ; GFX11-FAKE16-LABEL: name: fcmp_false_f16 + ; GFX11-FAKE16: liveins: $vgpr0, $vgpr1 + ; GFX11-FAKE16-NEXT: {{ $}} + ; GFX11-FAKE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-FAKE16-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX11-FAKE16-NEXT: [[V_CVT_F16_F32_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_fake16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec + ; GFX11-FAKE16-NEXT: [[V_CVT_F16_F32_fake16_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_fake16_e64 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec + ; GFX11-FAKE16-NEXT: 
[[V_CMP_F_F16_t16_e64_:%[0-9]+]]:sreg_32 = V_CMP_F_F16_t16_e64 0, [[V_CVT_F16_F32_fake16_e64_]], 0, [[V_CVT_F16_F32_fake16_e64_1]], 0, implicit $mode, implicit $exec + ; GFX11-FAKE16-NEXT: S_ENDPGM 0, implicit [[V_CMP_F_F16_t16_e64_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 %2:vgpr(s16) = G_FPTRUNC %0 @@ -36,15 +49,27 @@ tracksRegLiveness: true body: | bb.0: liveins: $vgpr0, $vgpr1 - ; CHECK-LABEL: name: fcmp_true_f16 - ; CHECK: liveins: $vgpr0, $vgpr1 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; CHECK-NEXT: [[V_CVT_F16_F32_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_fake16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: [[V_CVT_F16_F32_fake16_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_fake16_e64 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: [[V_CMP_TRU_F16_t16_e64_:%[0-9]+]]:sreg_32 = V_CMP_TRU_F16_t16_e64 0, [[V_CVT_F16_F32_fake16_e64_]], 0, [[V_CVT_F16_F32_fake16_e64_1]], 0, implicit $mode, implicit $exec - ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_CMP_TRU_F16_t16_e64_]] + ; GFX11-TRUE16-LABEL: name: fcmp_true_f16 + ; GFX11-TRUE16: liveins: $vgpr0, $vgpr1 + ; GFX11-TRUE16-NEXT: {{ $}} + ; GFX11-TRUE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-TRUE16-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX11-TRUE16-NEXT: [[V_CVT_F16_F32_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_CVT_F16_F32_t16_e64 0, [[COPY]], 0, 0, 0, implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_CVT_F16_F32_t16_e64_]] + ; GFX11-TRUE16-NEXT: [[V_CVT_F16_F32_t16_e64_1:%[0-9]+]]:vgpr_16 = nofpexcept V_CVT_F16_F32_t16_e64 0, [[COPY1]], 0, 0, 0, implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_CVT_F16_F32_t16_e64_1]] + ; GFX11-TRUE16-NEXT: [[V_CMP_TRU_F16_t16_e64_:%[0-9]+]]:sreg_32 = V_CMP_TRU_F16_t16_e64 0, [[COPY2]], 0, [[COPY3]], 0, implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: S_ENDPGM 0, implicit [[V_CMP_TRU_F16_t16_e64_]] + ; + ; GFX11-FAKE16-LABEL: name: fcmp_true_f16 + ; GFX11-FAKE16: liveins: $vgpr0, $vgpr1 + ; GFX11-FAKE16-NEXT: {{ $}} + ; GFX11-FAKE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-FAKE16-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX11-FAKE16-NEXT: [[V_CVT_F16_F32_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_fake16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec + ; GFX11-FAKE16-NEXT: [[V_CVT_F16_F32_fake16_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_fake16_e64 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec + ; GFX11-FAKE16-NEXT: [[V_CMP_TRU_F16_t16_e64_:%[0-9]+]]:sreg_32 = V_CMP_TRU_F16_t16_e64 0, [[V_CVT_F16_F32_fake16_e64_]], 0, [[V_CVT_F16_F32_fake16_e64_1]], 0, implicit $mode, implicit $exec + ; GFX11-FAKE16-NEXT: S_ENDPGM 0, implicit [[V_CMP_TRU_F16_t16_e64_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 %2:vgpr(s16) = G_FPTRUNC %0 @@ -62,13 +87,13 @@ tracksRegLiveness: true body: | bb.0: liveins: $vgpr0, $vgpr1 - ; CHECK-LABEL: name: fcmp_false_f32 - ; CHECK: liveins: $vgpr0, $vgpr1 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; CHECK-NEXT: [[V_CMP_F_F32_e64_:%[0-9]+]]:sreg_32 = V_CMP_F_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec - ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_CMP_F_F32_e64_]] + ; GFX11-LABEL: name: fcmp_false_f32 + ; GFX11: liveins: $vgpr0, 
$vgpr1 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX11-NEXT: [[V_CMP_F_F32_e64_:%[0-9]+]]:sreg_32 = V_CMP_F_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec + ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_F_F32_e64_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 %4:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.fcmp), %0, %1, 0 @@ -84,13 +109,13 @@ tracksRegLiveness: true body: | bb.0: liveins: $vgpr0, $vgpr1 - ; CHECK-LABEL: name: fcmp_true_f32 - ; CHECK: liveins: $vgpr0, $vgpr1 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; CHECK-NEXT: [[V_CMP_TRU_F32_e64_:%[0-9]+]]:sreg_32 = V_CMP_TRU_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec - ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_CMP_TRU_F32_e64_]] + ; GFX11-LABEL: name: fcmp_true_f32 + ; GFX11: liveins: $vgpr0, $vgpr1 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX11-NEXT: [[V_CMP_TRU_F32_e64_:%[0-9]+]]:sreg_32 = V_CMP_TRU_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec + ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_TRU_F32_e64_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 %4:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.fcmp), %0, %1, 15 @@ -106,15 +131,15 @@ tracksRegLiveness: true body: | bb.0: liveins: $vgpr0, $vgpr1 - ; CHECK-LABEL: name: fcmp_false_f64 - ; CHECK: liveins: $vgpr0, $vgpr1 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; CHECK-NEXT: [[V_CVT_F64_F32_e64_:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_F32_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: [[V_CVT_F64_F32_e64_1:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_F32_e64 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: [[V_CMP_F_F64_e64_:%[0-9]+]]:sreg_32 = V_CMP_F_F64_e64 0, [[V_CVT_F64_F32_e64_]], 0, [[V_CVT_F64_F32_e64_1]], 0, implicit $mode, implicit $exec - ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_CMP_F_F64_e64_]] + ; GFX11-LABEL: name: fcmp_false_f64 + ; GFX11: liveins: $vgpr0, $vgpr1 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX11-NEXT: [[V_CVT_F64_F32_e64_:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_F32_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec + ; GFX11-NEXT: [[V_CVT_F64_F32_e64_1:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_F32_e64 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec + ; GFX11-NEXT: [[V_CMP_F_F64_e64_:%[0-9]+]]:sreg_32 = V_CMP_F_F64_e64 0, [[V_CVT_F64_F32_e64_]], 0, [[V_CVT_F64_F32_e64_1]], 0, implicit $mode, implicit $exec + ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_F_F64_e64_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 %2:vgpr(s64) = G_FPEXT %0 @@ -132,15 +157,15 @@ tracksRegLiveness: true body: | bb.0: liveins: $vgpr0, $vgpr1 - ; CHECK-LABEL: name: fcmp_true_f64 - ; CHECK: liveins: $vgpr0, $vgpr1 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; CHECK-NEXT: [[V_CVT_F64_F32_e64_:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_F32_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: [[V_CVT_F64_F32_e64_1:%[0-9]+]]:vreg_64 = nofpexcept 
V_CVT_F64_F32_e64 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: [[V_CMP_TRU_F64_e64_:%[0-9]+]]:sreg_32 = V_CMP_TRU_F64_e64 0, [[V_CVT_F64_F32_e64_]], 0, [[V_CVT_F64_F32_e64_1]], 0, implicit $mode, implicit $exec - ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_CMP_TRU_F64_e64_]] + ; GFX11-LABEL: name: fcmp_true_f64 + ; GFX11: liveins: $vgpr0, $vgpr1 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX11-NEXT: [[V_CVT_F64_F32_e64_:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_F32_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec + ; GFX11-NEXT: [[V_CVT_F64_F32_e64_1:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_F32_e64 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec + ; GFX11-NEXT: [[V_CMP_TRU_F64_e64_:%[0-9]+]]:sreg_32 = V_CMP_TRU_F64_e64 0, [[V_CVT_F64_F32_e64_]], 0, [[V_CVT_F64_F32_e64_1]], 0, implicit $mode, implicit $exec + ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_TRU_F64_e64_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 %2:vgpr(s64) = G_FPEXT %0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.fcmp.constants.w64.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.fcmp.constants.w64.mir index 158076a3b74a2aade5894a61df8e0999ff19989e..a67a0b6455fac937a85894fb60500c3b35c5e447 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.fcmp.constants.w64.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.fcmp.constants.w64.mir @@ -1,5 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr="+wavefrontsize64" -run-pass=instruction-select -verify-machineinstrs -global-isel %s -o - | FileCheck %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr="+wavefrontsize64",+real-true16 -run-pass=instruction-select -verify-machineinstrs -global-isel %s -o - | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr="+wavefrontsize64",-real-true16 -run-pass=instruction-select -verify-machineinstrs -global-isel %s -o - | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s --- name: fcmp_false_f16 @@ -10,15 +11,27 @@ tracksRegLiveness: true body: | bb.0: liveins: $vgpr0, $vgpr1 - ; CHECK-LABEL: name: fcmp_false_f16 - ; CHECK: liveins: $vgpr0, $vgpr1 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; CHECK-NEXT: [[V_CVT_F16_F32_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_fake16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: [[V_CVT_F16_F32_fake16_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_fake16_e64 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: [[V_CMP_F_F16_t16_e64_:%[0-9]+]]:sreg_64 = V_CMP_F_F16_t16_e64 0, [[V_CVT_F16_F32_fake16_e64_]], 0, [[V_CVT_F16_F32_fake16_e64_1]], 0, implicit $mode, implicit $exec - ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_CMP_F_F16_t16_e64_]] + ; GFX11-TRUE16-LABEL: name: fcmp_false_f16 + ; GFX11-TRUE16: liveins: $vgpr0, $vgpr1 + ; GFX11-TRUE16-NEXT: {{ $}} + ; GFX11-TRUE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-TRUE16-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX11-TRUE16-NEXT: [[V_CVT_F16_F32_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_CVT_F16_F32_t16_e64 0, [[COPY]], 0, 0, 0, implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_CVT_F16_F32_t16_e64_]] + ; GFX11-TRUE16-NEXT: 
[[V_CVT_F16_F32_t16_e64_1:%[0-9]+]]:vgpr_16 = nofpexcept V_CVT_F16_F32_t16_e64 0, [[COPY1]], 0, 0, 0, implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_CVT_F16_F32_t16_e64_1]] + ; GFX11-TRUE16-NEXT: [[V_CMP_F_F16_t16_e64_:%[0-9]+]]:sreg_64 = V_CMP_F_F16_t16_e64 0, [[COPY2]], 0, [[COPY3]], 0, implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: S_ENDPGM 0, implicit [[V_CMP_F_F16_t16_e64_]] + ; + ; GFX11-FAKE16-LABEL: name: fcmp_false_f16 + ; GFX11-FAKE16: liveins: $vgpr0, $vgpr1 + ; GFX11-FAKE16-NEXT: {{ $}} + ; GFX11-FAKE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-FAKE16-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX11-FAKE16-NEXT: [[V_CVT_F16_F32_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_fake16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec + ; GFX11-FAKE16-NEXT: [[V_CVT_F16_F32_fake16_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_fake16_e64 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec + ; GFX11-FAKE16-NEXT: [[V_CMP_F_F16_t16_e64_:%[0-9]+]]:sreg_64 = V_CMP_F_F16_t16_e64 0, [[V_CVT_F16_F32_fake16_e64_]], 0, [[V_CVT_F16_F32_fake16_e64_1]], 0, implicit $mode, implicit $exec + ; GFX11-FAKE16-NEXT: S_ENDPGM 0, implicit [[V_CMP_F_F16_t16_e64_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 %2:vgpr(s16) = G_FPTRUNC %0 @@ -36,15 +49,27 @@ tracksRegLiveness: true body: | bb.0: liveins: $vgpr0, $vgpr1 - ; CHECK-LABEL: name: fcmp_true_f16 - ; CHECK: liveins: $vgpr0, $vgpr1 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; CHECK-NEXT: [[V_CVT_F16_F32_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_fake16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: [[V_CVT_F16_F32_fake16_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_fake16_e64 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: [[V_CMP_TRU_F16_t16_e64_:%[0-9]+]]:sreg_64 = V_CMP_TRU_F16_t16_e64 0, [[V_CVT_F16_F32_fake16_e64_]], 0, [[V_CVT_F16_F32_fake16_e64_1]], 0, implicit $mode, implicit $exec - ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_CMP_TRU_F16_t16_e64_]] + ; GFX11-TRUE16-LABEL: name: fcmp_true_f16 + ; GFX11-TRUE16: liveins: $vgpr0, $vgpr1 + ; GFX11-TRUE16-NEXT: {{ $}} + ; GFX11-TRUE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-TRUE16-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX11-TRUE16-NEXT: [[V_CVT_F16_F32_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_CVT_F16_F32_t16_e64 0, [[COPY]], 0, 0, 0, implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_CVT_F16_F32_t16_e64_]] + ; GFX11-TRUE16-NEXT: [[V_CVT_F16_F32_t16_e64_1:%[0-9]+]]:vgpr_16 = nofpexcept V_CVT_F16_F32_t16_e64 0, [[COPY1]], 0, 0, 0, implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_CVT_F16_F32_t16_e64_1]] + ; GFX11-TRUE16-NEXT: [[V_CMP_TRU_F16_t16_e64_:%[0-9]+]]:sreg_64 = V_CMP_TRU_F16_t16_e64 0, [[COPY2]], 0, [[COPY3]], 0, implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: S_ENDPGM 0, implicit [[V_CMP_TRU_F16_t16_e64_]] + ; + ; GFX11-FAKE16-LABEL: name: fcmp_true_f16 + ; GFX11-FAKE16: liveins: $vgpr0, $vgpr1 + ; GFX11-FAKE16-NEXT: {{ $}} + ; GFX11-FAKE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-FAKE16-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX11-FAKE16-NEXT: [[V_CVT_F16_F32_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_fake16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec + ; GFX11-FAKE16-NEXT: 
[[V_CVT_F16_F32_fake16_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_fake16_e64 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec + ; GFX11-FAKE16-NEXT: [[V_CMP_TRU_F16_t16_e64_:%[0-9]+]]:sreg_64 = V_CMP_TRU_F16_t16_e64 0, [[V_CVT_F16_F32_fake16_e64_]], 0, [[V_CVT_F16_F32_fake16_e64_1]], 0, implicit $mode, implicit $exec + ; GFX11-FAKE16-NEXT: S_ENDPGM 0, implicit [[V_CMP_TRU_F16_t16_e64_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 %2:vgpr(s16) = G_FPTRUNC %0 @@ -62,13 +87,13 @@ tracksRegLiveness: true body: | bb.0: liveins: $vgpr0, $vgpr1 - ; CHECK-LABEL: name: fcmp_false_f32 - ; CHECK: liveins: $vgpr0, $vgpr1 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; CHECK-NEXT: [[V_CMP_F_F32_e64_:%[0-9]+]]:sreg_64 = V_CMP_F_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec - ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_CMP_F_F32_e64_]] + ; GFX11-LABEL: name: fcmp_false_f32 + ; GFX11: liveins: $vgpr0, $vgpr1 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX11-NEXT: [[V_CMP_F_F32_e64_:%[0-9]+]]:sreg_64 = V_CMP_F_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec + ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_F_F32_e64_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 %4:sgpr(s64) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.fcmp), %0, %1, 0 @@ -84,13 +109,13 @@ tracksRegLiveness: true body: | bb.0: liveins: $vgpr0, $vgpr1 - ; CHECK-LABEL: name: fcmp_true_f32 - ; CHECK: liveins: $vgpr0, $vgpr1 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; CHECK-NEXT: [[V_CMP_TRU_F32_e64_:%[0-9]+]]:sreg_64 = V_CMP_TRU_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec - ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_CMP_TRU_F32_e64_]] + ; GFX11-LABEL: name: fcmp_true_f32 + ; GFX11: liveins: $vgpr0, $vgpr1 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX11-NEXT: [[V_CMP_TRU_F32_e64_:%[0-9]+]]:sreg_64 = V_CMP_TRU_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec + ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_TRU_F32_e64_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 %4:sgpr(s64) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.fcmp), %0, %1, 15 @@ -106,15 +131,15 @@ tracksRegLiveness: true body: | bb.0: liveins: $vgpr0, $vgpr1 - ; CHECK-LABEL: name: fcmp_false_f64 - ; CHECK: liveins: $vgpr0, $vgpr1 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; CHECK-NEXT: [[V_CVT_F64_F32_e64_:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_F32_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: [[V_CVT_F64_F32_e64_1:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_F32_e64 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: [[V_CMP_F_F64_e64_:%[0-9]+]]:sreg_64 = V_CMP_F_F64_e64 0, [[V_CVT_F64_F32_e64_]], 0, [[V_CVT_F64_F32_e64_1]], 0, implicit $mode, implicit $exec - ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_CMP_F_F64_e64_]] + ; GFX11-LABEL: name: fcmp_false_f64 + ; GFX11: liveins: $vgpr0, $vgpr1 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX11-NEXT: [[V_CVT_F64_F32_e64_:%[0-9]+]]:vreg_64 
= nofpexcept V_CVT_F64_F32_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec + ; GFX11-NEXT: [[V_CVT_F64_F32_e64_1:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_F32_e64 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec + ; GFX11-NEXT: [[V_CMP_F_F64_e64_:%[0-9]+]]:sreg_64 = V_CMP_F_F64_e64 0, [[V_CVT_F64_F32_e64_]], 0, [[V_CVT_F64_F32_e64_1]], 0, implicit $mode, implicit $exec + ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_F_F64_e64_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 %2:vgpr(s64) = G_FPEXT %0 @@ -132,15 +157,15 @@ tracksRegLiveness: true body: | bb.0: liveins: $vgpr0, $vgpr1 - ; CHECK-LABEL: name: fcmp_true_f64 - ; CHECK: liveins: $vgpr0, $vgpr1 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; CHECK-NEXT: [[V_CVT_F64_F32_e64_:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_F32_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: [[V_CVT_F64_F32_e64_1:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_F32_e64 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: [[V_CMP_TRU_F64_e64_:%[0-9]+]]:sreg_64 = V_CMP_TRU_F64_e64 0, [[V_CVT_F64_F32_e64_]], 0, [[V_CVT_F64_F32_e64_1]], 0, implicit $mode, implicit $exec - ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_CMP_TRU_F64_e64_]] + ; GFX11-LABEL: name: fcmp_true_f64 + ; GFX11: liveins: $vgpr0, $vgpr1 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX11-NEXT: [[V_CVT_F64_F32_e64_:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_F32_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec + ; GFX11-NEXT: [[V_CVT_F64_F32_e64_1:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_F32_e64 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec + ; GFX11-NEXT: [[V_CMP_TRU_F64_e64_:%[0-9]+]]:sreg_64 = V_CMP_TRU_F64_e64 0, [[V_CVT_F64_F32_e64_]], 0, [[V_CVT_F64_F32_e64_1]], 0, implicit $mode, implicit $exec + ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_TRU_F64_e64_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 %2:vgpr(s64) = G_FPEXT %0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fceil.s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fceil.s16.mir index 0ff633fb4d8bece43e4fbcf1a81afde987e41792..df2f390124ebd61cce0d06f141a3009746665f23 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fceil.s16.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fceil.s16.mir @@ -1,6 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=instruction-select -global-isel-abort=0 -verify-machineinstrs -o - %s | FileCheck -check-prefixes=GCN,GFX8 %s -# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -run-pass=instruction-select -global-isel-abort=0 -verify-machineinstrs -o - %s | FileCheck -check-prefixes=GCN,GFX11 %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -run-pass=instruction-select -global-isel-abort=0 -verify-machineinstrs -o - %s | FileCheck -check-prefixes=GCN,GFX11-TRUE16 %s # RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -run-pass=instruction-select -global-isel-abort=0 -verify-machineinstrs -o - %s | FileCheck -check-prefixes=GCN,GFX11-FAKE16 %s --- @@ -45,15 +45,15 @@ body: | ; GFX8-NEXT: [[V_CEIL_F16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CEIL_F16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec ; GFX8-NEXT: $vgpr0 = COPY [[V_CEIL_F16_e64_]] ; - ; GFX11-LABEL: name: fceil_s16_vv - ; GFX11: liveins: $vgpr0 - ; 
GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[COPY]].lo16 - ; GFX11-NEXT: [[V_CEIL_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_CEIL_F16_t16_e64 0, [[COPY1]], 0, 0, 0, implicit $mode, implicit $exec - ; GFX11-NEXT: [[DEF:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF - ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[V_CEIL_F16_t16_e64_]], %subreg.lo16, [[DEF]], %subreg.hi16 - ; GFX11-NEXT: $vgpr0 = COPY [[REG_SEQUENCE]] + ; GFX11-TRUE16-LABEL: name: fceil_s16_vv + ; GFX11-TRUE16: liveins: $vgpr0 + ; GFX11-TRUE16-NEXT: {{ $}} + ; GFX11-TRUE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-TRUE16-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[COPY]].lo16 + ; GFX11-TRUE16-NEXT: [[V_CEIL_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_CEIL_F16_t16_e64 0, [[COPY1]], 0, 0, 0, implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: [[DEF:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF + ; GFX11-TRUE16-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[V_CEIL_F16_t16_e64_]], %subreg.lo16, [[DEF]], %subreg.hi16 + ; GFX11-TRUE16-NEXT: $vgpr0 = COPY [[REG_SEQUENCE]] ; ; GFX11-FAKE16-LABEL: name: fceil_s16_vv ; GFX11-FAKE16: liveins: $vgpr0 @@ -85,14 +85,14 @@ body: | ; GFX8-NEXT: [[V_CEIL_F16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CEIL_F16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec ; GFX8-NEXT: $vgpr0 = COPY [[V_CEIL_F16_e64_]] ; - ; GFX11-LABEL: name: fceil_s16_vs - ; GFX11: liveins: $sgpr0 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GFX11-NEXT: [[V_CEIL_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_CEIL_F16_t16_e64 0, [[COPY]], 0, 0, 0, implicit $mode, implicit $exec - ; GFX11-NEXT: [[DEF:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF - ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[V_CEIL_F16_t16_e64_]], %subreg.lo16, [[DEF]], %subreg.hi16 - ; GFX11-NEXT: $vgpr0 = COPY [[REG_SEQUENCE]] + ; GFX11-TRUE16-LABEL: name: fceil_s16_vs + ; GFX11-TRUE16: liveins: $sgpr0 + ; GFX11-TRUE16-NEXT: {{ $}} + ; GFX11-TRUE16-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX11-TRUE16-NEXT: [[V_CEIL_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_CEIL_F16_t16_e64 0, [[COPY]], 0, 0, 0, implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: [[DEF:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF + ; GFX11-TRUE16-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[V_CEIL_F16_t16_e64_]], %subreg.lo16, [[DEF]], %subreg.hi16 + ; GFX11-TRUE16-NEXT: $vgpr0 = COPY [[REG_SEQUENCE]] ; ; GFX11-FAKE16-LABEL: name: fceil_s16_vs ; GFX11-FAKE16: liveins: $sgpr0 @@ -124,15 +124,15 @@ body: | ; GFX8-NEXT: [[V_CEIL_F16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CEIL_F16_e64 1, [[COPY]], 0, 0, implicit $mode, implicit $exec ; GFX8-NEXT: $vgpr0 = COPY [[V_CEIL_F16_e64_]] ; - ; GFX11-LABEL: name: fceil_fneg_s16_vv - ; GFX11: liveins: $vgpr0 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[COPY]].lo16 - ; GFX11-NEXT: [[V_CEIL_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_CEIL_F16_t16_e64 1, [[COPY1]], 0, 0, 0, implicit $mode, implicit $exec - ; GFX11-NEXT: [[DEF:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF - ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[V_CEIL_F16_t16_e64_]], %subreg.lo16, [[DEF]], %subreg.hi16 - ; GFX11-NEXT: $vgpr0 = COPY [[REG_SEQUENCE]] + ; GFX11-TRUE16-LABEL: name: fceil_fneg_s16_vv + ; GFX11-TRUE16: liveins: $vgpr0 + ; GFX11-TRUE16-NEXT: {{ $}} + ; GFX11-TRUE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-TRUE16-NEXT: 
[[COPY1:%[0-9]+]]:vgpr_16 = COPY [[COPY]].lo16 + ; GFX11-TRUE16-NEXT: [[V_CEIL_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_CEIL_F16_t16_e64 1, [[COPY1]], 0, 0, 0, implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: [[DEF:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF + ; GFX11-TRUE16-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[V_CEIL_F16_t16_e64_]], %subreg.lo16, [[DEF]], %subreg.hi16 + ; GFX11-TRUE16-NEXT: $vgpr0 = COPY [[REG_SEQUENCE]] ; ; GFX11-FAKE16-LABEL: name: fceil_fneg_s16_vv ; GFX11-FAKE16: liveins: $vgpr0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ffloor.s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ffloor.s16.mir index fc8a6aaa17512ba6289f8d2ef59d37f89eff7d7f..df62806b61918dd8844e3f2ed469f1bf42daaff2 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ffloor.s16.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ffloor.s16.mir @@ -1,6 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=instruction-select -global-isel-abort=0 -verify-machineinstrs -o - %s | FileCheck -check-prefix=VI %s -# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -run-pass=instruction-select -global-isel-abort=0 -verify-machineinstrs -o - %s | FileCheck -check-prefixes=GCN,GFX11 %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -run-pass=instruction-select -global-isel-abort=0 -verify-machineinstrs -o - %s | FileCheck -check-prefixes=GCN,GFX11-TRUE16 %s # RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -run-pass=instruction-select -global-isel-abort=0 -verify-machineinstrs -o - %s | FileCheck -check-prefixes=GCN,GFX11-FAKE16 %s --- @@ -54,15 +54,15 @@ body: | ; VI-NEXT: [[V_FLOOR_F16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_FLOOR_F16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec ; VI-NEXT: $vgpr0 = COPY [[V_FLOOR_F16_e64_]] ; - ; GFX11-LABEL: name: ffloor_s16_vv - ; GFX11: liveins: $vgpr0 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[COPY]].lo16 - ; GFX11-NEXT: [[V_FLOOR_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_FLOOR_F16_t16_e64 0, [[COPY1]], 0, 0, 0, implicit $mode, implicit $exec - ; GFX11-NEXT: [[DEF:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF - ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[V_FLOOR_F16_t16_e64_]], %subreg.lo16, [[DEF]], %subreg.hi16 - ; GFX11-NEXT: $vgpr0 = COPY [[REG_SEQUENCE]] + ; GFX11-TRUE16-LABEL: name: ffloor_s16_vv + ; GFX11-TRUE16: liveins: $vgpr0 + ; GFX11-TRUE16-NEXT: {{ $}} + ; GFX11-TRUE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-TRUE16-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[COPY]].lo16 + ; GFX11-TRUE16-NEXT: [[V_FLOOR_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_FLOOR_F16_t16_e64 0, [[COPY1]], 0, 0, 0, implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: [[DEF:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF + ; GFX11-TRUE16-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[V_FLOOR_F16_t16_e64_]], %subreg.lo16, [[DEF]], %subreg.hi16 + ; GFX11-TRUE16-NEXT: $vgpr0 = COPY [[REG_SEQUENCE]] ; ; GFX11-FAKE16-LABEL: name: ffloor_s16_vv ; GFX11-FAKE16: liveins: $vgpr0 @@ -94,14 +94,14 @@ body: | ; VI-NEXT: [[V_FLOOR_F16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_FLOOR_F16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec ; VI-NEXT: $vgpr0 = COPY [[V_FLOOR_F16_e64_]] ; - ; GFX11-LABEL: name: ffloor_s16_vs - ; GFX11: liveins: $sgpr0 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; 
GFX11-NEXT: [[V_FLOOR_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_FLOOR_F16_t16_e64 0, [[COPY]], 0, 0, 0, implicit $mode, implicit $exec - ; GFX11-NEXT: [[DEF:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF - ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[V_FLOOR_F16_t16_e64_]], %subreg.lo16, [[DEF]], %subreg.hi16 - ; GFX11-NEXT: $vgpr0 = COPY [[REG_SEQUENCE]] + ; GFX11-TRUE16-LABEL: name: ffloor_s16_vs + ; GFX11-TRUE16: liveins: $sgpr0 + ; GFX11-TRUE16-NEXT: {{ $}} + ; GFX11-TRUE16-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX11-TRUE16-NEXT: [[V_FLOOR_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_FLOOR_F16_t16_e64 0, [[COPY]], 0, 0, 0, implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: [[DEF:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF + ; GFX11-TRUE16-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[V_FLOOR_F16_t16_e64_]], %subreg.lo16, [[DEF]], %subreg.hi16 + ; GFX11-TRUE16-NEXT: $vgpr0 = COPY [[REG_SEQUENCE]] ; ; GFX11-FAKE16-LABEL: name: ffloor_s16_vs ; GFX11-FAKE16: liveins: $sgpr0 @@ -133,15 +133,15 @@ body: | ; VI-NEXT: [[V_FLOOR_F16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_FLOOR_F16_e64 1, [[COPY]], 0, 0, implicit $mode, implicit $exec ; VI-NEXT: $vgpr0 = COPY [[V_FLOOR_F16_e64_]] ; - ; GFX11-LABEL: name: ffloor_fneg_s16_vv - ; GFX11: liveins: $vgpr0 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[COPY]].lo16 - ; GFX11-NEXT: [[V_FLOOR_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_FLOOR_F16_t16_e64 1, [[COPY1]], 0, 0, 0, implicit $mode, implicit $exec - ; GFX11-NEXT: [[DEF:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF - ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[V_FLOOR_F16_t16_e64_]], %subreg.lo16, [[DEF]], %subreg.hi16 - ; GFX11-NEXT: $vgpr0 = COPY [[REG_SEQUENCE]] + ; GFX11-TRUE16-LABEL: name: ffloor_fneg_s16_vv + ; GFX11-TRUE16: liveins: $vgpr0 + ; GFX11-TRUE16-NEXT: {{ $}} + ; GFX11-TRUE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-TRUE16-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[COPY]].lo16 + ; GFX11-TRUE16-NEXT: [[V_FLOOR_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_FLOOR_F16_t16_e64 1, [[COPY1]], 0, 0, 0, implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: [[DEF:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF + ; GFX11-TRUE16-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[V_FLOOR_F16_t16_e64_]], %subreg.lo16, [[DEF]], %subreg.hi16 + ; GFX11-TRUE16-NEXT: $vgpr0 = COPY [[REG_SEQUENCE]] ; ; GFX11-FAKE16-LABEL: name: ffloor_fneg_s16_vv ; GFX11-FAKE16: liveins: $vgpr0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fptosi.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fptosi.mir index 32a73bc4e24a5a9176235abb4865f372fd5397a4..03cb907f82a164c2f205948a6fc6939d1cf6ac77 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fptosi.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fptosi.mir @@ -1,7 +1,8 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=amdgcn -mcpu=hawaii -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck %s -check-prefix=GCN # RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck %s -check-prefix=VI -# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck %s -check-prefix=GFX11 +# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck %s -check-prefixes=GFX11,GFX11-TRUE16 +# RUN: 
llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck %s -check-prefixes=GFX11,GFX11-FAKE16 --- name: fptosi_s32_to_s32_vv @@ -135,13 +136,22 @@ body: | ; VI-NEXT: [[V_CVT_I32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[V_CVT_F32_F16_e64_]], implicit $mode, implicit $exec ; VI-NEXT: $vgpr0 = COPY [[V_CVT_I32_F32_e32_]] ; - ; GFX11-LABEL: name: fptosi_s16_to_s32_vv - ; GFX11: liveins: $vgpr0 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX11-NEXT: [[V_CVT_F32_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_fake16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec - ; GFX11-NEXT: [[V_CVT_I32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[V_CVT_F32_F16_fake16_e64_]], implicit $mode, implicit $exec - ; GFX11-NEXT: $vgpr0 = COPY [[V_CVT_I32_F32_e32_]] + ; GFX11-TRUE16-LABEL: name: fptosi_s16_to_s32_vv + ; GFX11-TRUE16: liveins: $vgpr0 + ; GFX11-TRUE16-NEXT: {{ $}} + ; GFX11-TRUE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-TRUE16-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[COPY]].lo16 + ; GFX11-TRUE16-NEXT: [[V_CVT_F32_F16_t16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_t16_e64 0, [[COPY1]], 0, 0, 0, implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: [[V_CVT_I32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[V_CVT_F32_F16_t16_e64_]], implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: $vgpr0 = COPY [[V_CVT_I32_F32_e32_]] + ; + ; GFX11-FAKE16-LABEL: name: fptosi_s16_to_s32_vv + ; GFX11-FAKE16: liveins: $vgpr0 + ; GFX11-FAKE16-NEXT: {{ $}} + ; GFX11-FAKE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-FAKE16-NEXT: [[V_CVT_F32_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_fake16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec + ; GFX11-FAKE16-NEXT: [[V_CVT_I32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[V_CVT_F32_F16_fake16_e64_]], implicit $mode, implicit $exec + ; GFX11-FAKE16-NEXT: $vgpr0 = COPY [[V_CVT_I32_F32_e32_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s16) = G_TRUNC %0 %2:vgpr(s32) = G_FPTOSI %1 @@ -174,13 +184,21 @@ body: | ; VI-NEXT: [[V_CVT_I32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[V_CVT_F32_F16_e64_]], implicit $mode, implicit $exec ; VI-NEXT: $vgpr0 = COPY [[V_CVT_I32_F32_e32_]] ; - ; GFX11-LABEL: name: fptosi_s16_to_s32_vs - ; GFX11: liveins: $sgpr0 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GFX11-NEXT: [[V_CVT_F32_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_fake16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec - ; GFX11-NEXT: [[V_CVT_I32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[V_CVT_F32_F16_fake16_e64_]], implicit $mode, implicit $exec - ; GFX11-NEXT: $vgpr0 = COPY [[V_CVT_I32_F32_e32_]] + ; GFX11-TRUE16-LABEL: name: fptosi_s16_to_s32_vs + ; GFX11-TRUE16: liveins: $sgpr0 + ; GFX11-TRUE16-NEXT: {{ $}} + ; GFX11-TRUE16-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX11-TRUE16-NEXT: [[V_CVT_F32_F16_t16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_t16_e64 0, [[COPY]], 0, 0, 0, implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: [[V_CVT_I32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[V_CVT_F32_F16_t16_e64_]], implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: $vgpr0 = COPY [[V_CVT_I32_F32_e32_]] + ; + ; GFX11-FAKE16-LABEL: name: fptosi_s16_to_s32_vs + ; GFX11-FAKE16: liveins: $sgpr0 + ; GFX11-FAKE16-NEXT: {{ $}} + ; 
GFX11-FAKE16-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX11-FAKE16-NEXT: [[V_CVT_F32_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_fake16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec + ; GFX11-FAKE16-NEXT: [[V_CVT_I32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[V_CVT_F32_F16_fake16_e64_]], implicit $mode, implicit $exec + ; GFX11-FAKE16-NEXT: $vgpr0 = COPY [[V_CVT_I32_F32_e32_]] %0:sgpr(s32) = COPY $sgpr0 %1:sgpr(s16) = G_TRUNC %0 %2:vgpr(s32) = G_FPTOSI %1 @@ -217,15 +235,25 @@ body: | ; VI-NEXT: [[V_CVT_I32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[V_CVT_F32_F16_e64_]], implicit $mode, implicit $exec ; VI-NEXT: $vgpr0 = COPY [[V_CVT_I32_F32_e32_]] ; - ; GFX11-LABEL: name: fptosi_s16_to_s32_fneg_vv - ; GFX11: liveins: $vgpr0 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX11-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 32768 - ; GFX11-NEXT: [[V_XOR_B32_e64_:%[0-9]+]]:vgpr_32 = V_XOR_B32_e64 [[S_MOV_B32_]], [[COPY]], implicit $exec - ; GFX11-NEXT: [[V_CVT_F32_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_fake16_e64 0, [[V_XOR_B32_e64_]], 0, 0, implicit $mode, implicit $exec - ; GFX11-NEXT: [[V_CVT_I32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[V_CVT_F32_F16_fake16_e64_]], implicit $mode, implicit $exec - ; GFX11-NEXT: $vgpr0 = COPY [[V_CVT_I32_F32_e32_]] + ; GFX11-TRUE16-LABEL: name: fptosi_s16_to_s32_fneg_vv + ; GFX11-TRUE16: liveins: $vgpr0 + ; GFX11-TRUE16-NEXT: {{ $}} + ; GFX11-TRUE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-TRUE16-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[COPY]].lo16 + ; GFX11-TRUE16-NEXT: [[V_XOR_B16_t16_e64_:%[0-9]+]]:vgpr_16 = V_XOR_B16_t16_e64 0, 32768, 0, [[COPY1]], 0, implicit $exec + ; GFX11-TRUE16-NEXT: [[V_CVT_F32_F16_t16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_t16_e64 0, [[V_XOR_B16_t16_e64_]], 0, 0, 0, implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: [[V_CVT_I32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[V_CVT_F32_F16_t16_e64_]], implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: $vgpr0 = COPY [[V_CVT_I32_F32_e32_]] + ; + ; GFX11-FAKE16-LABEL: name: fptosi_s16_to_s32_fneg_vv + ; GFX11-FAKE16: liveins: $vgpr0 + ; GFX11-FAKE16-NEXT: {{ $}} + ; GFX11-FAKE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-FAKE16-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 32768 + ; GFX11-FAKE16-NEXT: [[V_XOR_B32_e64_:%[0-9]+]]:vgpr_32 = V_XOR_B32_e64 [[S_MOV_B32_]], [[COPY]], implicit $exec + ; GFX11-FAKE16-NEXT: [[V_CVT_F32_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_fake16_e64 0, [[V_XOR_B32_e64_]], 0, 0, implicit $mode, implicit $exec + ; GFX11-FAKE16-NEXT: [[V_CVT_I32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[V_CVT_F32_F16_fake16_e64_]], implicit $mode, implicit $exec + ; GFX11-FAKE16-NEXT: $vgpr0 = COPY [[V_CVT_I32_F32_e32_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s16) = G_TRUNC %0 %2:vgpr(s16) = G_FNEG %1 @@ -259,13 +287,23 @@ body: | ; VI-NEXT: [[V_CVT_I32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[V_CVT_F32_F16_e64_]], implicit $mode, implicit $exec ; VI-NEXT: S_ENDPGM 0, implicit [[V_CVT_I32_F32_e32_]] ; - ; GFX11-LABEL: name: fptosi_s16_to_s1_vv - ; GFX11: liveins: $vgpr0 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX11-NEXT: [[V_CVT_F32_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_fake16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec - ; GFX11-NEXT: 
[[V_CVT_I32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[V_CVT_F32_F16_fake16_e64_]], implicit $mode, implicit $exec - ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CVT_I32_F32_e32_]] + ; GFX11-TRUE16-LABEL: name: fptosi_s16_to_s1_vv + ; GFX11-TRUE16: liveins: $vgpr0 + ; GFX11-TRUE16-NEXT: {{ $}} + ; GFX11-TRUE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-TRUE16-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[COPY]].lo16 + ; GFX11-TRUE16-NEXT: [[V_CVT_F32_F16_t16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_t16_e64 0, [[COPY1]], 0, 0, 0, implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: [[V_CVT_I32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[V_CVT_F32_F16_t16_e64_]], implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: [[COPY2:%[0-9]+]]:vgpr_16 = COPY [[V_CVT_I32_F32_e32_]].lo16 + ; GFX11-TRUE16-NEXT: S_ENDPGM 0, implicit [[COPY2]] + ; + ; GFX11-FAKE16-LABEL: name: fptosi_s16_to_s1_vv + ; GFX11-FAKE16: liveins: $vgpr0 + ; GFX11-FAKE16-NEXT: {{ $}} + ; GFX11-FAKE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-FAKE16-NEXT: [[V_CVT_F32_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_fake16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec + ; GFX11-FAKE16-NEXT: [[V_CVT_I32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[V_CVT_F32_F16_fake16_e64_]], implicit $mode, implicit $exec + ; GFX11-FAKE16-NEXT: S_ENDPGM 0, implicit [[V_CVT_I32_F32_e32_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s16) = G_TRUNC %0 %2:vgpr(s32) = G_FPTOSI %1 @@ -299,13 +337,22 @@ body: | ; VI-NEXT: [[V_CVT_I32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[V_CVT_F32_F16_e64_]], implicit $mode, implicit $exec ; VI-NEXT: S_ENDPGM 0, implicit [[V_CVT_I32_F32_e32_]] ; - ; GFX11-LABEL: name: fptosi_s16_to_s1_vs - ; GFX11: liveins: $sgpr0 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GFX11-NEXT: [[V_CVT_F32_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_fake16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec - ; GFX11-NEXT: [[V_CVT_I32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[V_CVT_F32_F16_fake16_e64_]], implicit $mode, implicit $exec - ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CVT_I32_F32_e32_]] + ; GFX11-TRUE16-LABEL: name: fptosi_s16_to_s1_vs + ; GFX11-TRUE16: liveins: $sgpr0 + ; GFX11-TRUE16-NEXT: {{ $}} + ; GFX11-TRUE16-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX11-TRUE16-NEXT: [[V_CVT_F32_F16_t16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_t16_e64 0, [[COPY]], 0, 0, 0, implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: [[V_CVT_I32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[V_CVT_F32_F16_t16_e64_]], implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[V_CVT_I32_F32_e32_]].lo16 + ; GFX11-TRUE16-NEXT: S_ENDPGM 0, implicit [[COPY1]] + ; + ; GFX11-FAKE16-LABEL: name: fptosi_s16_to_s1_vs + ; GFX11-FAKE16: liveins: $sgpr0 + ; GFX11-FAKE16-NEXT: {{ $}} + ; GFX11-FAKE16-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX11-FAKE16-NEXT: [[V_CVT_F32_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_fake16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec + ; GFX11-FAKE16-NEXT: [[V_CVT_I32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[V_CVT_F32_F16_fake16_e64_]], implicit $mode, implicit $exec + ; GFX11-FAKE16-NEXT: S_ENDPGM 0, implicit [[V_CVT_I32_F32_e32_]] %0:sgpr(s32) = COPY $sgpr0 %1:sgpr(s16) = G_TRUNC %0 %2:vgpr(s32) = G_FPTOSI %1 @@ -343,15 +390,26 @@ body: | ; VI-NEXT: 
[[V_CVT_I32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[V_CVT_F32_F16_e64_]], implicit $mode, implicit $exec ; VI-NEXT: S_ENDPGM 0, implicit [[V_CVT_I32_F32_e32_]] ; - ; GFX11-LABEL: name: fptosi_s16_to_s1_fneg_vv - ; GFX11: liveins: $vgpr0 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX11-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 32768 - ; GFX11-NEXT: [[V_XOR_B32_e64_:%[0-9]+]]:vgpr_32 = V_XOR_B32_e64 [[S_MOV_B32_]], [[COPY]], implicit $exec - ; GFX11-NEXT: [[V_CVT_F32_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_fake16_e64 0, [[V_XOR_B32_e64_]], 0, 0, implicit $mode, implicit $exec - ; GFX11-NEXT: [[V_CVT_I32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[V_CVT_F32_F16_fake16_e64_]], implicit $mode, implicit $exec - ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CVT_I32_F32_e32_]] + ; GFX11-TRUE16-LABEL: name: fptosi_s16_to_s1_fneg_vv + ; GFX11-TRUE16: liveins: $vgpr0 + ; GFX11-TRUE16-NEXT: {{ $}} + ; GFX11-TRUE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-TRUE16-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[COPY]].lo16 + ; GFX11-TRUE16-NEXT: [[V_XOR_B16_t16_e64_:%[0-9]+]]:vgpr_16 = V_XOR_B16_t16_e64 0, 32768, 0, [[COPY1]], 0, implicit $exec + ; GFX11-TRUE16-NEXT: [[V_CVT_F32_F16_t16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_t16_e64 0, [[V_XOR_B16_t16_e64_]], 0, 0, 0, implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: [[V_CVT_I32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[V_CVT_F32_F16_t16_e64_]], implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: [[COPY2:%[0-9]+]]:vgpr_16 = COPY [[V_CVT_I32_F32_e32_]].lo16 + ; GFX11-TRUE16-NEXT: S_ENDPGM 0, implicit [[COPY2]] + ; + ; GFX11-FAKE16-LABEL: name: fptosi_s16_to_s1_fneg_vv + ; GFX11-FAKE16: liveins: $vgpr0 + ; GFX11-FAKE16-NEXT: {{ $}} + ; GFX11-FAKE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-FAKE16-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 32768 + ; GFX11-FAKE16-NEXT: [[V_XOR_B32_e64_:%[0-9]+]]:vgpr_32 = V_XOR_B32_e64 [[S_MOV_B32_]], [[COPY]], implicit $exec + ; GFX11-FAKE16-NEXT: [[V_CVT_F32_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_fake16_e64 0, [[V_XOR_B32_e64_]], 0, 0, implicit $mode, implicit $exec + ; GFX11-FAKE16-NEXT: [[V_CVT_I32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[V_CVT_F32_F16_fake16_e64_]], implicit $mode, implicit $exec + ; GFX11-FAKE16-NEXT: S_ENDPGM 0, implicit [[V_CVT_I32_F32_e32_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s16) = G_TRUNC %0 %2:vgpr(s16) = G_FNEG %1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fptoui.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fptoui.mir index 47a091804ce0a65fd6e002536ab5a2b1935d117d..521a0e8a2a7964485fddb82e8b68991976f33b05 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fptoui.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fptoui.mir @@ -1,7 +1,8 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck %s -check-prefix=GCN # RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck %s -check-prefix=VI -# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck %s -check-prefix=GFX11 +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+real-true16 -run-pass=instruction-select -verify-machineinstrs 
-o - %s | FileCheck %s -check-prefixes=GFX11,GFX11-TRUE16 +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-real-true16 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck %s -check-prefixes=GFX11,GFX11-FAKE16 --- @@ -85,13 +86,22 @@ body: | ; VI-NEXT: [[V_CVT_U32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_U32_F32_e32 [[V_CVT_F32_F16_e64_]], implicit $mode, implicit $exec ; VI-NEXT: $vgpr0 = COPY [[V_CVT_U32_F32_e32_]] ; - ; GFX11-LABEL: name: fptoui_s16_to_s32_vv - ; GFX11: liveins: $vgpr0 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX11-NEXT: [[V_CVT_F32_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_fake16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec - ; GFX11-NEXT: [[V_CVT_U32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_U32_F32_e32 [[V_CVT_F32_F16_fake16_e64_]], implicit $mode, implicit $exec - ; GFX11-NEXT: $vgpr0 = COPY [[V_CVT_U32_F32_e32_]] + ; GFX11-TRUE16-LABEL: name: fptoui_s16_to_s32_vv + ; GFX11-TRUE16: liveins: $vgpr0 + ; GFX11-TRUE16-NEXT: {{ $}} + ; GFX11-TRUE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-TRUE16-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[COPY]].lo16 + ; GFX11-TRUE16-NEXT: [[V_CVT_F32_F16_t16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_t16_e64 0, [[COPY1]], 0, 0, 0, implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: [[V_CVT_U32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_U32_F32_e32 [[V_CVT_F32_F16_t16_e64_]], implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: $vgpr0 = COPY [[V_CVT_U32_F32_e32_]] + ; + ; GFX11-FAKE16-LABEL: name: fptoui_s16_to_s32_vv + ; GFX11-FAKE16: liveins: $vgpr0 + ; GFX11-FAKE16-NEXT: {{ $}} + ; GFX11-FAKE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-FAKE16-NEXT: [[V_CVT_F32_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_fake16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec + ; GFX11-FAKE16-NEXT: [[V_CVT_U32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_U32_F32_e32 [[V_CVT_F32_F16_fake16_e64_]], implicit $mode, implicit $exec + ; GFX11-FAKE16-NEXT: $vgpr0 = COPY [[V_CVT_U32_F32_e32_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s16) = G_TRUNC %0 %2:vgpr(s32) = G_FPTOUI %1 @@ -124,13 +134,21 @@ body: | ; VI-NEXT: [[V_CVT_U32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_U32_F32_e32 [[V_CVT_F32_F16_e64_]], implicit $mode, implicit $exec ; VI-NEXT: $vgpr0 = COPY [[V_CVT_U32_F32_e32_]] ; - ; GFX11-LABEL: name: fptoui_s16_to_s32_vs - ; GFX11: liveins: $sgpr0 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GFX11-NEXT: [[V_CVT_F32_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_fake16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec - ; GFX11-NEXT: [[V_CVT_U32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_U32_F32_e32 [[V_CVT_F32_F16_fake16_e64_]], implicit $mode, implicit $exec - ; GFX11-NEXT: $vgpr0 = COPY [[V_CVT_U32_F32_e32_]] + ; GFX11-TRUE16-LABEL: name: fptoui_s16_to_s32_vs + ; GFX11-TRUE16: liveins: $sgpr0 + ; GFX11-TRUE16-NEXT: {{ $}} + ; GFX11-TRUE16-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX11-TRUE16-NEXT: [[V_CVT_F32_F16_t16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_t16_e64 0, [[COPY]], 0, 0, 0, implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: [[V_CVT_U32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_U32_F32_e32 [[V_CVT_F32_F16_t16_e64_]], implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: $vgpr0 = COPY [[V_CVT_U32_F32_e32_]] + ; + ; GFX11-FAKE16-LABEL: name: fptoui_s16_to_s32_vs + ; GFX11-FAKE16: liveins: $sgpr0 + 
; GFX11-FAKE16-NEXT: {{ $}} + ; GFX11-FAKE16-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX11-FAKE16-NEXT: [[V_CVT_F32_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_fake16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec + ; GFX11-FAKE16-NEXT: [[V_CVT_U32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_U32_F32_e32 [[V_CVT_F32_F16_fake16_e64_]], implicit $mode, implicit $exec + ; GFX11-FAKE16-NEXT: $vgpr0 = COPY [[V_CVT_U32_F32_e32_]] %0:sgpr(s32) = COPY $sgpr0 %1:sgpr(s16) = G_TRUNC %0 %2:vgpr(s32) = G_FPTOUI %1 @@ -167,15 +185,25 @@ body: | ; VI-NEXT: [[V_CVT_U32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_U32_F32_e32 [[V_CVT_F32_F16_e64_]], implicit $mode, implicit $exec ; VI-NEXT: $vgpr0 = COPY [[V_CVT_U32_F32_e32_]] ; - ; GFX11-LABEL: name: fptoui_s16_to_s32_fneg_vv - ; GFX11: liveins: $vgpr0 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX11-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 32768 - ; GFX11-NEXT: [[V_XOR_B32_e64_:%[0-9]+]]:vgpr_32 = V_XOR_B32_e64 [[S_MOV_B32_]], [[COPY]], implicit $exec - ; GFX11-NEXT: [[V_CVT_F32_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_fake16_e64 0, [[V_XOR_B32_e64_]], 0, 0, implicit $mode, implicit $exec - ; GFX11-NEXT: [[V_CVT_U32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_U32_F32_e32 [[V_CVT_F32_F16_fake16_e64_]], implicit $mode, implicit $exec - ; GFX11-NEXT: $vgpr0 = COPY [[V_CVT_U32_F32_e32_]] + ; GFX11-TRUE16-LABEL: name: fptoui_s16_to_s32_fneg_vv + ; GFX11-TRUE16: liveins: $vgpr0 + ; GFX11-TRUE16-NEXT: {{ $}} + ; GFX11-TRUE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-TRUE16-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[COPY]].lo16 + ; GFX11-TRUE16-NEXT: [[V_XOR_B16_t16_e64_:%[0-9]+]]:vgpr_16 = V_XOR_B16_t16_e64 0, 32768, 0, [[COPY1]], 0, implicit $exec + ; GFX11-TRUE16-NEXT: [[V_CVT_F32_F16_t16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_t16_e64 0, [[V_XOR_B16_t16_e64_]], 0, 0, 0, implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: [[V_CVT_U32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_U32_F32_e32 [[V_CVT_F32_F16_t16_e64_]], implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: $vgpr0 = COPY [[V_CVT_U32_F32_e32_]] + ; + ; GFX11-FAKE16-LABEL: name: fptoui_s16_to_s32_fneg_vv + ; GFX11-FAKE16: liveins: $vgpr0 + ; GFX11-FAKE16-NEXT: {{ $}} + ; GFX11-FAKE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-FAKE16-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 32768 + ; GFX11-FAKE16-NEXT: [[V_XOR_B32_e64_:%[0-9]+]]:vgpr_32 = V_XOR_B32_e64 [[S_MOV_B32_]], [[COPY]], implicit $exec + ; GFX11-FAKE16-NEXT: [[V_CVT_F32_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_fake16_e64 0, [[V_XOR_B32_e64_]], 0, 0, implicit $mode, implicit $exec + ; GFX11-FAKE16-NEXT: [[V_CVT_U32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_U32_F32_e32 [[V_CVT_F32_F16_fake16_e64_]], implicit $mode, implicit $exec + ; GFX11-FAKE16-NEXT: $vgpr0 = COPY [[V_CVT_U32_F32_e32_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s16) = G_TRUNC %0 %2:vgpr(s16) = G_FNEG %1 @@ -209,13 +237,23 @@ body: | ; VI-NEXT: [[V_CVT_U32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_U32_F32_e32 [[V_CVT_F32_F16_e64_]], implicit $mode, implicit $exec ; VI-NEXT: S_ENDPGM 0, implicit [[V_CVT_U32_F32_e32_]] ; - ; GFX11-LABEL: name: fptoui_s16_to_s1_vv - ; GFX11: liveins: $vgpr0 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX11-NEXT: [[V_CVT_F32_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_fake16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit 
$exec - ; GFX11-NEXT: [[V_CVT_U32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_U32_F32_e32 [[V_CVT_F32_F16_fake16_e64_]], implicit $mode, implicit $exec - ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CVT_U32_F32_e32_]] + ; GFX11-TRUE16-LABEL: name: fptoui_s16_to_s1_vv + ; GFX11-TRUE16: liveins: $vgpr0 + ; GFX11-TRUE16-NEXT: {{ $}} + ; GFX11-TRUE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-TRUE16-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[COPY]].lo16 + ; GFX11-TRUE16-NEXT: [[V_CVT_F32_F16_t16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_t16_e64 0, [[COPY1]], 0, 0, 0, implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: [[V_CVT_U32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_U32_F32_e32 [[V_CVT_F32_F16_t16_e64_]], implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: [[COPY2:%[0-9]+]]:vgpr_16 = COPY [[V_CVT_U32_F32_e32_]].lo16 + ; GFX11-TRUE16-NEXT: S_ENDPGM 0, implicit [[COPY2]] + ; + ; GFX11-FAKE16-LABEL: name: fptoui_s16_to_s1_vv + ; GFX11-FAKE16: liveins: $vgpr0 + ; GFX11-FAKE16-NEXT: {{ $}} + ; GFX11-FAKE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-FAKE16-NEXT: [[V_CVT_F32_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_fake16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec + ; GFX11-FAKE16-NEXT: [[V_CVT_U32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_U32_F32_e32 [[V_CVT_F32_F16_fake16_e64_]], implicit $mode, implicit $exec + ; GFX11-FAKE16-NEXT: S_ENDPGM 0, implicit [[V_CVT_U32_F32_e32_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s16) = G_TRUNC %0 %2:vgpr(s32) = G_FPTOUI %1 @@ -249,13 +287,22 @@ body: | ; VI-NEXT: [[V_CVT_U32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_U32_F32_e32 [[V_CVT_F32_F16_e64_]], implicit $mode, implicit $exec ; VI-NEXT: S_ENDPGM 0, implicit [[V_CVT_U32_F32_e32_]] ; - ; GFX11-LABEL: name: fptoui_s16_to_s1_vs - ; GFX11: liveins: $sgpr0 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GFX11-NEXT: [[V_CVT_F32_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_fake16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec - ; GFX11-NEXT: [[V_CVT_U32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_U32_F32_e32 [[V_CVT_F32_F16_fake16_e64_]], implicit $mode, implicit $exec - ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CVT_U32_F32_e32_]] + ; GFX11-TRUE16-LABEL: name: fptoui_s16_to_s1_vs + ; GFX11-TRUE16: liveins: $sgpr0 + ; GFX11-TRUE16-NEXT: {{ $}} + ; GFX11-TRUE16-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX11-TRUE16-NEXT: [[V_CVT_F32_F16_t16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_t16_e64 0, [[COPY]], 0, 0, 0, implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: [[V_CVT_U32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_U32_F32_e32 [[V_CVT_F32_F16_t16_e64_]], implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[V_CVT_U32_F32_e32_]].lo16 + ; GFX11-TRUE16-NEXT: S_ENDPGM 0, implicit [[COPY1]] + ; + ; GFX11-FAKE16-LABEL: name: fptoui_s16_to_s1_vs + ; GFX11-FAKE16: liveins: $sgpr0 + ; GFX11-FAKE16-NEXT: {{ $}} + ; GFX11-FAKE16-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX11-FAKE16-NEXT: [[V_CVT_F32_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_fake16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec + ; GFX11-FAKE16-NEXT: [[V_CVT_U32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_U32_F32_e32 [[V_CVT_F32_F16_fake16_e64_]], implicit $mode, implicit $exec + ; GFX11-FAKE16-NEXT: S_ENDPGM 0, implicit [[V_CVT_U32_F32_e32_]] %0:sgpr(s32) = COPY $sgpr0 %1:sgpr(s16) = G_TRUNC %0 %2:vgpr(s32) = G_FPTOUI %1 @@ -293,15 +340,26 @@ 
body: | ; VI-NEXT: [[V_CVT_U32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_U32_F32_e32 [[V_CVT_F32_F16_e64_]], implicit $mode, implicit $exec ; VI-NEXT: S_ENDPGM 0, implicit [[V_CVT_U32_F32_e32_]] ; - ; GFX11-LABEL: name: fptoui_s16_to_s1_fneg_vv - ; GFX11: liveins: $vgpr0 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX11-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 32768 - ; GFX11-NEXT: [[V_XOR_B32_e64_:%[0-9]+]]:vgpr_32 = V_XOR_B32_e64 [[S_MOV_B32_]], [[COPY]], implicit $exec - ; GFX11-NEXT: [[V_CVT_F32_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_fake16_e64 0, [[V_XOR_B32_e64_]], 0, 0, implicit $mode, implicit $exec - ; GFX11-NEXT: [[V_CVT_U32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_U32_F32_e32 [[V_CVT_F32_F16_fake16_e64_]], implicit $mode, implicit $exec - ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CVT_U32_F32_e32_]] + ; GFX11-TRUE16-LABEL: name: fptoui_s16_to_s1_fneg_vv + ; GFX11-TRUE16: liveins: $vgpr0 + ; GFX11-TRUE16-NEXT: {{ $}} + ; GFX11-TRUE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-TRUE16-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[COPY]].lo16 + ; GFX11-TRUE16-NEXT: [[V_XOR_B16_t16_e64_:%[0-9]+]]:vgpr_16 = V_XOR_B16_t16_e64 0, 32768, 0, [[COPY1]], 0, implicit $exec + ; GFX11-TRUE16-NEXT: [[V_CVT_F32_F16_t16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_t16_e64 0, [[V_XOR_B16_t16_e64_]], 0, 0, 0, implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: [[V_CVT_U32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_U32_F32_e32 [[V_CVT_F32_F16_t16_e64_]], implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: [[COPY2:%[0-9]+]]:vgpr_16 = COPY [[V_CVT_U32_F32_e32_]].lo16 + ; GFX11-TRUE16-NEXT: S_ENDPGM 0, implicit [[COPY2]] + ; + ; GFX11-FAKE16-LABEL: name: fptoui_s16_to_s1_fneg_vv + ; GFX11-FAKE16: liveins: $vgpr0 + ; GFX11-FAKE16-NEXT: {{ $}} + ; GFX11-FAKE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-FAKE16-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 32768 + ; GFX11-FAKE16-NEXT: [[V_XOR_B32_e64_:%[0-9]+]]:vgpr_32 = V_XOR_B32_e64 [[S_MOV_B32_]], [[COPY]], implicit $exec + ; GFX11-FAKE16-NEXT: [[V_CVT_F32_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_fake16_e64 0, [[V_XOR_B32_e64_]], 0, 0, implicit $mode, implicit $exec + ; GFX11-FAKE16-NEXT: [[V_CVT_U32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_U32_F32_e32 [[V_CVT_F32_F16_fake16_e64_]], implicit $mode, implicit $exec + ; GFX11-FAKE16-NEXT: S_ENDPGM 0, implicit [[V_CVT_U32_F32_e32_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s16) = G_TRUNC %0 %2:vgpr(s16) = G_FNEG %1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-sitofp.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-sitofp.mir index 938bb58bafc9365fdfa524bec3d3424d9dfffcdd..3888ce87b46fd9cdf2853a3ff41f5c9f1f2fc748 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-sitofp.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-sitofp.mir @@ -1,7 +1,8 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefix=WAVE64 %s # RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefix=WAVE32 %s -# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefix=GFX11 %s +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+real-true16 
-run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-real-true16 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s --- @@ -85,13 +86,23 @@ body: | ; WAVE32-NEXT: [[V_CVT_F16_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_e64 0, [[V_CVT_F32_I32_e32_]], 0, 0, implicit $mode, implicit $exec ; WAVE32-NEXT: $vgpr0 = COPY [[V_CVT_F16_F32_e64_]] ; - ; GFX11-LABEL: name: sitofp_s32_to_s16_vv - ; GFX11: liveins: $vgpr0 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX11-NEXT: [[V_CVT_F32_I32_e32_:%[0-9]+]]:vgpr_32 = V_CVT_F32_I32_e32 [[COPY]], implicit $mode, implicit $exec - ; GFX11-NEXT: [[V_CVT_F16_F32_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_fake16_e64 0, [[V_CVT_F32_I32_e32_]], 0, 0, implicit $mode, implicit $exec - ; GFX11-NEXT: $vgpr0 = COPY [[V_CVT_F16_F32_fake16_e64_]] + ; GFX11-TRUE16-LABEL: name: sitofp_s32_to_s16_vv + ; GFX11-TRUE16: liveins: $vgpr0 + ; GFX11-TRUE16-NEXT: {{ $}} + ; GFX11-TRUE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-TRUE16-NEXT: [[V_CVT_F32_I32_e32_:%[0-9]+]]:vgpr_32 = V_CVT_F32_I32_e32 [[COPY]], implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: [[V_CVT_F16_F32_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_CVT_F16_F32_t16_e64 0, [[V_CVT_F32_I32_e32_]], 0, 0, 0, implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: [[DEF:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF + ; GFX11-TRUE16-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[V_CVT_F16_F32_t16_e64_]], %subreg.lo16, [[DEF]], %subreg.hi16 + ; GFX11-TRUE16-NEXT: $vgpr0 = COPY [[REG_SEQUENCE]] + ; + ; GFX11-FAKE16-LABEL: name: sitofp_s32_to_s16_vv + ; GFX11-FAKE16: liveins: $vgpr0 + ; GFX11-FAKE16-NEXT: {{ $}} + ; GFX11-FAKE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-FAKE16-NEXT: [[V_CVT_F32_I32_e32_:%[0-9]+]]:vgpr_32 = V_CVT_F32_I32_e32 [[COPY]], implicit $mode, implicit $exec + ; GFX11-FAKE16-NEXT: [[V_CVT_F16_F32_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_fake16_e64 0, [[V_CVT_F32_I32_e32_]], 0, 0, implicit $mode, implicit $exec + ; GFX11-FAKE16-NEXT: $vgpr0 = COPY [[V_CVT_F16_F32_fake16_e64_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s16) = G_SITOFP %0 %2:vgpr(s32) = G_ANYEXT %1 @@ -124,13 +135,23 @@ body: | ; WAVE32-NEXT: [[V_CVT_F16_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_e64 0, [[V_CVT_F32_I32_e32_]], 0, 0, implicit $mode, implicit $exec ; WAVE32-NEXT: $vgpr0 = COPY [[V_CVT_F16_F32_e64_]] ; - ; GFX11-LABEL: name: sitofp_s32_to_s16_vs - ; GFX11: liveins: $sgpr0 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GFX11-NEXT: [[V_CVT_F32_I32_e32_:%[0-9]+]]:vgpr_32 = V_CVT_F32_I32_e32 [[COPY]], implicit $mode, implicit $exec - ; GFX11-NEXT: [[V_CVT_F16_F32_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_fake16_e64 0, [[V_CVT_F32_I32_e32_]], 0, 0, implicit $mode, implicit $exec - ; GFX11-NEXT: $vgpr0 = COPY [[V_CVT_F16_F32_fake16_e64_]] + ; GFX11-TRUE16-LABEL: name: sitofp_s32_to_s16_vs + ; GFX11-TRUE16: liveins: $sgpr0 + ; GFX11-TRUE16-NEXT: {{ $}} + ; GFX11-TRUE16-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX11-TRUE16-NEXT: [[V_CVT_F32_I32_e32_:%[0-9]+]]:vgpr_32 = V_CVT_F32_I32_e32 [[COPY]], implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: [[V_CVT_F16_F32_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_CVT_F16_F32_t16_e64 0, [[V_CVT_F32_I32_e32_]], 0, 0, 0, implicit $mode, implicit $exec + ; 
GFX11-TRUE16-NEXT: [[DEF:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF + ; GFX11-TRUE16-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[V_CVT_F16_F32_t16_e64_]], %subreg.lo16, [[DEF]], %subreg.hi16 + ; GFX11-TRUE16-NEXT: $vgpr0 = COPY [[REG_SEQUENCE]] + ; + ; GFX11-FAKE16-LABEL: name: sitofp_s32_to_s16_vs + ; GFX11-FAKE16: liveins: $sgpr0 + ; GFX11-FAKE16-NEXT: {{ $}} + ; GFX11-FAKE16-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX11-FAKE16-NEXT: [[V_CVT_F32_I32_e32_:%[0-9]+]]:vgpr_32 = V_CVT_F32_I32_e32 [[COPY]], implicit $mode, implicit $exec + ; GFX11-FAKE16-NEXT: [[V_CVT_F16_F32_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_fake16_e64 0, [[V_CVT_F32_I32_e32_]], 0, 0, implicit $mode, implicit $exec + ; GFX11-FAKE16-NEXT: $vgpr0 = COPY [[V_CVT_F16_F32_fake16_e64_]] %0:sgpr(s32) = COPY $sgpr0 %1:vgpr(s16) = G_SITOFP %0 %2:vgpr(s32) = G_ANYEXT %1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-uitofp.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-uitofp.mir index 9c6fded0d142535b34abab9c6581f4009ef0c15d..35d622dc57d1814efd7c41cb9daa12ce80161e80 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-uitofp.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-uitofp.mir @@ -1,7 +1,8 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=amdgcn -mcpu=hawaii -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefix=WAVE64 %s # RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefix=WAVE32 %s -# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefix=GFX11 %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s --- name: uitofp_s32_to_s32_vv @@ -99,13 +100,23 @@ body: | ; WAVE32-NEXT: [[V_CVT_F16_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_e64 0, [[V_CVT_F32_U32_e32_]], 0, 0, implicit $mode, implicit $exec ; WAVE32-NEXT: $vgpr0 = COPY [[V_CVT_F16_F32_e64_]] ; - ; GFX11-LABEL: name: uitofp_s32_to_s16_vv - ; GFX11: liveins: $vgpr0 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX11-NEXT: [[V_CVT_F32_U32_e32_:%[0-9]+]]:vgpr_32 = V_CVT_F32_U32_e32 [[COPY]], implicit $mode, implicit $exec - ; GFX11-NEXT: [[V_CVT_F16_F32_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_fake16_e64 0, [[V_CVT_F32_U32_e32_]], 0, 0, implicit $mode, implicit $exec - ; GFX11-NEXT: $vgpr0 = COPY [[V_CVT_F16_F32_fake16_e64_]] + ; GFX11-TRUE16-LABEL: name: uitofp_s32_to_s16_vv + ; GFX11-TRUE16: liveins: $vgpr0 + ; GFX11-TRUE16-NEXT: {{ $}} + ; GFX11-TRUE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-TRUE16-NEXT: [[V_CVT_F32_U32_e32_:%[0-9]+]]:vgpr_32 = V_CVT_F32_U32_e32 [[COPY]], implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: [[V_CVT_F16_F32_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_CVT_F16_F32_t16_e64 0, [[V_CVT_F32_U32_e32_]], 0, 0, 0, implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: [[DEF:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF + ; GFX11-TRUE16-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[V_CVT_F16_F32_t16_e64_]], %subreg.lo16, [[DEF]], %subreg.hi16 + ; GFX11-TRUE16-NEXT: $vgpr0 = COPY [[REG_SEQUENCE]] + ; + ; 
GFX11-FAKE16-LABEL: name: uitofp_s32_to_s16_vv + ; GFX11-FAKE16: liveins: $vgpr0 + ; GFX11-FAKE16-NEXT: {{ $}} + ; GFX11-FAKE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-FAKE16-NEXT: [[V_CVT_F32_U32_e32_:%[0-9]+]]:vgpr_32 = V_CVT_F32_U32_e32 [[COPY]], implicit $mode, implicit $exec + ; GFX11-FAKE16-NEXT: [[V_CVT_F16_F32_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_fake16_e64 0, [[V_CVT_F32_U32_e32_]], 0, 0, implicit $mode, implicit $exec + ; GFX11-FAKE16-NEXT: $vgpr0 = COPY [[V_CVT_F16_F32_fake16_e64_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s16) = G_UITOFP %0 %2:vgpr(s32) = G_ANYEXT %1 @@ -138,13 +149,23 @@ body: | ; WAVE32-NEXT: [[V_CVT_F16_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_e64 0, [[V_CVT_F32_U32_e32_]], 0, 0, implicit $mode, implicit $exec ; WAVE32-NEXT: $vgpr0 = COPY [[V_CVT_F16_F32_e64_]] ; - ; GFX11-LABEL: name: uitofp_s32_to_s16_vs - ; GFX11: liveins: $sgpr0 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GFX11-NEXT: [[V_CVT_F32_U32_e32_:%[0-9]+]]:vgpr_32 = V_CVT_F32_U32_e32 [[COPY]], implicit $mode, implicit $exec - ; GFX11-NEXT: [[V_CVT_F16_F32_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_fake16_e64 0, [[V_CVT_F32_U32_e32_]], 0, 0, implicit $mode, implicit $exec - ; GFX11-NEXT: $vgpr0 = COPY [[V_CVT_F16_F32_fake16_e64_]] + ; GFX11-TRUE16-LABEL: name: uitofp_s32_to_s16_vs + ; GFX11-TRUE16: liveins: $sgpr0 + ; GFX11-TRUE16-NEXT: {{ $}} + ; GFX11-TRUE16-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX11-TRUE16-NEXT: [[V_CVT_F32_U32_e32_:%[0-9]+]]:vgpr_32 = V_CVT_F32_U32_e32 [[COPY]], implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: [[V_CVT_F16_F32_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_CVT_F16_F32_t16_e64 0, [[V_CVT_F32_U32_e32_]], 0, 0, 0, implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: [[DEF:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF + ; GFX11-TRUE16-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[V_CVT_F16_F32_t16_e64_]], %subreg.lo16, [[DEF]], %subreg.hi16 + ; GFX11-TRUE16-NEXT: $vgpr0 = COPY [[REG_SEQUENCE]] + ; + ; GFX11-FAKE16-LABEL: name: uitofp_s32_to_s16_vs + ; GFX11-FAKE16: liveins: $sgpr0 + ; GFX11-FAKE16-NEXT: {{ $}} + ; GFX11-FAKE16-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX11-FAKE16-NEXT: [[V_CVT_F32_U32_e32_:%[0-9]+]]:vgpr_32 = V_CVT_F32_U32_e32 [[COPY]], implicit $mode, implicit $exec + ; GFX11-FAKE16-NEXT: [[V_CVT_F16_F32_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_fake16_e64 0, [[V_CVT_F32_U32_e32_]], 0, 0, implicit $mode, implicit $exec + ; GFX11-FAKE16-NEXT: $vgpr0 = COPY [[V_CVT_F16_F32_fake16_e64_]] %0:sgpr(s32) = COPY $sgpr0 %1:vgpr(s16) = G_UITOFP %0 %2:vgpr(s32) = G_ANYEXT %1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.powi.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.powi.ll index 9d586e3e4a09a400a549d95b6c0ad305587bc215..eeb7b138fde31a218e3c2e297ec3bd35bc8b0ac4 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.powi.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.powi.ll @@ -1,7 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii < %s | FileCheck -check-prefixes=GFX78,GFX7 %s ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji < %s | FileCheck -check-prefixes=GFX78,GFX8 %s -; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck 
-check-prefixes=GFX11,GFX11-TRUE16 %s +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s define i16 @v_powi_f16(i16 %l, i32 %r) { ; GFX7-LABEL: v_powi_f16: @@ -36,21 +37,37 @@ define i16 @v_powi_f16(i16 %l, i32 %r) { ; GFX8-NEXT: v_exp_f16_e32 v0, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_powi_f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_log_f16_e32 v0, v0 -; GFX11-NEXT: v_cvt_f32_i32_e32 v1, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX11-NEXT: v_mul_dx9_zero_f32_e32 v0, v0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX11-NEXT: v_exp_f16_e32 v0, v0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: v_powi_f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_log_f16_e32 v0.l, v0.l +; GFX11-TRUE16-NEXT: v_cvt_f32_i32_e32 v1, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v1 +; GFX11-TRUE16-NEXT: s_waitcnt_depctr 0xfff +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v1, v0.l +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v0.h +; GFX11-TRUE16-NEXT: v_mul_dx9_zero_f32_e32 v0, v1, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 +; GFX11-TRUE16-NEXT: v_exp_f16_e32 v0.l, v0.l +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: v_powi_f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_log_f16_e32 v0, v0 +; GFX11-FAKE16-NEXT: v_cvt_f32_i32_e32 v1, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX11-FAKE16-NEXT: s_waitcnt_depctr 0xfff +; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX11-FAKE16-NEXT: v_mul_dx9_zero_f32_e32 v0, v0, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX11-FAKE16-NEXT: v_exp_f16_e32 v0, v0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %l.cast = bitcast i16 %l to half %res = call half @llvm.powi.f16.i32(half %l.cast, i32 %r) %res.cast = bitcast half %res to i16 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-redundant-and.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-redundant-and.mir new file mode 100644 index 0000000000000000000000000000000000000000..f87a253dcb4333dff15975fdf03e156b874a84ff --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-redundant-and.mir @@ -0,0 +1,28 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=amdgcn-amd-mesa3d -mcpu=gfx1010 -run-pass=amdgpu-regbank-combiner -verify-machineinstrs %s -o - | FileCheck %s + +--- +name: replaceRegWith_requires_copy +tracksRegLiveness: true +body: | + bb.0: + liveins: $sgpr0, $vgpr0_vgpr1 + + ; CHECK-LABEL: name: replaceRegWith_requires_copy + ; CHECK: liveins: $sgpr0, $vgpr0_vgpr1 + ; 
CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $vgpr0_vgpr1 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 + ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: [[ICMP:%[0-9]+]]:sreg_32(s32) = G_ICMP intpred(ne), [[COPY1]](s32), [[C]] + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY [[ICMP]](s32) + ; CHECK-NEXT: G_STORE [[COPY2]](s32), [[COPY]](p1) :: (store (s32), addrspace 1) + ; CHECK-NEXT: S_ENDPGM 0 + %0:sgpr(p1) = COPY $vgpr0_vgpr1 + %1:sgpr(s32) = COPY $sgpr0 + %2:sgpr(s32) = G_CONSTANT i32 1 + %3:sreg_32(s32) = G_ICMP intpred(ne), %1, %2 + %4:sgpr(s32) = G_AND %3, %2 + G_STORE %4(s32), %0(p1) :: (store (s32), addrspace 1) + S_ENDPGM 0 +... diff --git a/llvm/test/CodeGen/AMDGPU/andorbitset.ll b/llvm/test/CodeGen/AMDGPU/andorbitset.ll index a189ba9b103421eb72cfa992596f2169cb4525b3..0fa58f3c444a5401ce4b78389122bedcb4339f63 100644 --- a/llvm/test/CodeGen/AMDGPU/andorbitset.ll +++ b/llvm/test/CodeGen/AMDGPU/andorbitset.ll @@ -1,48 +1,103 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s -; SI-LABEL: {{^}}s_clear_msb: -; SI: s_bitset0_b32 s{{[0-9]+}}, 31 define amdgpu_kernel void @s_clear_msb(ptr addrspace(1) %out, i32 %in) { +; SI-LABEL: s_clear_msb: +; SI: ; %bb.0: +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_bitset0_b32 s4, 31 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm %x = and i32 %in, 2147483647 store i32 %x, ptr addrspace(1) %out ret void } -; SI-LABEL: {{^}}s_set_msb: -; SI: s_bitset1_b32 s{{[0-9]+}}, 31 define amdgpu_kernel void @s_set_msb(ptr addrspace(1) %out, i32 %in) { +; SI-LABEL: s_set_msb: +; SI: ; %bb.0: +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_bitset1_b32 s4, 31 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm %x = or i32 %in, 2147483648 store i32 %x, ptr addrspace(1) %out ret void } -; SI-LABEL: {{^}}s_clear_lsb: -; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, -2 define amdgpu_kernel void @s_clear_lsb(ptr addrspace(1) %out, i32 %in) { +; SI-LABEL: s_clear_lsb: +; SI: ; %bb.0: +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_and_b32 s4, s4, -2 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm %x = and i32 %in, 4294967294 store i32 %x, ptr addrspace(1) %out ret void } -; SI-LABEL: {{^}}s_set_lsb: -; SI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, 1 define amdgpu_kernel void @s_set_lsb(ptr addrspace(1) %out, i32 %in) { +; SI-LABEL: s_set_lsb: +; SI: ; %bb.0: +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_or_b32 s4, s4, 1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm %x = or i32 %in, 1 store i32 %x, ptr addrspace(1) %out ret void } -; SI-LABEL: {{^}}s_clear_midbit: 
-; SI: s_bitset0_b32 s{{[0-9]+}}, 8 define amdgpu_kernel void @s_clear_midbit(ptr addrspace(1) %out, i32 %in) { +; SI-LABEL: s_clear_midbit: +; SI: ; %bb.0: +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_bitset0_b32 s4, 8 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm %x = and i32 %in, 4294967039 store i32 %x, ptr addrspace(1) %out ret void } -; SI-LABEL: {{^}}s_set_midbit: -; SI: s_bitset1_b32 s{{[0-9]+}}, 8 define amdgpu_kernel void @s_set_midbit(ptr addrspace(1) %out, i32 %in) { +; SI-LABEL: s_set_midbit: +; SI: ; %bb.0: +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_bitset1_b32 s4, 8 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm %x = or i32 %in, 256 store i32 %x, ptr addrspace(1) %out ret void @@ -51,10 +106,27 @@ define amdgpu_kernel void @s_set_midbit(ptr addrspace(1) %out, i32 %in) { @gv = external addrspace(1) global i32 ; Make sure there's no verifier error with an undef source. -; SI-LABEL: {{^}}bitset_verifier_error: -; SI-NOT: %bb.1: -; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x7fffffff define void @bitset_verifier_error() local_unnamed_addr #0 { +; SI-LABEL: bitset_verifier_error: +; SI: ; %bb.0: ; %bb +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_getpc_b64 s[4:5] +; SI-NEXT: s_add_u32 s4, s4, gv@gotpcrel32@lo+4 +; SI-NEXT: s_addc_u32 s5, s5, gv@gotpcrel32@hi+12 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_and_b32 s8, s4, 0x7fffffff +; SI-NEXT: v_mov_b32_e32 v0, s8 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, 0x3f7fbe77 +; SI-NEXT: v_cmp_ge_f32_e64 s[4:5], |s4|, v0 +; SI-NEXT: s_and_b64 vcc, exec, s[4:5] +; SI-NEXT: s_cbranch_vccnz .LBB6_2 +; SI-NEXT: ; %bb.1: ; %bb5 +; SI-NEXT: .LBB6_2: ; %bb6 bb: %i = call float @llvm.fabs.f32(float undef) #0 %i1 = bitcast float %i to i32 diff --git a/llvm/test/CodeGen/AMDGPU/andorxorinvimm.ll b/llvm/test/CodeGen/AMDGPU/andorxorinvimm.ll index dc158028bd7b0596a45c5bbc15dfea9ec8b01a39..4b56b5e9d24f5c65367bf2b827b367478f249594 100644 --- a/llvm/test/CodeGen/AMDGPU/andorxorinvimm.ll +++ b/llvm/test/CodeGen/AMDGPU/andorxorinvimm.ll @@ -1,48 +1,103 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple=amdgcn -mcpu=tahiti -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s -; SI-LABEL: {{^}}s_or_to_orn2: -; SI: s_orn2_b32 s{{[0-9]+}}, s{{[0-9]+}}, 50 define amdgpu_kernel void @s_or_to_orn2(ptr addrspace(1) %out, i32 %in) { +; SI-LABEL: s_or_to_orn2: +; SI: ; %bb.0: +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_orn2_b32 s4, s4, 50 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm %x = or i32 %in, -51 store i32 %x, ptr addrspace(1) %out ret void } -; SI-LABEL: {{^}}s_or_to_orn2_imm0: -; SI: s_orn2_b32 s{{[0-9]+}}, s{{[0-9]+}}, 50 define 
amdgpu_kernel void @s_or_to_orn2_imm0(ptr addrspace(1) %out, i32 %in) { +; SI-LABEL: s_or_to_orn2_imm0: +; SI: ; %bb.0: +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_orn2_b32 s4, s4, 50 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm %x = or i32 -51, %in store i32 %x, ptr addrspace(1) %out ret void } -; SI-LABEL: {{^}}s_and_to_andn2: -; SI: s_andn2_b32 s{{[0-9]+}}, s{{[0-9]+}}, 50 define amdgpu_kernel void @s_and_to_andn2(ptr addrspace(1) %out, i32 %in) { +; SI-LABEL: s_and_to_andn2: +; SI: ; %bb.0: +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_andn2_b32 s4, s4, 50 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm %x = and i32 %in, -51 store i32 %x, ptr addrspace(1) %out ret void } -; SI-LABEL: {{^}}s_and_to_andn2_imm0: -; SI: s_andn2_b32 s{{[0-9]+}}, s{{[0-9]+}}, 50 define amdgpu_kernel void @s_and_to_andn2_imm0(ptr addrspace(1) %out, i32 %in) { +; SI-LABEL: s_and_to_andn2_imm0: +; SI: ; %bb.0: +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_andn2_b32 s4, s4, 50 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm %x = and i32 -51, %in store i32 %x, ptr addrspace(1) %out ret void } -; SI-LABEL: {{^}}s_xor_to_xnor: -; SI: s_xnor_b32 s{{[0-9]+}}, s{{[0-9]+}}, 50 define amdgpu_kernel void @s_xor_to_xnor(ptr addrspace(1) %out, i32 %in) { +; SI-LABEL: s_xor_to_xnor: +; SI: ; %bb.0: +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_xnor_b32 s4, s4, 50 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm %x = xor i32 %in, -51 store i32 %x, ptr addrspace(1) %out ret void } -; SI-LABEL: {{^}}s_xor_to_xnor_imm0: -; SI: s_xnor_b32 s{{[0-9]+}}, s{{[0-9]+}}, 50 define amdgpu_kernel void @s_xor_to_xnor_imm0(ptr addrspace(1) %out, i32 %in) { +; SI-LABEL: s_xor_to_xnor_imm0: +; SI: ; %bb.0: +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_xnor_b32 s4, s4, 50 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm %x = xor i32 -51, %in store i32 %x, ptr addrspace(1) %out ret void diff --git a/llvm/test/CodeGen/AMDGPU/dpp_combine_gfx11.mir b/llvm/test/CodeGen/AMDGPU/dpp_combine_gfx11.mir index 1151bde02ef62c360c539b49770b6e54d12b8ce8..41b61f2e09a3d326362441d34f9d7ce61259b06b 100644 --- a/llvm/test/CodeGen/AMDGPU/dpp_combine_gfx11.mir +++ b/llvm/test/CodeGen/AMDGPU/dpp_combine_gfx11.mir @@ -82,9 +82,9 @@ body: | # Regression test for src_modifiers on base u16 opcode # GCN-LABEL: name: vop3_u16 -# GCN: %5:vgpr_32 = V_ADD_NC_U16_e64_dpp %3, 0, %1, 0, %3, 0, 0, 1, 15, 15, 1, implicit $exec -# GCN: %7:vgpr_32 = V_ADD_NC_U16_e64_dpp %3, 1, %5, 2, %5, 0, 0, 1, 15, 15, 1, implicit $exec -# GCN: %9:vgpr_32 = V_ADD_NC_U16_e64 4, 
%8, 8, %7, 0, 0, implicit $exec +# GCN: %5:vgpr_32 = V_ADD_NC_U16_fake16_e64_dpp %3, 0, %1, 0, %3, 0, 0, 1, 15, 15, 1, implicit $exec +# GCN: %7:vgpr_32 = V_ADD_NC_U16_fake16_e64_dpp %3, 1, %5, 2, %5, 0, 0, 1, 15, 15, 1, implicit $exec +# GCN: %9:vgpr_32 = V_ADD_NC_U16_fake16_e64 4, %8, 8, %7, 0, 0, implicit $exec name: vop3_u16 tracksRegLiveness: true body: | @@ -96,11 +96,11 @@ body: | %2:vgpr_32 = COPY $vgpr2 %3:vgpr_32 = IMPLICIT_DEF %4:vgpr_32 = V_MOV_B32_dpp %3, %1, 1, 15, 15, 1, implicit $exec - %5:vgpr_32 = V_ADD_NC_U16_e64 0, %4, 0, %3, 0, 0, implicit $exec + %5:vgpr_32 = V_ADD_NC_U16_fake16_e64 0, %4, 0, %3, 0, 0, implicit $exec %6:vgpr_32 = V_MOV_B32_dpp %3, %5, 1, 15, 15, 1, implicit $exec - %7:vgpr_32 = V_ADD_NC_U16_e64 1, %6, 2, %5, 0, 0, implicit $exec + %7:vgpr_32 = V_ADD_NC_U16_fake16_e64 1, %6, 2, %5, 0, 0, implicit $exec %8:vgpr_32 = V_MOV_B32_dpp %3, %7, 1, 15, 15, 1, implicit $exec - %9:vgpr_32 = V_ADD_NC_U16_e64 4, %8, 8, %7, 0, 0, implicit $exec + %9:vgpr_32 = V_ADD_NC_U16_fake16_e64 4, %8, 8, %7, 0, 0, implicit $exec ... name: vop3p @@ -880,11 +880,11 @@ body: | # Check op_sel is all 0s when combining # GCN-LABEL: name: opsel_vop3 -# GCN: %4:vgpr_32 = V_ADD_I16_e64_dpp %2, 0, %0, 0, %1, 0, 0, 1, 15, 15, 1, implicit $exec -# GCN: %6:vgpr_32 = V_ADD_I16_e64 4, %5, 0, %1, 0, 0, implicit $exec -# GCN: %8:vgpr_32 = V_ADD_I16_e64 0, %7, 4, %1, 0, 0, implicit $exec -# GCN: %10:vgpr_32 = V_ADD_I16_e64 4, %9, 4, %1, 0, 0, implicit $exec -# GCN: %12:vgpr_32 = V_ADD_I16_e64 8, %11, 0, %1, 0, 0, implicit $exec +# GCN: %4:vgpr_32 = V_ADD_I16_fake16_e64_dpp %2, 0, %0, 0, %1, 0, 0, 1, 15, 15, 1, implicit $exec +# GCN: %6:vgpr_32 = V_ADD_I16_fake16_e64 4, %5, 0, %1, 0, 0, implicit $exec +# GCN: %8:vgpr_32 = V_ADD_I16_fake16_e64 0, %7, 4, %1, 0, 0, implicit $exec +# GCN: %10:vgpr_32 = V_ADD_I16_fake16_e64 4, %9, 4, %1, 0, 0, implicit $exec +# GCN: %12:vgpr_32 = V_ADD_I16_fake16_e64 8, %11, 0, %1, 0, 0, implicit $exec name: opsel_vop3 tracksRegLiveness: true body: | @@ -897,23 +897,23 @@ body: | ; Combine for op_sel:[0,0,0] %3:vgpr_32 = V_MOV_B32_dpp %2, %0, 1, 15, 15, 1, implicit $exec - %4:vgpr_32 = V_ADD_I16_e64 0, %3, 0, %1, 0, 0, implicit $exec + %4:vgpr_32 = V_ADD_I16_fake16_e64 0, %3, 0, %1, 0, 0, implicit $exec ; Do not combine for op_sel:[1,0,0] %5:vgpr_32 = V_MOV_B32_dpp %2, %0, 1, 15, 15, 1, implicit $exec - %6:vgpr_32 = V_ADD_I16_e64 4, %5, 0, %1, 0, 0, implicit $exec + %6:vgpr_32 = V_ADD_I16_fake16_e64 4, %5, 0, %1, 0, 0, implicit $exec ; Do not combine for op_sel:[0,1,0] %7:vgpr_32 = V_MOV_B32_dpp %2, %0, 1, 15, 15, 1, implicit $exec - %8:vgpr_32 = V_ADD_I16_e64 0, %7, 4, %1, 0, 0, implicit $exec + %8:vgpr_32 = V_ADD_I16_fake16_e64 0, %7, 4, %1, 0, 0, implicit $exec ; Do not combine for op_sel:[1,1,0] %9:vgpr_32 = V_MOV_B32_dpp %2, %0, 1, 15, 15, 1, implicit $exec - %10:vgpr_32 = V_ADD_I16_e64 4, %9, 4, %1, 0, 0, implicit $exec + %10:vgpr_32 = V_ADD_I16_fake16_e64 4, %9, 4, %1, 0, 0, implicit $exec ; Do not combine for op_sel:[0,0,1] (dst_op_sel only) %11:vgpr_32 = V_MOV_B32_dpp %2, %0, 1, 15, 15, 1, implicit $exec - %12:vgpr_32 = V_ADD_I16_e64 8, %11, 0, %1, 0, 0, implicit $exec + %12:vgpr_32 = V_ADD_I16_fake16_e64 8, %11, 0, %1, 0, 0, implicit $exec ... 
# Check op_sel is all 0s and op_sel_hi is all 1s when combining diff --git a/llvm/test/CodeGen/AMDGPU/fabs.f64.ll b/llvm/test/CodeGen/AMDGPU/fabs.f64.ll index 32d5fa6e72d7918ae6cad252153f21168d487d89..f98124fe2ed73195ea500b87fc46c328a7aa8ab4 100644 --- a/llvm/test/CodeGen/AMDGPU/fabs.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/fabs.f64.ll @@ -1,4 +1,5 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone @@ -7,10 +8,25 @@ declare double @llvm.fabs.f64(double) readnone declare <2 x double> @llvm.fabs.v2f64(<2 x double>) readnone declare <4 x double> @llvm.fabs.v4f64(<4 x double>) readnone -; FUNC-LABEL: {{^}}v_fabs_f64: -; SI: v_and_b32 -; SI: s_endpgm define amdgpu_kernel void @v_fabs_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) { +; SI-LABEL: v_fabs_f64: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[8:11], 0 addr64 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %tidext = sext i32 %tid to i64 %gep = getelementptr double, ptr addrspace(1) %in, i64 %tidext @@ -20,75 +36,148 @@ define amdgpu_kernel void @v_fabs_f64(ptr addrspace(1) %out, ptr addrspace(1) %i ret void } -; FUNC-LABEL: {{^}}fabs_f64: -; SI: s_bitset0_b32 -; SI: s_endpgm define amdgpu_kernel void @fabs_f64(ptr addrspace(1) %out, double %in) { +; SI-LABEL: fabs_f64: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_bitset0_b32 s3, 31 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: v_mov_b32_e32 v0, s2 +; SI-NEXT: v_mov_b32_e32 v1, s3 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: s_endpgm %fabs = call double @llvm.fabs.f64(double %in) store double %fabs, ptr addrspace(1) %out ret void } -; FUNC-LABEL: {{^}}fabs_v2f64: -; SI: s_bitset0_b32 -; SI: s_bitset0_b32 -; SI: s_endpgm define amdgpu_kernel void @fabs_v2f64(ptr addrspace(1) %out, <2 x double> %in) { +; SI-LABEL: fabs_v2f64: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_bitset0_b32 s7, 31 +; SI-NEXT: s_bitset0_b32 s5, 31 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: s_endpgm %fabs = call <2 x double> @llvm.fabs.v2f64(<2 x double> %in) store <2 x double> %fabs, ptr addrspace(1) %out ret void } -; FUNC-LABEL: {{^}}fabs_v4f64: -; SI: s_bitset0_b32 -; SI: s_bitset0_b32 -; SI: s_bitset0_b32 -; SI: s_bitset0_b32 -; SI: s_endpgm define amdgpu_kernel void 
@fabs_v4f64(ptr addrspace(1) %out, <4 x double> %in) { +; SI-LABEL: fabs_v4f64: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x11 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_bitset0_b32 s7, 31 +; SI-NEXT: s_bitset0_b32 s11, 31 +; SI-NEXT: s_bitset0_b32 s9, 31 +; SI-NEXT: s_bitset0_b32 s5, 31 +; SI-NEXT: v_mov_b32_e32 v0, s8 +; SI-NEXT: v_mov_b32_e32 v2, s10 +; SI-NEXT: v_mov_b32_e32 v4, s4 +; SI-NEXT: v_mov_b32_e32 v6, s6 +; SI-NEXT: v_mov_b32_e32 v1, s9 +; SI-NEXT: v_mov_b32_e32 v3, s11 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 +; SI-NEXT: v_mov_b32_e32 v5, s5 +; SI-NEXT: v_mov_b32_e32 v7, s7 +; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 +; SI-NEXT: s_endpgm %fabs = call <4 x double> @llvm.fabs.v4f64(<4 x double> %in) store <4 x double> %fabs, ptr addrspace(1) %out ret void } -; SI-LABEL: {{^}}fabs_fold_f64: -; SI: s_load_dwordx2 [[ABS_VALUE:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x13 -; SI-NOT: and -; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\]}}, |[[ABS_VALUE]]|, {{v\[[0-9]+:[0-9]+\]}} -; SI: s_endpgm define amdgpu_kernel void @fabs_fold_f64(ptr addrspace(1) %out, [8 x i32], double %in0, [8 x i32], double %in1) { +; SI-LABEL: fabs_fold_f64: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x1d +; SI-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mul_f64 v[0:1], |s[6:7]|, v[0:1] +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: s_endpgm %fabs = call double @llvm.fabs.f64(double %in0) %fmul = fmul double %fabs, %in1 store double %fmul, ptr addrspace(1) %out ret void } -; SI-LABEL: {{^}}fabs_fn_fold_f64: -; SI: s_load_dwordx2 [[ABS_VALUE:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x13 -; SI-NOT: and -; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\]}}, |[[ABS_VALUE]]|, {{v\[[0-9]+:[0-9]+\]}} -; SI: s_endpgm define amdgpu_kernel void @fabs_fn_fold_f64(ptr addrspace(1) %out, [8 x i32], double %in0, [8 x i32], double %in1) { +; SI-LABEL: fabs_fn_fold_f64: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x1d +; SI-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mul_f64 v[0:1], |s[6:7]|, v[0:1] +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: s_endpgm %fabs = call double @fabs(double %in0) %fmul = fmul double %fabs, %in1 store double %fmul, ptr addrspace(1) %out ret void } -; FUNC-LABEL: {{^}}fabs_free_f64: -; SI: s_bitset0_b32 -; SI: s_endpgm define amdgpu_kernel void @fabs_free_f64(ptr addrspace(1) %out, i64 %in) { +; SI-LABEL: fabs_free_f64: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_bitset0_b32 s3, 31 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: v_mov_b32_e32 v0, s2 +; SI-NEXT: v_mov_b32_e32 v1, s3 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: s_endpgm %bc= bitcast i64 %in to double %fabs = call double @llvm.fabs.f64(double %bc) store double %fabs, ptr addrspace(1) %out ret void } -; 
FUNC-LABEL: {{^}}fabs_fn_free_f64: -; SI: s_bitset0_b32 -; SI: s_endpgm define amdgpu_kernel void @fabs_fn_free_f64(ptr addrspace(1) %out, i64 %in) { +; SI-LABEL: fabs_fn_free_f64: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_bitset0_b32 s3, 31 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: v_mov_b32_e32 v0, s2 +; SI-NEXT: v_mov_b32_e32 v1, s3 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: s_endpgm %bc= bitcast i64 %in to double %fabs = call double @fabs(double %bc) store double %fabs, ptr addrspace(1) %out diff --git a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-fake16.mir b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-fake16.mir index 265bdd0cf2f48ffdc62942b0b0c442fd23fc66df..30a24c675a76b681f897bbba600e29df14c8558f 100644 --- a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-fake16.mir +++ b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-fake16.mir @@ -1,6 +1,29 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 3 # RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -run-pass=si-fix-sgpr-copies -verify-machineinstrs -o - %s | FileCheck --check-prefixes=GCN %s +# V_CMP_LT_F16 will be replaced with fake16 when its true16/fake16 profile is corrected + +--- +name: cmp_f16 +body: | + bb.0.entry: + ; GCN-LABEL: name: cmp_f16 + ; GCN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF + ; GCN-NEXT: [[V_CVT_F16_U16_fake16_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F16_U16_fake16_e64 [[DEF]], 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: [[DEF2:%[0-9]+]]:sreg_32 = IMPLICIT_DEF + ; GCN-NEXT: [[V_CMP_LT_F16_t16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_LT_F16_t16_e64 0, [[V_CVT_F16_U16_fake16_e64_]], 0, [[DEF1]], 0, implicit $mode, implicit $exec + ; GCN-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, -1, killed [[V_CMP_LT_F16_t16_e64_]], implicit $exec + %0:vgpr_32 = IMPLICIT_DEF + %1:sreg_32 = IMPLICIT_DEF + %2:vgpr_32 = V_CVT_F16_U16_fake16_e64 %0:vgpr_32, 0, 0, implicit $mode, implicit $exec + %3:sreg_32 = COPY %2:vgpr_32 + nofpexcept S_CMP_LT_F16 killed %3:sreg_32, %1:sreg_32, implicit-def $scc, implicit $mode + %4:sreg_32_xm0_xexec = COPY $scc + %5:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, -1, killed %4, implicit $exec +... + +# Needs extra shift instruction to select hi 16 bits --- name: cvt_hi_f32_f16 body: | diff --git a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-true16.mir b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-true16.mir index 03a77dc2b8b5e2a0302ac00ba6cab522a7eac266..4604518d71c961b776bb6b8bd81bf1e6b068982f 100644 --- a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-true16.mir +++ b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-true16.mir @@ -1,20 +1,39 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 3 # RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -run-pass=si-fix-sgpr-copies -verify-machineinstrs -o - %s | FileCheck --check-prefixes=GCN %s -# XFAIL: * -# FIXME-TRUE16. 
reenable after CVT_F16_U16_t16 is supported in CodeGen +# + +--- +name: cmp_f16 +body: | + bb.0.entry: + ; GCN-LABEL: name: cmp_f16 + ; GCN: [[DEF:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF + ; GCN-NEXT: [[V_CVT_F16_U16_t16_e64_:%[0-9]+]]:vgpr_16 = V_CVT_F16_U16_t16_e64 0, [[DEF]], 0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_CVT_F16_U16_t16_e64_]] + ; GCN-NEXT: [[V_CMP_LT_F16_t16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_LT_F16_t16_e64 0, killed [[COPY]], 0, [[DEF1]], 0, implicit $mode, implicit $exec + ; GCN-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, -1, killed [[V_CMP_LT_F16_t16_e64_]], implicit $exec + %0:vgpr_16 = IMPLICIT_DEF + %1:sreg_32 = IMPLICIT_DEF + %2:vgpr_16 = V_CVT_F16_U16_t16_e64 0, %0:vgpr_16, 0, 0, 0, implicit $mode, implicit $exec + %3:sreg_32 = COPY %2:vgpr_16 + nofpexcept S_CMP_LT_F16 killed %3:sreg_32, %1:sreg_32, implicit-def $scc, implicit $mode + %4:sreg_32_xm0_xexec = COPY $scc + %5:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, -1, killed %4, implicit $exec +... --- name: cvt_hi_f32_f16 body: | bb.0: ; GCN-LABEL: name: cvt_hi_f32_f16 - ; GCN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; GCN-NEXT: [[V_CVT_F16_U16_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F16_U16_e64 [[DEF]], 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF - ; GCN-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, [[V_CVT_F16_U16_e64_]], implicit $exec - ; GCN-NEXT: [[V_CVT_F32_F16_t16_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F32_F16_t16_e64 0, [[V_LSHRREV_B32_e64_]], 0, 0, implicit $mode, implicit $exec + ; GCN: [[DEF:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF + ; GCN-NEXT: [[V_CVT_F16_U16_t16_e64_:%[0-9]+]]:vgpr_16 = V_CVT_F16_U16_t16_e64 0, [[DEF]], 0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_CVT_F16_U16_t16_e64_]] + ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]] + ; GCN-NEXT: [[V_CVT_F32_F16_t16_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F32_F16_t16_e64 0, [[COPY1]].hi16, 0, 0, 0, implicit $mode, implicit $exec %0:vgpr_16 = IMPLICIT_DEF - %1:vgpr_16 = V_CVT_F16_U16_t16_e64 %0:vgpr_16, 0, 0, 0, implicit $mode, implicit $exec + %1:vgpr_16 = V_CVT_F16_U16_t16_e64 0, %0:vgpr_16, 0, 0, 0, implicit $mode, implicit $exec %2:sreg_32 = COPY %1:vgpr_16 %3:sreg_32 = S_CVT_HI_F32_F16 %2:sreg_32, implicit $mode ... 
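# Illustrative sketch, not part of the autogenerated checks: both the fake16
# and true16 cmp_f16 tests above exercise the same si-fix-sgpr-copies rewrite.
# A scalar f16 compare fed by a VGPR-defined value, roughly
#   %3:sreg_32 = COPY %2:vgpr_32
#   nofpexcept S_CMP_LT_F16 killed %3:sreg_32, %1:sreg_32, implicit-def $scc, implicit $mode
#   %4:sreg_32_xm0_xexec = COPY $scc
# cannot stay on the SALU side, so the pass moves it to the VALU as
# V_CMP_LT_F16_t16_e64 and feeds the resulting SGPR mask straight into the
# V_CNDMASK_B32_e64 user, removing the COPY through $scc. The two variants
# differ only in the register class holding the f16 value (vgpr_16 vs. vgpr_32).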
diff --git a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16.mir b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16.mir index 9a727a321d7869b8b2865e3df0675e2c23a9ada4..e8291f7ab8f729b80a55c34cde4033152ebbfa74 100644 --- a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16.mir +++ b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16.mir @@ -2,26 +2,6 @@ # RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -run-pass=si-fix-sgpr-copies -verify-machineinstrs -o - %s | FileCheck --check-prefixes=GCN,REAL16 %s # RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -run-pass=si-fix-sgpr-copies -verify-machineinstrs -o - %s | FileCheck --check-prefixes=GCN,FAKE16 %s ---- -name: cmp_f16 -body: | - bb.0.entry: - ; GCN-LABEL: name: cmp_f16 - ; GCN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; GCN-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF - ; GCN-NEXT: [[V_CVT_F16_U16_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F16_U16_e64 [[DEF]], 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: [[DEF2:%[0-9]+]]:sreg_32 = IMPLICIT_DEF - ; GCN-NEXT: [[V_CMP_LT_F16_t16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_LT_F16_t16_e64 0, [[V_CVT_F16_U16_e64_]], 0, [[DEF1]], 0, implicit $mode, implicit $exec - ; GCN-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, -1, killed [[V_CMP_LT_F16_t16_e64_]], implicit $exec - %0:vgpr_32 = IMPLICIT_DEF - %1:sreg_32 = IMPLICIT_DEF - %2:vgpr_32 = V_CVT_F16_U16_e64 %0:vgpr_32, 0, 0, implicit $mode, implicit $exec - %3:sreg_32 = COPY %2:vgpr_32 - nofpexcept S_CMP_LT_F16 killed %3:sreg_32, %1:sreg_32, implicit-def $scc, implicit $mode - %4:sreg_32_xm0_xexec = COPY $scc - %5:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, -1, killed %4, implicit $exec -... - --- name: fmac_f16 body: | diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomic-fadd.f64.ll b/llvm/test/CodeGen/AMDGPU/flat-atomic-fadd.f64.ll index e5dcf9ce309cd8d4ec5a463fd8b301f42986654a..32cb1056022de2677c755315d6788021427a7bac 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomic-fadd.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomic-fadd.f64.ll @@ -77,6 +77,29 @@ define amdgpu_ps void @flat_atomic_fadd_f64_no_rtn_atomicrmw(ptr %ptr, double %d ret void } +define amdgpu_ps void @flat_atomic_fadd_f64_no_rtn_atomicrmw_noprivate(ptr %ptr, double %data) #0 { + ; GFX90A_GFX940-LABEL: name: flat_atomic_fadd_f64_no_rtn_atomicrmw_noprivate + ; GFX90A_GFX940: bb.0 (%ir-block.0): + ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX90A_GFX940-NEXT: {{ $}} + ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A_GFX940-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A_GFX940-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; GFX90A_GFX940-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A_GFX940-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] + ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] + ; GFX90A_GFX940-NEXT: FLAT_ATOMIC_ADD_F64 killed [[COPY4]], killed [[COPY5]], 0, 0, implicit $exec, implicit $flat_scr :: (load store syncscope("wavefront") 
monotonic (s64) on %ir.ptr) + ; GFX90A_GFX940-NEXT: S_ENDPGM 0 + %ret = atomicrmw fadd ptr %ptr, double %data syncscope("wavefront") monotonic, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 + ret void +} + define amdgpu_ps double @flat_atomic_fadd_f64_rtn_atomicrmw(ptr %ptr, double %data) #0 { ; GFX90A_GFX940-LABEL: name: flat_atomic_fadd_f64_rtn_atomicrmw ; GFX90A_GFX940: bb.0 (%ir-block.0): @@ -104,8 +127,36 @@ define amdgpu_ps double @flat_atomic_fadd_f64_rtn_atomicrmw(ptr %ptr, double %da ret double %ret } +define amdgpu_ps double @flat_atomic_fadd_f64_rtn_atomicrmw__noprivate(ptr %ptr, double %data) #0 { + ; GFX90A_GFX940-LABEL: name: flat_atomic_fadd_f64_rtn_atomicrmw__noprivate + ; GFX90A_GFX940: bb.0 (%ir-block.0): + ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX90A_GFX940-NEXT: {{ $}} + ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A_GFX940-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A_GFX940-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; GFX90A_GFX940-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A_GFX940-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] + ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] + ; GFX90A_GFX940-NEXT: [[FLAT_ATOMIC_ADD_F64_RTN:%[0-9]+]]:vreg_64_align2 = FLAT_ATOMIC_ADD_F64_RTN killed [[COPY4]], killed [[COPY5]], 0, 1, implicit $exec, implicit $flat_scr :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr) + ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[FLAT_ATOMIC_ADD_F64_RTN]].sub0 + ; GFX90A_GFX940-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[FLAT_ATOMIC_ADD_F64_RTN]].sub1 + ; GFX90A_GFX940-NEXT: $sgpr0 = COPY [[COPY6]] + ; GFX90A_GFX940-NEXT: $sgpr1 = COPY [[COPY7]] + ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1 + %ret = atomicrmw fadd ptr %ptr, double %data syncscope("wavefront") monotonic, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 + ret double %ret +} + declare double @llvm.amdgcn.flat.atomic.fadd.f64.p1.f64(ptr, double) attributes #0 = { nounwind } !0 = !{} +!1 = !{i32 5, i32 6} diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_noprivate.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_noprivate.ll new file mode 100644 index 0000000000000000000000000000000000000000..64bd4804ccd5191ec85a1d6d00925019d9e22ba8 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_noprivate.ll @@ -0,0 +1,6804 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=amdgcn -mcpu=bonaire < %s | FileCheck -check-prefix=GFX7 %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=GFX8 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s + +define amdgpu_kernel void @atomic_add_i64_offset(ptr %out, i64 %in) { +; GFX7-LABEL: atomic_add_i64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; 
GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_add_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_add_i64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_add_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_add_i64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_atomic_add_u64 v[0:1], v[2:3] offset:32 scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw volatile add ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + ret void +} + +define amdgpu_kernel void @atomic_add_i64_ret_offset(ptr %out, ptr %out2, i64 %in) { +; GFX7-LABEL: atomic_add_i64_ret_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_add_i64_ret_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_add_i64_ret_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: flat_atomic_add_u64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: 
flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw volatile add ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + store i64 %tmp0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_add_i64_addr64_offset(ptr %out, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_add_i64_addr64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX7-NEXT: s_add_u32 s0, s4, s0 +; GFX7-NEXT: s_addc_u32 s1, s5, s1 +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_add_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_add_i64_addr64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX8-NEXT: s_add_u32 s0, s4, s0 +; GFX8-NEXT: s_addc_u32 s1, s5, s1 +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_add_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_add_i64_addr64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_add_u64 v[2:3], v[0:1] offset:32 scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 + %tmp0 = atomicrmw volatile add ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + ret void +} + +define amdgpu_kernel void @atomic_add_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_add_i64_ret_addr64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s4 +; GFX7-NEXT: s_addc_u32 s1, s1, s5 +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_add_i64_ret_addr64_offset: +; GFX8: ; %bb.0: ; %entry +; 
GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s4 +; GFX8-NEXT: s_addc_u32 s1, s1, s5 +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_add_i64_ret_addr64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_add_u64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 + %tmp0 = atomicrmw volatile add ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + store i64 %tmp0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_add_i64(ptr %out, i64 %in) { +; GFX7-LABEL: atomic_add_i64: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_add_i64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_add_i64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_atomic_add_u64 v[0:1], v[2:3] scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm +entry: + %tmp0 = atomicrmw volatile add ptr %out, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + ret void +} + +define amdgpu_kernel void @atomic_add_i64_ret(ptr %out, ptr %out2, i64 %in) { +; GFX7-LABEL: atomic_add_i64_ret: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; 
GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: v_mov_b32_e32 v3, s7 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_add_i64_ret: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_mov_b32_e32 v2, s6 +; GFX8-NEXT: v_mov_b32_e32 v3, s7 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_add_i64_ret: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: flat_atomic_add_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %tmp0 = atomicrmw volatile add ptr %out, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + store i64 %tmp0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_add_i64_addr64(ptr %out, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_add_i64_addr64: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX7-NEXT: s_add_u32 s0, s4, s0 +; GFX7-NEXT: s_addc_u32 s1, s5, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_add_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_add_i64_addr64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX8-NEXT: s_add_u32 s0, s4, s0 +; GFX8-NEXT: s_addc_u32 s1, s5, s1 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_add_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_add_i64_addr64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_add_u64 
v[2:3], v[0:1] scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %tmp0 = atomicrmw volatile add ptr %ptr, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + ret void +} + +define amdgpu_kernel void @atomic_add_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_add_i64_ret_addr64: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s4 +; GFX7-NEXT: s_addc_u32 s1, s1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_add_i64_ret_addr64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s4 +; GFX8-NEXT: s_addc_u32 s1, s1, s5 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_add_i64_ret_addr64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_add_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %tmp0 = atomicrmw volatile add ptr %ptr, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + store i64 %tmp0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_and_i64_offset(ptr %out, i64 %in) { +; GFX7-LABEL: atomic_and_i64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_and_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_and_i64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; 
GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_and_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_and_i64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_atomic_and_b64 v[0:1], v[2:3] offset:32 scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw volatile and ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + ret void +} + +define amdgpu_kernel void @atomic_and_i64_ret_offset(ptr %out, ptr %out2, i64 %in) { +; GFX7-LABEL: atomic_and_i64_ret_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_and_i64_ret_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_and_i64_ret_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: flat_atomic_and_b64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw volatile and ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + store i64 %tmp0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_and_i64_addr64_offset(ptr %out, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_and_i64_addr64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; 
GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX7-NEXT: s_add_u32 s0, s4, s0 +; GFX7-NEXT: s_addc_u32 s1, s5, s1 +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_and_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_and_i64_addr64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX8-NEXT: s_add_u32 s0, s4, s0 +; GFX8-NEXT: s_addc_u32 s1, s5, s1 +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_and_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_and_i64_addr64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_and_b64 v[2:3], v[0:1] offset:32 scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 + %tmp0 = atomicrmw volatile and ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + ret void +} + +define amdgpu_kernel void @atomic_and_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_and_i64_ret_addr64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s4 +; GFX7-NEXT: s_addc_u32 s1, s1, s5 +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_and_i64_ret_addr64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s4 +; GFX8-NEXT: s_addc_u32 s1, s1, s5 +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; 
GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_and_i64_ret_addr64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_and_b64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 + %tmp0 = atomicrmw volatile and ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + store i64 %tmp0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_and_i64(ptr %out, i64 %in) { +; GFX7-LABEL: atomic_and_i64: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_and_i64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_and_i64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_atomic_and_b64 v[0:1], v[2:3] scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm +entry: + %tmp0 = atomicrmw volatile and ptr %out, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + ret void +} + +define amdgpu_kernel void @atomic_and_i64_ret(ptr %out, ptr %out2, i64 %in) { +; GFX7-LABEL: atomic_and_i64_ret: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: v_mov_b32_e32 v3, s7 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_and_i64_ret: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; 
GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_mov_b32_e32 v2, s6 +; GFX8-NEXT: v_mov_b32_e32 v3, s7 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_and_i64_ret: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: flat_atomic_and_b64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %tmp0 = atomicrmw volatile and ptr %out, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + store i64 %tmp0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_and_i64_addr64(ptr %out, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_and_i64_addr64: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX7-NEXT: s_add_u32 s0, s4, s0 +; GFX7-NEXT: s_addc_u32 s1, s5, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_and_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_and_i64_addr64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX8-NEXT: s_add_u32 s0, s4, s0 +; GFX8-NEXT: s_addc_u32 s1, s5, s1 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_and_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_and_i64_addr64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_and_b64 v[2:3], v[0:1] scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %tmp0 = atomicrmw volatile and ptr %ptr, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + ret void +} + +define amdgpu_kernel void @atomic_and_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_and_i64_ret_addr64: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: 
v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s4 +; GFX7-NEXT: s_addc_u32 s1, s1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_and_i64_ret_addr64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s4 +; GFX8-NEXT: s_addc_u32 s1, s1, s5 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_and_i64_ret_addr64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_and_b64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %tmp0 = atomicrmw volatile and ptr %ptr, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + store i64 %tmp0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_sub_i64_offset(ptr %out, i64 %in) { +; GFX7-LABEL: atomic_sub_i64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_sub_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_sub_i64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_sub_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_sub_i64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_atomic_sub_u64 v[0:1], v[2:3] 
offset:32 scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw volatile sub ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + ret void +} + +define amdgpu_kernel void @atomic_sub_i64_ret_offset(ptr %out, ptr %out2, i64 %in) { +; GFX7-LABEL: atomic_sub_i64_ret_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_sub_i64_ret_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_sub_i64_ret_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: flat_atomic_sub_u64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw volatile sub ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + store i64 %tmp0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_sub_i64_addr64_offset(ptr %out, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_sub_i64_addr64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX7-NEXT: s_add_u32 s0, s4, s0 +; GFX7-NEXT: s_addc_u32 s1, s5, s1 +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_sub_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_sub_i64_addr64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: 
s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX8-NEXT: s_add_u32 s0, s4, s0 +; GFX8-NEXT: s_addc_u32 s1, s5, s1 +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_sub_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_sub_i64_addr64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_sub_u64 v[2:3], v[0:1] offset:32 scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 + %tmp0 = atomicrmw volatile sub ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + ret void +} + +define amdgpu_kernel void @atomic_sub_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_sub_i64_ret_addr64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s4 +; GFX7-NEXT: s_addc_u32 s1, s1, s5 +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_sub_i64_ret_addr64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s4 +; GFX8-NEXT: s_addc_u32 s1, s1, s5 +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_sub_i64_ret_addr64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_sub_u64 v[0:1], 
v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 + %tmp0 = atomicrmw volatile sub ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + store i64 %tmp0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_sub_i64(ptr %out, i64 %in) { +; GFX7-LABEL: atomic_sub_i64: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_sub_i64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_sub_i64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_atomic_sub_u64 v[0:1], v[2:3] scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm +entry: + %tmp0 = atomicrmw volatile sub ptr %out, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + ret void +} + +define amdgpu_kernel void @atomic_sub_i64_ret(ptr %out, ptr %out2, i64 %in) { +; GFX7-LABEL: atomic_sub_i64_ret: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: v_mov_b32_e32 v3, s7 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_sub_i64_ret: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_mov_b32_e32 v2, s6 +; GFX8-NEXT: v_mov_b32_e32 v3, s7 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_sub_i64_ret: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; 
GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: flat_atomic_sub_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %tmp0 = atomicrmw volatile sub ptr %out, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + store i64 %tmp0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_sub_i64_addr64(ptr %out, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_sub_i64_addr64: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX7-NEXT: s_add_u32 s0, s4, s0 +; GFX7-NEXT: s_addc_u32 s1, s5, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_sub_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_sub_i64_addr64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX8-NEXT: s_add_u32 s0, s4, s0 +; GFX8-NEXT: s_addc_u32 s1, s5, s1 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_sub_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_sub_i64_addr64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_sub_u64 v[2:3], v[0:1] scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %tmp0 = atomicrmw volatile sub ptr %ptr, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + ret void +} + +define amdgpu_kernel void @atomic_sub_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_sub_i64_ret_addr64: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s4 +; GFX7-NEXT: s_addc_u32 s1, s1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_sub_i64_ret_addr64: +; GFX8: ; 
%bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s4 +; GFX8-NEXT: s_addc_u32 s1, s1, s5 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_sub_i64_ret_addr64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_sub_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %tmp0 = atomicrmw volatile sub ptr %ptr, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + store i64 %tmp0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_max_i64_offset(ptr %out, i64 %in) { +; GFX7-LABEL: atomic_max_i64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_smax_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_max_i64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_smax_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_max_i64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_atomic_max_i64 v[0:1], v[2:3] offset:32 scope:SCOPE_SE +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw volatile max ptr %gep, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 + ret void +} + +define amdgpu_kernel void @atomic_max_i64_ret_offset(ptr %out, ptr %out2, i64 %in) { +; GFX7-LABEL: atomic_max_i64_ret_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: s_add_u32 s0, s0, 
32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_max_i64_ret_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_max_i64_ret_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: flat_atomic_max_i64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw volatile max ptr %gep, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 + store i64 %tmp0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_max_i64_addr64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX7-NEXT: s_add_u32 s0, s4, s0 +; GFX7-NEXT: s_addc_u32 s1, s5, s1 +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_smax_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_max_i64_addr64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX8-NEXT: s_add_u32 s0, s4, s0 +; GFX8-NEXT: s_addc_u32 s1, s5, s1 +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_smax_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_max_i64_addr64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: 
s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_max_i64 v[2:3], v[0:1] offset:32 scope:SCOPE_SE +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 + %tmp0 = atomicrmw volatile max ptr %gep, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 + ret void +} + +define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_max_i64_ret_addr64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s4 +; GFX7-NEXT: s_addc_u32 s1, s1, s5 +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_max_i64_ret_addr64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s4 +; GFX8-NEXT: s_addc_u32 s1, s1, s5 +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_max_i64_ret_addr64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_max_i64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 + %tmp0 = atomicrmw volatile max ptr %gep, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 + store i64 %tmp0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_max_i64(ptr %out, i64 %in) { +; GFX7-LABEL: atomic_max_i64: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: 
v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_max_i64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3] +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_max_i64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_atomic_max_i64 v[0:1], v[2:3] scope:SCOPE_SE +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: s_endpgm +entry: + %tmp0 = atomicrmw volatile max ptr %out, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 + ret void +} + +define amdgpu_kernel void @atomic_max_i64_ret(ptr %out, ptr %out2, i64 %in) { +; GFX7-LABEL: atomic_max_i64_ret: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: flat_atomic_smax_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: v_mov_b32_e32 v3, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_max_i64_ret: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: flat_atomic_smax_x2 v[0:1], v[0:1], v[2:3] glc +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v2, s6 +; GFX8-NEXT: v_mov_b32_e32 v3, s7 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_max_i64_ret: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: flat_atomic_max_i64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %tmp0 = atomicrmw volatile max ptr %out, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 + store i64 %tmp0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_max_i64_addr64: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX7-NEXT: s_add_u32 s0, s4, s0 +; GFX7-NEXT: s_addc_u32 s1, s5, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_smax_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_max_i64_addr64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX8-NEXT: s_add_u32 s0, s4, s0 +; GFX8-NEXT: s_addc_u32 s1, s5, s1 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_smax_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_max_i64_addr64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_max_i64 v[2:3], v[0:1] scope:SCOPE_SE +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %tmp0 = atomicrmw volatile max ptr %ptr, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 + ret void +} + +define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_max_i64_ret_addr64: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s4 +; GFX7-NEXT: s_addc_u32 s1, s1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_max_i64_ret_addr64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s4 +; GFX8-NEXT: s_addc_u32 s1, s1, s5 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_max_i64_ret_addr64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] +; GFX12-NEXT: 
s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_max_i64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %tmp0 = atomicrmw volatile max ptr %ptr, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 + store i64 %tmp0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_umax_i64_offset(ptr %out, i64 %in) { +; GFX7-LABEL: atomic_umax_i64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_umax_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_umax_i64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_umax_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_umax_i64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_atomic_max_u64 v[0:1], v[2:3] offset:32 scope:SCOPE_SE +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw volatile umax ptr %gep, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 + ret void +} + +define amdgpu_kernel void @atomic_umax_i64_ret_offset(ptr %out, ptr %out2, i64 %in) { +; GFX7-LABEL: atomic_umax_i64_ret_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_umax_i64_ret_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: 
v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_umax_i64_ret_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: flat_atomic_max_u64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw volatile umax ptr %gep, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 + store i64 %tmp0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_umax_i64_addr64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX7-NEXT: s_add_u32 s0, s4, s0 +; GFX7-NEXT: s_addc_u32 s1, s5, s1 +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_umax_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_umax_i64_addr64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX8-NEXT: s_add_u32 s0, s4, s0 +; GFX8-NEXT: s_addc_u32 s1, s5, s1 +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_umax_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_umax_i64_addr64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_max_u64 v[2:3], v[0:1] offset:32 scope:SCOPE_SE +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 + %tmp0 = atomicrmw volatile umax ptr %gep, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 + ret void +} + +define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_umax_i64_ret_addr64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: 
s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s4 +; GFX7-NEXT: s_addc_u32 s1, s1, s5 +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_umax_i64_ret_addr64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s4 +; GFX8-NEXT: s_addc_u32 s1, s1, s5 +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_umax_i64_ret_addr64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_max_u64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 + %tmp0 = atomicrmw volatile umax ptr %gep, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 + store i64 %tmp0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_umax_i64(ptr %out, i64 %in) { +; GFX7-LABEL: atomic_umax_i64: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_umax_i64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3] +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_umax_i64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_atomic_max_u64 v[0:1], v[2:3] scope:SCOPE_SE 
+; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: s_endpgm +entry: + %tmp0 = atomicrmw volatile umax ptr %out, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 + ret void +} + +define amdgpu_kernel void @atomic_umax_i64_ret(ptr %out, ptr %out2, i64 %in) { +; GFX7-LABEL: atomic_umax_i64_ret: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: flat_atomic_umax_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: v_mov_b32_e32 v3, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_umax_i64_ret: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: flat_atomic_umax_x2 v[0:1], v[0:1], v[2:3] glc +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v2, s6 +; GFX8-NEXT: v_mov_b32_e32 v3, s7 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_umax_i64_ret: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: flat_atomic_max_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %tmp0 = atomicrmw volatile umax ptr %out, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 + store i64 %tmp0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_umax_i64_addr64(ptr %out, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_umax_i64_addr64: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX7-NEXT: s_add_u32 s0, s4, s0 +; GFX7-NEXT: s_addc_u32 s1, s5, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_umax_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_umax_i64_addr64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX8-NEXT: s_add_u32 s0, s4, s0 +; GFX8-NEXT: s_addc_u32 s1, s5, s1 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_umax_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_endpgm 
+; +; GFX12-LABEL: atomic_umax_i64_addr64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_max_u64 v[2:3], v[0:1] scope:SCOPE_SE +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %tmp0 = atomicrmw volatile umax ptr %ptr, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 + ret void +} + +define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_umax_i64_ret_addr64: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s4 +; GFX7-NEXT: s_addc_u32 s1, s1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_umax_i64_ret_addr64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s4 +; GFX8-NEXT: s_addc_u32 s1, s1, s5 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_umax_i64_ret_addr64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_max_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %tmp0 = atomicrmw volatile umax ptr %ptr, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 + store i64 %tmp0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_min_i64_offset(ptr %out, i64 %in) { +; GFX7-LABEL: atomic_min_i64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: 
v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_smin_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_min_i64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_smin_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_min_i64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_atomic_min_i64 v[0:1], v[2:3] offset:32 scope:SCOPE_SE +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw volatile min ptr %gep, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 + ret void +} + +define amdgpu_kernel void @atomic_min_i64_ret_offset(ptr %out, ptr %out2, i64 %in) { +; GFX7-LABEL: atomic_min_i64_ret_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_min_i64_ret_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_min_i64_ret_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: flat_atomic_min_i64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw volatile min ptr %gep, i64 
%in syncscope("workgroup") seq_cst, !noalias.addrspace !0 + store i64 %tmp0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_min_i64_addr64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX7-NEXT: s_add_u32 s0, s4, s0 +; GFX7-NEXT: s_addc_u32 s1, s5, s1 +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_smin_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_min_i64_addr64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX8-NEXT: s_add_u32 s0, s4, s0 +; GFX8-NEXT: s_addc_u32 s1, s5, s1 +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_smin_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_min_i64_addr64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_min_i64 v[2:3], v[0:1] offset:32 scope:SCOPE_SE +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 + %tmp0 = atomicrmw volatile min ptr %gep, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 + ret void +} + +define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_min_i64_ret_addr64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s4 +; GFX7-NEXT: s_addc_u32 s1, s1, s5 +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_min_i64_ret_addr64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s4 +; 
GFX8-NEXT: s_addc_u32 s1, s1, s5 +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_min_i64_ret_addr64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_min_i64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 + %tmp0 = atomicrmw volatile min ptr %gep, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 + store i64 %tmp0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) { +; GFX7-LABEL: atomic_min_i64: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_min_i64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3] +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_min_i64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_atomic_min_i64 v[0:1], v[2:3] scope:SCOPE_SE +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: s_endpgm +entry: + %tmp0 = atomicrmw volatile min ptr %out, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 + ret void +} + +define amdgpu_kernel void @atomic_min_i64_ret(ptr %out, ptr %out2, i64 %in) { +; GFX7-LABEL: atomic_min_i64_ret: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: flat_atomic_smin_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: v_mov_b32_e32 v3, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: 
atomic_min_i64_ret: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: flat_atomic_smin_x2 v[0:1], v[0:1], v[2:3] glc +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v2, s6 +; GFX8-NEXT: v_mov_b32_e32 v3, s7 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_min_i64_ret: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: flat_atomic_min_i64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %tmp0 = atomicrmw volatile min ptr %out, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 + store i64 %tmp0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_min_i64_addr64(ptr %out, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_min_i64_addr64: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX7-NEXT: s_add_u32 s0, s4, s0 +; GFX7-NEXT: s_addc_u32 s1, s5, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_smin_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_min_i64_addr64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX8-NEXT: s_add_u32 s0, s4, s0 +; GFX8-NEXT: s_addc_u32 s1, s5, s1 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_smin_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_min_i64_addr64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_min_i64 v[2:3], v[0:1] scope:SCOPE_SE +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %tmp0 = atomicrmw volatile min ptr %ptr, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 + ret void +} + +define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) { +; 
GFX7-LABEL: atomic_min_i64_ret_addr64: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s4 +; GFX7-NEXT: s_addc_u32 s1, s1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_min_i64_ret_addr64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s4 +; GFX8-NEXT: s_addc_u32 s1, s1, s5 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_min_i64_ret_addr64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_min_i64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %tmp0 = atomicrmw volatile min ptr %ptr, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 + store i64 %tmp0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_umin_i64_offset(ptr %out, i64 %in) { +; GFX7-LABEL: atomic_umin_i64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_umin_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_umin_i64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_umin_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_umin_i64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: 
v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_atomic_min_u64 v[0:1], v[2:3] offset:32 scope:SCOPE_SE +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw volatile umin ptr %gep, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 + ret void +} + +define amdgpu_kernel void @atomic_umin_i64_ret_offset(ptr %out, ptr %out2, i64 %in) { +; GFX7-LABEL: atomic_umin_i64_ret_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_umin_i64_ret_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_umin_i64_ret_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: flat_atomic_min_u64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw volatile umin ptr %gep, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 + store i64 %tmp0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_umin_i64_addr64_offset(ptr %out, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_umin_i64_addr64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX7-NEXT: s_add_u32 s0, s4, s0 +; GFX7-NEXT: s_addc_u32 s1, s5, s1 +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_umin_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_umin_i64_addr64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[4:7], 
s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX8-NEXT: s_add_u32 s0, s4, s0 +; GFX8-NEXT: s_addc_u32 s1, s5, s1 +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_umin_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_umin_i64_addr64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_min_u64 v[2:3], v[0:1] offset:32 scope:SCOPE_SE +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 + %tmp0 = atomicrmw volatile umin ptr %gep, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 + ret void +} + +define amdgpu_kernel void @atomic_umin_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_umin_i64_ret_addr64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s4 +; GFX7-NEXT: s_addc_u32 s1, s1, s5 +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_umin_i64_ret_addr64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s4 +; GFX8-NEXT: s_addc_u32 s1, s1, s5 +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_umin_i64_ret_addr64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_min_u64 v[0:1], v[2:3], v[0:1] offset:32 
th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 + %tmp0 = atomicrmw volatile umin ptr %gep, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 + store i64 %tmp0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_umin_i64(ptr %out, i64 %in) { +; GFX7-LABEL: atomic_umin_i64: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_umin_i64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3] +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_umin_i64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_atomic_min_u64 v[0:1], v[2:3] scope:SCOPE_SE +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: s_endpgm +entry: + %tmp0 = atomicrmw volatile umin ptr %out, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 + ret void +} + +define amdgpu_kernel void @atomic_umin_i64_ret(ptr %out, ptr %out2, i64 %in) { +; GFX7-LABEL: atomic_umin_i64_ret: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: flat_atomic_umin_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: v_mov_b32_e32 v3, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_umin_i64_ret: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: flat_atomic_umin_x2 v[0:1], v[0:1], v[2:3] glc +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v2, s6 +; GFX8-NEXT: v_mov_b32_e32 v3, s7 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_umin_i64_ret: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 
v3, s1 +; GFX12-NEXT: flat_atomic_min_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %tmp0 = atomicrmw volatile umin ptr %out, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 + store i64 %tmp0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_umin_i64_addr64(ptr %out, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_umin_i64_addr64: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX7-NEXT: s_add_u32 s0, s4, s0 +; GFX7-NEXT: s_addc_u32 s1, s5, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_umin_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_umin_i64_addr64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX8-NEXT: s_add_u32 s0, s4, s0 +; GFX8-NEXT: s_addc_u32 s1, s5, s1 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_umin_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_umin_i64_addr64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_min_u64 v[2:3], v[0:1] scope:SCOPE_SE +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %tmp0 = atomicrmw volatile umin ptr %ptr, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 + ret void +} + +define amdgpu_kernel void @atomic_umin_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_umin_i64_ret_addr64: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s4 +; GFX7-NEXT: s_addc_u32 s1, s1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_umin_i64_ret_addr64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: s_lshl_b64 
s[4:5], s[6:7], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s4 +; GFX8-NEXT: s_addc_u32 s1, s1, s5 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_umin_i64_ret_addr64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_min_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %tmp0 = atomicrmw volatile umin ptr %ptr, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 + store i64 %tmp0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_or_i64_offset(ptr %out, i64 %in) { +; GFX7-LABEL: atomic_or_i64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_or_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_or_i64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_or_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_or_i64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_atomic_or_b64 v[0:1], v[2:3] offset:32 scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw volatile or ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + ret void +} + +define amdgpu_kernel void @atomic_or_i64_ret_offset(ptr %out, ptr %out2, i64 %in) { +; GFX7-LABEL: atomic_or_i64_ret_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: 
v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_or_i64_ret_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_or_i64_ret_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: flat_atomic_or_b64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw volatile or ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + store i64 %tmp0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_or_i64_addr64_offset(ptr %out, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_or_i64_addr64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX7-NEXT: s_add_u32 s0, s4, s0 +; GFX7-NEXT: s_addc_u32 s1, s5, s1 +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_or_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_or_i64_addr64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX8-NEXT: s_add_u32 s0, s4, s0 +; GFX8-NEXT: s_addc_u32 s1, s5, s1 +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_or_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_or_i64_addr64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: 
s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_or_b64 v[2:3], v[0:1] offset:32 scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 + %tmp0 = atomicrmw volatile or ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + ret void +} + +define amdgpu_kernel void @atomic_or_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_or_i64_ret_addr64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s4 +; GFX7-NEXT: s_addc_u32 s1, s1, s5 +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_or_i64_ret_addr64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s4 +; GFX8-NEXT: s_addc_u32 s1, s1, s5 +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_or_i64_ret_addr64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_or_b64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 + %tmp0 = atomicrmw volatile or ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + store i64 %tmp0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_or_i64(ptr %out, i64 %in) { +; GFX7-LABEL: atomic_or_i64: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; 
GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_or_i64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_or_i64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_atomic_or_b64 v[0:1], v[2:3] scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm +entry: + %tmp0 = atomicrmw volatile or ptr %out, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + ret void +} + +define amdgpu_kernel void @atomic_or_i64_ret(ptr %out, ptr %out2, i64 %in) { +; GFX7-LABEL: atomic_or_i64_ret: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: v_mov_b32_e32 v3, s7 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_or_i64_ret: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_mov_b32_e32 v2, s6 +; GFX8-NEXT: v_mov_b32_e32 v3, s7 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_or_i64_ret: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: flat_atomic_or_b64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %tmp0 = atomicrmw volatile or ptr %out, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + store i64 %tmp0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_or_i64_addr64(ptr %out, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_or_i64_addr64: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], 
s[2:3], 0xd +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX7-NEXT: s_add_u32 s0, s4, s0 +; GFX7-NEXT: s_addc_u32 s1, s5, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_or_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_or_i64_addr64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX8-NEXT: s_add_u32 s0, s4, s0 +; GFX8-NEXT: s_addc_u32 s1, s5, s1 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_or_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_or_i64_addr64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_or_b64 v[2:3], v[0:1] scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %tmp0 = atomicrmw volatile or ptr %ptr, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + ret void +} + +define amdgpu_kernel void @atomic_or_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_or_i64_ret_addr64: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s4 +; GFX7-NEXT: s_addc_u32 s1, s1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_or_i64_ret_addr64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s4 +; GFX8-NEXT: s_addc_u32 s1, s1, s5 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_or_i64_ret_addr64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; 
GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_or_b64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %tmp0 = atomicrmw volatile or ptr %ptr, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + store i64 %tmp0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_xchg_i64_offset(ptr %out, i64 %in) { +; GFX7-LABEL: atomic_xchg_i64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_xchg_i64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_xchg_i64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[2:3] offset:32 scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw volatile xchg ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + ret void +} + +define amdgpu_kernel void @atomic_xchg_f64_offset(ptr %out, double %in) { +; GFX7-LABEL: atomic_xchg_f64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_xchg_f64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_endpgm +; +; 
GFX12-LABEL: atomic_xchg_f64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[2:3] offset:32 scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm +entry: + %gep = getelementptr double, ptr %out, i64 4 + %tmp0 = atomicrmw volatile xchg ptr %gep, double %in syncscope("agent") seq_cst, !noalias.addrspace !0 + ret void +} + +define amdgpu_kernel void @atomic_xchg_pointer_offset(ptr %out, ptr %in) { +; GFX7-LABEL: atomic_xchg_pointer_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_xchg_pointer_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_xchg_pointer_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[2:3] offset:32 scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm +entry: + %gep = getelementptr ptr, ptr %out, i32 4 + %val = atomicrmw volatile xchg ptr %gep, ptr %in syncscope("agent") seq_cst, !noalias.addrspace !0 + ret void +} + +define amdgpu_kernel void @atomic_xchg_i64_ret_offset(ptr %out, ptr %out2, i64 %in) { +; GFX7-LABEL: atomic_xchg_i64_ret_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_xchg_i64_ret_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: 
v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_xchg_i64_ret_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw volatile xchg ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + store i64 %tmp0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_xchg_i64_addr64_offset(ptr %out, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_xchg_i64_addr64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX7-NEXT: s_add_u32 s0, s4, s0 +; GFX7-NEXT: s_addc_u32 s1, s5, s1 +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_xchg_i64_addr64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX8-NEXT: s_add_u32 s0, s4, s0 +; GFX8-NEXT: s_addc_u32 s1, s5, s1 +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_xchg_i64_addr64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_swap_b64 v[2:3], v[0:1] offset:32 scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 + %tmp0 = atomicrmw volatile xchg ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + ret void +} + +define amdgpu_kernel 
void @atomic_xchg_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_xchg_i64_ret_addr64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s4 +; GFX7-NEXT: s_addc_u32 s1, s1, s5 +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_xchg_i64_ret_addr64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s4 +; GFX8-NEXT: s_addc_u32 s1, s1, s5 +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_xchg_i64_ret_addr64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 + %tmp0 = atomicrmw volatile xchg ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + store i64 %tmp0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_xchg_i64(ptr %out, i64 %in) { +; GFX7-LABEL: atomic_xchg_i64: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_xchg_i64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_endpgm +; +; 
GFX12-LABEL: atomic_xchg_i64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[2:3] scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm +entry: + %tmp0 = atomicrmw volatile xchg ptr %out, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + ret void +} + +define amdgpu_kernel void @atomic_xchg_i64_ret(ptr %out, ptr %out2, i64 %in) { +; GFX7-LABEL: atomic_xchg_i64_ret: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: v_mov_b32_e32 v3, s7 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_xchg_i64_ret: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_mov_b32_e32 v2, s6 +; GFX8-NEXT: v_mov_b32_e32 v3, s7 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_xchg_i64_ret: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %tmp0 = atomicrmw volatile xchg ptr %out, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + store i64 %tmp0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_xchg_i64_addr64(ptr %out, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_xchg_i64_addr64: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX7-NEXT: s_add_u32 s0, s4, s0 +; GFX7-NEXT: s_addc_u32 s1, s5, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_xchg_i64_addr64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; 
GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX8-NEXT: s_add_u32 s0, s4, s0 +; GFX8-NEXT: s_addc_u32 s1, s5, s1 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_xchg_i64_addr64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_swap_b64 v[2:3], v[0:1] scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %tmp0 = atomicrmw volatile xchg ptr %ptr, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + ret void +} + +define amdgpu_kernel void @atomic_xchg_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_xchg_i64_ret_addr64: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s4 +; GFX7-NEXT: s_addc_u32 s1, s1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_xchg_i64_ret_addr64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s4 +; GFX8-NEXT: s_addc_u32 s1, s1, s5 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_xchg_i64_ret_addr64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %tmp0 = 
atomicrmw volatile xchg ptr %ptr, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + store i64 %tmp0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_xor_i64_offset(ptr %out, i64 %in) { +; GFX7-LABEL: atomic_xor_i64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_xor_i64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_xor_i64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_atomic_xor_b64 v[0:1], v[2:3] offset:32 scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw volatile xor ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + ret void +} + +define amdgpu_kernel void @atomic_xor_i64_ret_offset(ptr %out, ptr %out2, i64 %in) { +; GFX7-LABEL: atomic_xor_i64_ret_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_xor_i64_ret_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_xor_i64_ret_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; 
GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: flat_atomic_xor_b64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw volatile xor ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + store i64 %tmp0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_xor_i64_addr64_offset(ptr %out, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_xor_i64_addr64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX7-NEXT: s_add_u32 s0, s4, s0 +; GFX7-NEXT: s_addc_u32 s1, s5, s1 +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_xor_i64_addr64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX8-NEXT: s_add_u32 s0, s4, s0 +; GFX8-NEXT: s_addc_u32 s1, s5, s1 +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_xor_i64_addr64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_xor_b64 v[2:3], v[0:1] offset:32 scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 + %tmp0 = atomicrmw volatile xor ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + ret void +} + +define amdgpu_kernel void @atomic_xor_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_xor_i64_ret_addr64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s4 +; GFX7-NEXT: s_addc_u32 s1, s1, s5 +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: 
v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_xor_i64_ret_addr64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s4 +; GFX8-NEXT: s_addc_u32 s1, s1, s5 +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_xor_i64_ret_addr64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_xor_b64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 + %tmp0 = atomicrmw volatile xor ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + store i64 %tmp0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_xor_i64(ptr %out, i64 %in) { +; GFX7-LABEL: atomic_xor_i64: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_xor_i64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_xor_i64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_atomic_xor_b64 v[0:1], v[2:3] scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm +entry: + %tmp0 = atomicrmw volatile xor ptr %out, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + ret void +} + 
+define amdgpu_kernel void @atomic_xor_i64_ret(ptr %out, ptr %out2, i64 %in) { +; GFX7-LABEL: atomic_xor_i64_ret: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: v_mov_b32_e32 v3, s7 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_xor_i64_ret: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_mov_b32_e32 v2, s6 +; GFX8-NEXT: v_mov_b32_e32 v3, s7 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_xor_i64_ret: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: flat_atomic_xor_b64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %tmp0 = atomicrmw volatile xor ptr %out, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + store i64 %tmp0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_xor_i64_addr64(ptr %out, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_xor_i64_addr64: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX7-NEXT: s_add_u32 s0, s4, s0 +; GFX7-NEXT: s_addc_u32 s1, s5, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_xor_i64_addr64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX8-NEXT: s_add_u32 s0, s4, s0 +; GFX8-NEXT: s_addc_u32 s1, s5, s1 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_xor_i64_addr64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; 
GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_xor_b64 v[2:3], v[0:1] scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %tmp0 = atomicrmw volatile xor ptr %ptr, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + ret void +} + +define amdgpu_kernel void @atomic_xor_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_xor_i64_ret_addr64: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s4 +; GFX7-NEXT: s_addc_u32 s1, s1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_xor_i64_ret_addr64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s4 +; GFX8-NEXT: s_addc_u32 s1, s1, s5 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_xor_i64_ret_addr64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_xor_b64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %tmp0 = atomicrmw volatile xor ptr %ptr, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + store i64 %tmp0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_load_i64_offset(ptr %in, ptr %out) { +; GFX7-LABEL: atomic_load_i64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 
+; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_load_i64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_load_i64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1] offset:32 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr %in, i64 4 + %val = load atomic i64, ptr %gep seq_cst, align 8 + store i64 %val, ptr %out + ret void +} + +define amdgpu_kernel void @atomic_load_i64(ptr %in, ptr %out) { +; GFX7-LABEL: atomic_load_i64: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_load_i64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_load_i64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1] scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %val = load atomic i64, ptr %in syncscope("agent") seq_cst, align 8 + store i64 %val, ptr %out + ret void +} + +define amdgpu_kernel void @atomic_load_i64_addr64_offset(ptr %in, ptr %out, i64 %index) { +; GFX7-LABEL: atomic_load_i64_addr64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s4 +; GFX7-NEXT: s_addc_u32 s1, s1, s5 +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: 
v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_load_i64_addr64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s4 +; GFX8-NEXT: s_addc_u32 s1, s1, s5 +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_load_i64_addr64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1] offset:32 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %in, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 + %val = load atomic i64, ptr %gep seq_cst, align 8 + store i64 %val, ptr %out + ret void +} + +define amdgpu_kernel void @atomic_load_i64_addr64(ptr %in, ptr %out, i64 %index) { +; GFX7-LABEL: atomic_load_i64_addr64: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s4 +; GFX7-NEXT: s_addc_u32 s1, s1, s5 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_load_i64_addr64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s4 +; GFX8-NEXT: s_addc_u32 s1, s1, s5 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_load_i64_addr64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: 
s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1] scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %in, i64 %index + %val = load atomic i64, ptr %ptr seq_cst, align 8 + store i64 %val, ptr %out + ret void +} + +define amdgpu_kernel void @atomic_store_i64_offset(i64 %in, ptr %out) { +; GFX7-LABEL: atomic_store_i64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: s_add_u32 s0, s2, 32 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_addc_u32 s1, s3, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_store_i64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: s_add_u32 s0, s2, 32 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: s_addc_u32 s1, s3, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_store_i64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] offset:32 scope:SCOPE_SYS +; GFX12-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr %out, i64 4 + store atomic i64 %in, ptr %gep seq_cst, align 8 + ret void +} + +define amdgpu_kernel void @atomic_store_i64(i64 %in, ptr %out) { +; GFX7-LABEL: atomic_store_i64: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_store_i64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_store_i64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] scope:SCOPE_SYS +; GFX12-NEXT: s_endpgm +entry: + store atomic i64 %in, ptr %out seq_cst, align 8 + ret void +} + +define amdgpu_kernel void @atomic_store_i64_addr64_offset(i64 %in, ptr %out, i64 %index) { +; GFX7-LABEL: atomic_store_i64_addr64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: 
s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX7-NEXT: s_add_u32 s0, s6, s0 +; GFX7-NEXT: s_addc_u32 s1, s7, s1 +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_store_i64_addr64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX8-NEXT: s_add_u32 s0, s6, s0 +; GFX8-NEXT: s_addc_u32 s1, s7, s1 +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_store_i64_addr64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[4:5] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] offset:32 scope:SCOPE_SYS +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 + store atomic i64 %in, ptr %gep seq_cst, align 8 + ret void +} + +define amdgpu_kernel void @atomic_store_i64_addr64(i64 %in, ptr %out, i64 %index) { +; GFX7-LABEL: atomic_store_i64_addr64: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX7-NEXT: s_add_u32 s0, s6, s0 +; GFX7-NEXT: s_addc_u32 s1, s7, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_store_i64_addr64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX8-NEXT: s_add_u32 s0, s6, s0 +; GFX8-NEXT: s_addc_u32 s1, s7, s1 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_store_i64_addr64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[4:5] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 
v3, s3 +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] scope:SCOPE_SYS +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + store atomic i64 %in, ptr %ptr seq_cst, align 8 + ret void +} + +define amdgpu_kernel void @atomic_cmpxchg_i64_offset(ptr %out, i64 %in, i64 %old) { +; GFX7-LABEL: atomic_cmpxchg_i64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s2, s4, 32 +; GFX7-NEXT: s_addc_u32 s3, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v5, s3 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v4, s2 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_cmpxchg_i64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_add_u32 s2, s4, 32 +; GFX8-NEXT: s_addc_u32 s3, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v5, s3 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s2 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_cmpxchg_i64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:3] offset:32 scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr %out, i64 4 + %val = cmpxchg volatile ptr %gep, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst + ret void +} + +define amdgpu_kernel void @atomic_cmpxchg_i64_soffset(ptr %out, i64 %in, i64 %old) { +; GFX7-LABEL: atomic_cmpxchg_i64_soffset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s2, s4, 0x11940 +; GFX7-NEXT: s_addc_u32 s3, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v5, s3 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v4, s2 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_cmpxchg_i64_soffset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_add_u32 s2, s4, 0x11940 +; GFX8-NEXT: s_addc_u32 s3, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v5, s3 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: v_mov_b32_e32 v3, 
s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s2 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_cmpxchg_i64_soffset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:3] offset:72000 scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr %out, i64 9000 + %val = cmpxchg volatile ptr %gep, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst + ret void +} + +define amdgpu_kernel void @atomic_cmpxchg_i64_ret_offset(ptr %out, ptr %out2, i64 %in, i64 %old) { +; GFX7-LABEL: atomic_cmpxchg_i64_ret_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: v_mov_b32_e32 v3, s7 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_cmpxchg_i64_ret_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mov_b32_e32 v2, s6 +; GFX8-NEXT: v_mov_b32_e32 v3, s7 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_cmpxchg_i64_ret_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[0:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr %out, i64 4 + %val = cmpxchg volatile ptr %gep, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst + %extract0 = extractvalue { i64, i1 } %val, 0 + store i64 %extract0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_cmpxchg_i64_addr64_offset(ptr %out, i64 %in, i64 %index, i64 %old) { +; GFX7-LABEL: atomic_cmpxchg_i64_addr64_offset: 
+; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s4 +; GFX7-NEXT: s_addc_u32 s1, s1, s5 +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: v_mov_b32_e32 v3, s7 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_cmpxchg_i64_addr64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s4 +; GFX8-NEXT: s_addc_u32 s1, s1, s5 +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_mov_b32_e32 v2, s6 +; GFX8-NEXT: v_mov_b32_e32 v3, s7 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_cmpxchg_i64_addr64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v4, s0 +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:3] offset:32 scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 + %val = cmpxchg volatile ptr %gep, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst + ret void +} + +define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index, i64 %old) { +; GFX7-LABEL: atomic_cmpxchg_i64_ret_addr64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x11 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_lshl_b64 s[2:3], s[10:11], 3 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: s_add_u32 s0, s4, s2 +; GFX7-NEXT: s_addc_u32 s3, s5, s3 +; GFX7-NEXT: s_add_u32 s2, s0, 32 +; GFX7-NEXT: s_addc_u32 s3, s3, 0 +; GFX7-NEXT: v_mov_b32_e32 v5, s3 +; GFX7-NEXT: v_mov_b32_e32 v0, s8 +; GFX7-NEXT: v_mov_b32_e32 v1, s9 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v4, s2 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: v_mov_b32_e32 v3, s7 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_cmpxchg_i64_ret_addr64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x44 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_lshl_b64 s[2:3], s[10:11], 3 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: s_add_u32 s0, s4, s2 +; 
GFX8-NEXT: s_addc_u32 s3, s5, s3 +; GFX8-NEXT: s_add_u32 s2, s0, 32 +; GFX8-NEXT: s_addc_u32 s3, s3, 0 +; GFX8-NEXT: v_mov_b32_e32 v5, s3 +; GFX8-NEXT: v_mov_b32_e32 v0, s8 +; GFX8-NEXT: v_mov_b32_e32 v1, s9 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s2 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_mov_b32_e32 v2, s6 +; GFX8-NEXT: v_mov_b32_e32 v3, s7 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_cmpxchg_i64_ret_addr64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b256 s[4:11], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x44 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[2:3], s[10:11], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9 +; GFX12-NEXT: s_add_nc_u64 s[2:3], s[4:5], s[2:3] +; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2 +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[0:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 + %val = cmpxchg volatile ptr %gep, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst + %extract0 = extractvalue { i64, i1 } %val, 0 + store i64 %extract0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_cmpxchg_i64(ptr %out, i64 %in, i64 %old) { +; GFX7-LABEL: atomic_cmpxchg_i64: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v4, s4 +; GFX7-NEXT: v_mov_b32_e32 v5, s5 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_cmpxchg_i64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NEXT: v_mov_b32_e32 v5, s5 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_cmpxchg_i64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:3] scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm +entry: + %val = cmpxchg volatile ptr %out, i64 %old, i64 %in syncscope("agent") 
seq_cst seq_cst + ret void +} + +define amdgpu_kernel void @atomic_cmpxchg_i64_ret(ptr %out, ptr %out2, i64 %in, i64 %old) { +; GFX7-LABEL: atomic_cmpxchg_i64_ret: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: v_mov_b32_e32 v3, s7 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_cmpxchg_i64_ret: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mov_b32_e32 v2, s6 +; GFX8-NEXT: v_mov_b32_e32 v3, s7 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_cmpxchg_i64_ret: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[0:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %val = cmpxchg volatile ptr %out, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst + %extract0 = extractvalue { i64, i1 } %val, 0 + store i64 %extract0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_cmpxchg_i64_addr64(ptr %out, i64 %in, i64 %index, i64 %old) { +; GFX7-LABEL: atomic_cmpxchg_i64_addr64: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s4 +; GFX7-NEXT: s_addc_u32 s1, s1, s5 +; GFX7-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: v_mov_b32_e32 v3, s7 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_cmpxchg_i64_addr64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s4 +; GFX8-NEXT: s_addc_u32 s1, s1, s5 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_mov_b32_e32 v2, s6 +; GFX8-NEXT: v_mov_b32_e32 v3, s7 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3] +; GFX8-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_cmpxchg_i64_addr64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v4, s0 +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:3] scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %val = cmpxchg volatile ptr %ptr, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst + ret void +} + +define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index, i64 %old) { +; GFX7-LABEL: atomic_cmpxchg_i64_ret_addr64: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x11 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_lshl_b64 s[2:3], s[10:11], 3 +; GFX7-NEXT: s_add_u32 s2, s4, s2 +; GFX7-NEXT: s_addc_u32 s3, s5, s3 +; GFX7-NEXT: v_mov_b32_e32 v5, s3 +; GFX7-NEXT: v_mov_b32_e32 v0, s8 +; GFX7-NEXT: v_mov_b32_e32 v1, s9 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v4, s2 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: v_mov_b32_e32 v3, s7 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_cmpxchg_i64_ret_addr64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x44 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_lshl_b64 s[2:3], s[10:11], 3 +; GFX8-NEXT: s_add_u32 s2, s4, s2 +; GFX8-NEXT: s_addc_u32 s3, s5, s3 +; GFX8-NEXT: v_mov_b32_e32 v5, s3 +; GFX8-NEXT: v_mov_b32_e32 v0, s8 +; GFX8-NEXT: v_mov_b32_e32 v1, s9 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s2 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_mov_b32_e32 v2, s6 +; GFX8-NEXT: v_mov_b32_e32 v3, s7 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_cmpxchg_i64_ret_addr64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b256 s[4:11], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x44 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[2:3], s[10:11], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9 +; GFX12-NEXT: s_add_nc_u64 s[2:3], s[4:5], s[2:3] +; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2 +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[0:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %val = cmpxchg volatile ptr %ptr, i64 %old, i64 %in syncscope("agent") seq_cst 
seq_cst + %extract0 = extractvalue { i64, i1 } %val, 0 + store i64 %extract0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_load_f64_offset(ptr %in, ptr %out) { +; GFX7-LABEL: atomic_load_f64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_load_f64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_load_f64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1] offset:32 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %gep = getelementptr double, ptr %in, i64 4 + %val = load atomic double, ptr %gep seq_cst, align 8 + store double %val, ptr %out + ret void +} + +define amdgpu_kernel void @atomic_load_f64(ptr %in, ptr %out) { +; GFX7-LABEL: atomic_load_f64: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_load_f64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_load_f64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1] scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %val = load atomic double, ptr %in syncscope("agent") seq_cst, align 8 + store 
double %val, ptr %out + ret void +} + +define amdgpu_kernel void @atomic_load_f64_addr64_offset(ptr %in, ptr %out, i64 %index) { +; GFX7-LABEL: atomic_load_f64_addr64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s4 +; GFX7-NEXT: s_addc_u32 s1, s1, s5 +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_load_f64_addr64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s4 +; GFX8-NEXT: s_addc_u32 s1, s1, s5 +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_load_f64_addr64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1] offset:32 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr double, ptr %in, i64 %index + %gep = getelementptr double, ptr %ptr, i64 4 + %val = load atomic double, ptr %gep seq_cst, align 8 + store double %val, ptr %out + ret void +} + +define amdgpu_kernel void @atomic_load_f64_addr64(ptr %in, ptr %out, i64 %index) { +; GFX7-LABEL: atomic_load_f64_addr64: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s4 +; GFX7-NEXT: s_addc_u32 s1, s1, s5 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_load_f64_addr64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s4 +; GFX8-NEXT: 
s_addc_u32 s1, s1, s5 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_load_f64_addr64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1] scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr double, ptr %in, i64 %index + %val = load atomic double, ptr %ptr seq_cst, align 8 + store double %val, ptr %out + ret void +} + +define amdgpu_kernel void @atomic_store_f64_offset(double %in, ptr %out) { +; GFX7-LABEL: atomic_store_f64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: s_add_u32 s0, s2, 32 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_addc_u32 s1, s3, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_store_f64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: s_add_u32 s0, s2, 32 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: s_addc_u32 s1, s3, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_store_f64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] offset:32 scope:SCOPE_SYS +; GFX12-NEXT: s_endpgm +entry: + %gep = getelementptr double, ptr %out, i64 4 + store atomic double %in, ptr %gep seq_cst, align 8 + ret void +} + +define amdgpu_kernel void @atomic_store_f64(double %in, ptr %out) { +; GFX7-LABEL: atomic_store_f64: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_store_f64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_store_f64: +; GFX12: ; %bb.0: ; %entry 
+; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] scope:SCOPE_SYS +; GFX12-NEXT: s_endpgm +entry: + store atomic double %in, ptr %out seq_cst, align 8 + ret void +} + +define amdgpu_kernel void @atomic_store_f64_addr64_offset(double %in, ptr %out, i64 %index) { +; GFX7-LABEL: atomic_store_f64_addr64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX7-NEXT: s_add_u32 s0, s6, s0 +; GFX7-NEXT: s_addc_u32 s1, s7, s1 +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_store_f64_addr64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX8-NEXT: s_add_u32 s0, s6, s0 +; GFX8-NEXT: s_addc_u32 s1, s7, s1 +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_store_f64_addr64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[4:5] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] offset:32 scope:SCOPE_SYS +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr double, ptr %out, i64 %index + %gep = getelementptr double, ptr %ptr, i64 4 + store atomic double %in, ptr %gep seq_cst, align 8 + ret void +} + +define amdgpu_kernel void @atomic_store_f64_addr64(double %in, ptr %out, i64 %index) { +; GFX7-LABEL: atomic_store_f64_addr64: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX7-NEXT: s_add_u32 s0, s6, s0 +; GFX7-NEXT: s_addc_u32 s1, s7, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_store_f64_addr64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX8-NEXT: s_add_u32 s0, s6, s0 +; GFX8-NEXT: s_addc_u32 s1, s7, s1 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 
v1, s5 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_store_f64_addr64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[4:5] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] scope:SCOPE_SYS +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr double, ptr %out, i64 %index + store atomic double %in, ptr %ptr seq_cst, align 8 + ret void +} + +define amdgpu_kernel void @atomic_inc_i64_offset(ptr %out, i64 %in) { +; GFX7-LABEL: atomic_inc_i64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_inc_i64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_inc_i64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[2:3] offset:32 scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw volatile uinc_wrap ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + ret void +} + +define amdgpu_kernel void @atomic_inc_i64_ret_offset(ptr %out, ptr %out2, i64 %in) { +; GFX7-LABEL: atomic_inc_i64_ret_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_inc_i64_ret_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: 
s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_inc_i64_ret_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw volatile uinc_wrap ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + store i64 %tmp0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_inc_i64_incr64_offset(ptr %out, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_inc_i64_incr64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX7-NEXT: s_add_u32 s0, s4, s0 +; GFX7-NEXT: s_addc_u32 s1, s5, s1 +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_inc_i64_incr64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX8-NEXT: s_add_u32 s0, s4, s0 +; GFX8-NEXT: s_addc_u32 s1, s5, s1 +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_inc_i64_incr64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_inc_u64 v[2:3], v[0:1] offset:32 scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr 
%out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 + %tmp0 = atomicrmw volatile uinc_wrap ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + ret void +} + +define amdgpu_kernel void @atomic_inc_i64_ret_incr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_inc_i64_ret_incr64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s4 +; GFX7-NEXT: s_addc_u32 s1, s1, s5 +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_inc_i64_ret_incr64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s4 +; GFX8-NEXT: s_addc_u32 s1, s1, s5 +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_inc_i64_ret_incr64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 + %tmp0 = atomicrmw volatile uinc_wrap ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + store i64 %tmp0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_inc_i64(ptr %out, i64 %in) { +; GFX7-LABEL: atomic_inc_i64: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_inc_i64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 
v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_inc_i64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[2:3] scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm +entry: + %tmp0 = atomicrmw volatile uinc_wrap ptr %out, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + ret void +} + +define amdgpu_kernel void @atomic_inc_i64_ret(ptr %out, ptr %out2, i64 %in) { +; GFX7-LABEL: atomic_inc_i64_ret: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: v_mov_b32_e32 v3, s7 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_inc_i64_ret: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_mov_b32_e32 v2, s6 +; GFX8-NEXT: v_mov_b32_e32 v3, s7 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_inc_i64_ret: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %tmp0 = atomicrmw volatile uinc_wrap ptr %out, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + store i64 %tmp0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_inc_i64_incr64(ptr %out, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_inc_i64_incr64: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX7-NEXT: s_add_u32 s0, s4, s0 +; GFX7-NEXT: s_addc_u32 s1, s5, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; 
GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_inc_i64_incr64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX8-NEXT: s_add_u32 s0, s4, s0 +; GFX8-NEXT: s_addc_u32 s1, s5, s1 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_inc_i64_incr64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_inc_u64 v[2:3], v[0:1] scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %tmp0 = atomicrmw volatile uinc_wrap ptr %ptr, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + ret void +} + +define amdgpu_kernel void @atomic_inc_i64_ret_incr64(ptr %out, ptr %out2, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_inc_i64_ret_incr64: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s4 +; GFX7-NEXT: s_addc_u32 s1, s1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_inc_i64_ret_incr64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s4 +; GFX8-NEXT: s_addc_u32 s1, s1, s5 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_inc_i64_ret_incr64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; 
GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %tmp0 = atomicrmw volatile uinc_wrap ptr %ptr, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + store i64 %tmp0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_dec_i64_offset(ptr %out, i64 %in) { +; GFX7-LABEL: atomic_dec_i64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_dec_i64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_dec_i64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_atomic_dec_u64 v[0:1], v[2:3] offset:32 scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw volatile udec_wrap ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + ret void +} + +define amdgpu_kernel void @atomic_dec_i64_ret_offset(ptr %out, ptr %out2, i64 %in) { +; GFX7-LABEL: atomic_dec_i64_ret_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_dec_i64_ret_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: 
atomic_dec_i64_ret_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: flat_atomic_dec_u64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw volatile udec_wrap ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + store i64 %tmp0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_dec_i64_decr64_offset(ptr %out, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_dec_i64_decr64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX7-NEXT: s_add_u32 s0, s4, s0 +; GFX7-NEXT: s_addc_u32 s1, s5, s1 +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_dec_i64_decr64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX8-NEXT: s_add_u32 s0, s4, s0 +; GFX8-NEXT: s_addc_u32 s1, s5, s1 +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_dec_i64_decr64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_dec_u64 v[2:3], v[0:1] offset:32 scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 + %tmp0 = atomicrmw volatile udec_wrap ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + ret void +} + +define amdgpu_kernel void @atomic_dec_i64_ret_decr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_dec_i64_ret_decr64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; 
GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s4 +; GFX7-NEXT: s_addc_u32 s1, s1, s5 +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_dec_i64_ret_decr64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s4 +; GFX8-NEXT: s_addc_u32 s1, s1, s5 +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_dec_i64_ret_decr64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_dec_u64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 + %tmp0 = atomicrmw volatile udec_wrap ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + store i64 %tmp0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_dec_i64(ptr %out, i64 %in) { +; GFX7-LABEL: atomic_dec_i64: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_dec_i64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_dec_i64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_atomic_dec_u64 v[0:1], v[2:3] scope:SCOPE_DEV 
+; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm +entry: + %tmp0 = atomicrmw volatile udec_wrap ptr %out, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + ret void +} + +define amdgpu_kernel void @atomic_dec_i64_ret(ptr %out, ptr %out2, i64 %in) { +; GFX7-LABEL: atomic_dec_i64_ret: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: v_mov_b32_e32 v3, s7 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_dec_i64_ret: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_mov_b32_e32 v2, s6 +; GFX8-NEXT: v_mov_b32_e32 v3, s7 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_dec_i64_ret: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: flat_atomic_dec_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %tmp0 = atomicrmw volatile udec_wrap ptr %out, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + store i64 %tmp0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_dec_i64_decr64(ptr %out, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_dec_i64_decr64: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX7-NEXT: s_add_u32 s0, s4, s0 +; GFX7-NEXT: s_addc_u32 s1, s5, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_dec_i64_decr64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX8-NEXT: s_add_u32 s0, s4, s0 +; GFX8-NEXT: s_addc_u32 s1, s5, s1 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] +; 
GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_dec_i64_decr64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_dec_u64 v[2:3], v[0:1] scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %tmp0 = atomicrmw volatile udec_wrap ptr %ptr, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + ret void +} + +define amdgpu_kernel void @atomic_dec_i64_ret_decr64(ptr %out, ptr %out2, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_dec_i64_ret_decr64: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s4 +; GFX7-NEXT: s_addc_u32 s1, s1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_dec_i64_ret_decr64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s4 +; GFX8-NEXT: s_addc_u32 s1, s1, s5 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_dec_i64_ret_decr64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_dec_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %tmp0 = atomicrmw volatile udec_wrap ptr %ptr, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + store i64 %tmp0, ptr %out2 + ret void +} + +!0 = !{i32 5, i32 6} diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system_noprivate.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system_noprivate.ll new file mode 100644 index 
0000000000000000000000000000000000000000..edd5620dc411282580492d0566db762b3deb93ae --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system_noprivate.ll @@ -0,0 +1,9196 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GFX7 %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GFX8 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s + +; --------------------------------------------------------------------- +; atomicrmw xchg +; --------------------------------------------------------------------- + +define void @flat_atomic_xchg_i64_noret(ptr %ptr, i64 %in) { +; GFX7-LABEL: flat_atomic_xchg_i64_noret: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_xchg_i64_noret: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_xchg_i64_noret: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %tmp0 = atomicrmw xchg ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define void @flat_atomic_xchg_i64_noret_offset(ptr %out, i64 %in) { +; GFX7-LABEL: flat_atomic_xchg_i64_noret_offset: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_xchg_i64_noret_offset: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_xchg_i64_noret_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] offset:32 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw xchg ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define i64 @flat_atomic_xchg_i64_ret(ptr %ptr, i64 %in) { +; GFX7-LABEL: flat_atomic_xchg_i64_ret: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_xchg_i64_ret: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; 
GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_xchg_i64_ret: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %result = atomicrmw xchg ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret i64 %result +} + +define i64 @flat_atomic_xchg_i64_ret_offset(ptr %out, i64 %in) { +; GFX7-LABEL: flat_atomic_xchg_i64_ret_offset: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_xchg_i64_ret_offset: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_xchg_i64_ret_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %result = atomicrmw xchg ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret i64 %result +} + +define amdgpu_gfx void @flat_atomic_xchg_i64_noret_scalar(ptr inreg %ptr, i64 inreg %in) { +; GFX7-LABEL: flat_atomic_xchg_i64_noret_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: v_mov_b32_e32 v3, s5 +; GFX7-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_xchg_i64_noret_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_mov_b32_e32 v3, s5 +; GFX8-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_xchg_i64_noret_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %tmp0 = atomicrmw xchg ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define amdgpu_gfx void @flat_atomic_xchg_i64_noret_offset_scalar(ptr inreg %out, i64 inreg %in) { +; GFX7-LABEL: flat_atomic_xchg_i64_noret_offset_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s34, s4, 32 +; GFX7-NEXT: s_addc_u32 s35, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v2, s34 +; GFX7-NEXT: 
v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v3, s35 +; GFX7-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_xchg_i64_noret_offset_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_add_u32 s34, s4, 32 +; GFX8-NEXT: s_addc_u32 s35, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s34 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v3, s35 +; GFX8-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_xchg_i64_noret_offset_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] offset:32 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw xchg ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define amdgpu_gfx i64 @flat_atomic_xchg_i64_ret_scalar(ptr inreg %ptr, i64 inreg %in) { +; GFX7-LABEL: flat_atomic_xchg_i64_ret_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: v_mov_b32_e32 v3, s5 +; GFX7-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_xchg_i64_ret_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_mov_b32_e32 v3, s5 +; GFX8-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_xchg_i64_ret_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %result = atomicrmw xchg ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret i64 %result +} + +define amdgpu_gfx i64 @flat_atomic_xchg_i64_ret_offset_scalar(ptr inreg %out, i64 inreg %in) { +; GFX7-LABEL: flat_atomic_xchg_i64_ret_offset_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s34, s4, 32 +; GFX7-NEXT: s_addc_u32 s35, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v2, s34 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v3, s35 +; GFX7-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_xchg_i64_ret_offset_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_add_u32 s34, s4, 32 +; GFX8-NEXT: s_addc_u32 s35, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s34 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v3, s35 +; GFX8-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_xchg_i64_ret_offset_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %result = atomicrmw xchg ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret i64 %result +} + +define void @flat_atomic_xchg_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) { +; GFX7-LABEL: flat_atomic_xchg_i64_noret_offset__amdgpu_no_remote_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_xchg_i64_noret_offset__amdgpu_no_remote_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_xchg_i64_noret_offset__amdgpu_no_remote_memory: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] offset:32 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw xchg ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 + ret void +} + +define i64 @flat_atomic_xchg_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) { +; GFX7-LABEL: flat_atomic_xchg_i64_ret_offset__amdgpu_no_remote_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_xchg_i64_ret_offset__amdgpu_no_remote_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_xchg_i64_ret_offset__amdgpu_no_remote_memory: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %result = atomicrmw xchg ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 + ret i64 %result +} + +; --------------------------------------------------------------------- +; atomicrmw xchg f64 +; --------------------------------------------------------------------- + +define void @flat_atomic_xchg_f64_noret(ptr %ptr, double %in) { +; GFX7-LABEL: flat_atomic_xchg_f64_noret: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_xchg_f64_noret: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_xchg_f64_noret: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %tmp0 = atomicrmw xchg ptr %ptr, double %in seq_cst, !noalias.addrspace !1 + ret void +} + +define void @flat_atomic_xchg_f64_noret_offset(ptr %out, double %in) { +; GFX7-LABEL: flat_atomic_xchg_f64_noret_offset: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_xchg_f64_noret_offset: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_xchg_f64_noret_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] offset:32 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr double, ptr %out, i32 4 + %tmp0 = atomicrmw xchg ptr %gep, double %in seq_cst, !noalias.addrspace !1 + ret void +} + +define double @flat_atomic_xchg_f64_ret(ptr %ptr, double %in) { +; GFX7-LABEL: flat_atomic_xchg_f64_ret: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_xchg_f64_ret: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_xchg_f64_ret: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: 
buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %result = atomicrmw xchg ptr %ptr, double %in seq_cst, !noalias.addrspace !1 + ret double %result +} + +define double @flat_atomic_xchg_f64_ret_offset(ptr %out, double %in) { +; GFX7-LABEL: flat_atomic_xchg_f64_ret_offset: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_xchg_f64_ret_offset: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_xchg_f64_ret_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr double, ptr %out, i32 4 + %result = atomicrmw xchg ptr %gep, double %in seq_cst, !noalias.addrspace !1 + ret double %result +} + +define amdgpu_gfx void @flat_atomic_xchg_f64_noret_scalar(ptr inreg %ptr, double inreg %in) { +; GFX7-LABEL: flat_atomic_xchg_f64_noret_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: v_mov_b32_e32 v3, s5 +; GFX7-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_xchg_f64_noret_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_mov_b32_e32 v3, s5 +; GFX8-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_xchg_f64_noret_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %tmp0 = atomicrmw xchg ptr %ptr, double %in seq_cst, !noalias.addrspace !1 + ret void +} + +define amdgpu_gfx void @flat_atomic_xchg_f64_noret_offset_scalar(ptr inreg %out, double inreg %in) { +; GFX7-LABEL: flat_atomic_xchg_f64_noret_offset_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s34, s4, 32 +; GFX7-NEXT: s_addc_u32 s35, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v2, s34 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v3, s35 +; GFX7-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: 
flat_atomic_xchg_f64_noret_offset_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_add_u32 s34, s4, 32 +; GFX8-NEXT: s_addc_u32 s35, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s34 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v3, s35 +; GFX8-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_xchg_f64_noret_offset_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] offset:32 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr double, ptr %out, i32 4 + %tmp0 = atomicrmw xchg ptr %gep, double %in seq_cst, !noalias.addrspace !1 + ret void +} + +define amdgpu_gfx double @flat_atomic_xchg_f64_ret_scalar(ptr inreg %ptr, double inreg %in) { +; GFX7-LABEL: flat_atomic_xchg_f64_ret_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: v_mov_b32_e32 v3, s5 +; GFX7-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_xchg_f64_ret_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_mov_b32_e32 v3, s5 +; GFX8-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_xchg_f64_ret_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %result = atomicrmw xchg ptr %ptr, double %in seq_cst, !noalias.addrspace !1 + ret double %result +} + +define amdgpu_gfx double @flat_atomic_xchg_f64_ret_offset_scalar(ptr inreg %out, double inreg %in) { +; GFX7-LABEL: flat_atomic_xchg_f64_ret_offset_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s34, s4, 32 +; GFX7-NEXT: s_addc_u32 s35, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v2, s34 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v3, s35 +; GFX7-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_xchg_f64_ret_offset_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_add_u32 s34, s4, 32 +; GFX8-NEXT: s_addc_u32 s35, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s34 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v3, s35 +; GFX8-NEXT: 
flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_xchg_f64_ret_offset_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr double, ptr %out, i32 4 + %result = atomicrmw xchg ptr %gep, double %in seq_cst, !noalias.addrspace !1 + ret double %result +} + +define void @flat_atomic_xchg_f64_noret_offset__amdgpu_no_remote_memory(ptr %out, double %in) { +; GFX7-LABEL: flat_atomic_xchg_f64_noret_offset__amdgpu_no_remote_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_xchg_f64_noret_offset__amdgpu_no_remote_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_xchg_f64_noret_offset__amdgpu_no_remote_memory: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] offset:32 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr double, ptr %out, i64 4 + %tmp0 = atomicrmw xchg ptr %gep, double %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 + ret void +} + +define double @flat_atomic_xchg_f64_ret_offset__amdgpu_no_remote_memory(ptr %out, double %in) { +; GFX7-LABEL: flat_atomic_xchg_f64_ret_offset__amdgpu_no_remote_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_xchg_f64_ret_offset__amdgpu_no_remote_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_xchg_f64_ret_offset__amdgpu_no_remote_memory: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr double, ptr %out, i64 4 + %result = atomicrmw xchg ptr %gep, double %in seq_cst, !amdgpu.no.remote.memory !0, 
!noalias.addrspace !1 + ret double %result +} + +; --------------------------------------------------------------------- +; atomicrmw add +; --------------------------------------------------------------------- + +define void @flat_atomic_add_i64_noret(ptr %ptr, i64 %in) { +; GFX7-LABEL: flat_atomic_add_i64_noret: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_add_i64_noret: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_add_i64_noret: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %tmp0 = atomicrmw add ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define void @flat_atomic_add_i64_noret_offset(ptr %out, i64 %in) { +; GFX7-LABEL: flat_atomic_add_i64_noret_offset: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_add_i64_noret_offset: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_add_i64_noret_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] offset:32 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw add ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define i64 @flat_atomic_add_i64_ret(ptr %ptr, i64 %in) { +; GFX7-LABEL: flat_atomic_add_i64_ret: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_add_i64_ret: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_add_i64_ret: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %result = atomicrmw add ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret i64 %result +} + +define i64 @flat_atomic_add_i64_ret_offset(ptr %out, i64 %in) { +; GFX7-LABEL: 
flat_atomic_add_i64_ret_offset: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_add_i64_ret_offset: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_add_i64_ret_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %result = atomicrmw add ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret i64 %result +} + +define amdgpu_gfx void @flat_atomic_add_i64_noret_scalar(ptr inreg %ptr, i64 inreg %in) { +; GFX7-LABEL: flat_atomic_add_i64_noret_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: v_mov_b32_e32 v3, s5 +; GFX7-NEXT: flat_atomic_add_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_add_i64_noret_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_mov_b32_e32 v3, s5 +; GFX8-NEXT: flat_atomic_add_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_add_i64_noret_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: flat_atomic_add_x2 v[2:3], v[0:1] +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %tmp0 = atomicrmw add ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define amdgpu_gfx void @flat_atomic_add_i64_noret_offset_scalar(ptr inreg %out, i64 inreg %in) { +; GFX7-LABEL: flat_atomic_add_i64_noret_offset_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s34, s4, 32 +; GFX7-NEXT: s_addc_u32 s35, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v2, s34 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v3, s35 +; GFX7-NEXT: flat_atomic_add_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_add_i64_noret_offset_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_add_u32 s34, s4, 32 +; GFX8-NEXT: s_addc_u32 s35, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s34 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 
v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v3, s35 +; GFX8-NEXT: flat_atomic_add_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_add_i64_noret_offset_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: flat_atomic_add_x2 v[2:3], v[0:1] offset:32 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw add ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define amdgpu_gfx i64 @flat_atomic_add_i64_ret_scalar(ptr inreg %ptr, i64 inreg %in) { +; GFX7-LABEL: flat_atomic_add_i64_ret_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: v_mov_b32_e32 v3, s5 +; GFX7-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_add_i64_ret_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_mov_b32_e32 v3, s5 +; GFX8-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_add_i64_ret_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %result = atomicrmw add ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret i64 %result +} + +define amdgpu_gfx i64 @flat_atomic_add_i64_ret_offset_scalar(ptr inreg %out, i64 inreg %in) { +; GFX7-LABEL: flat_atomic_add_i64_ret_offset_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s34, s4, 32 +; GFX7-NEXT: s_addc_u32 s35, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v2, s34 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v3, s35 +; GFX7-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_add_i64_ret_offset_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_add_u32 s34, s4, 32 +; GFX8-NEXT: s_addc_u32 s35, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s34 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v3, s35 +; GFX8-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_add_i64_ret_offset_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; 
GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %result = atomicrmw add ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret i64 %result +} + +define void @flat_atomic_add_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) { +; GFX7-LABEL: flat_atomic_add_i64_noret_offset__amdgpu_no_remote_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_add_i64_noret_offset__amdgpu_no_remote_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_add_i64_noret_offset__amdgpu_no_remote_memory: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] offset:32 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw add ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 + ret void +} + +define i64 @flat_atomic_add_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) { +; GFX7-LABEL: flat_atomic_add_i64_ret_offset__amdgpu_no_remote_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_add_i64_ret_offset__amdgpu_no_remote_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_add_i64_ret_offset__amdgpu_no_remote_memory: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %result = atomicrmw add ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 + ret i64 %result +} + +; --------------------------------------------------------------------- +; atomicrmw sub +; --------------------------------------------------------------------- + +define void @flat_atomic_sub_i64_noret(ptr %ptr, i64 %in) { +; GFX7-LABEL: flat_atomic_sub_i64_noret: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX7-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_sub_i64_noret: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_sub_i64_noret: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %tmp0 = atomicrmw sub ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define void @flat_atomic_sub_i64_noret_offset(ptr %out, i64 %in) { +; GFX7-LABEL: flat_atomic_sub_i64_noret_offset: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_sub_i64_noret_offset: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_sub_i64_noret_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] offset:32 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw sub ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define i64 @flat_atomic_sub_i64_ret(ptr %ptr, i64 %in) { +; GFX7-LABEL: flat_atomic_sub_i64_ret: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_sub_i64_ret: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_sub_i64_ret: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %result = atomicrmw sub ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret i64 %result +} + +define i64 @flat_atomic_sub_i64_ret_offset(ptr %out, i64 %in) { +; GFX7-LABEL: flat_atomic_sub_i64_ret_offset: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: 
flat_atomic_sub_i64_ret_offset: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_sub_i64_ret_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %result = atomicrmw sub ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret i64 %result +} + +define amdgpu_gfx void @flat_atomic_sub_i64_noret_scalar(ptr inreg %ptr, i64 inreg %in) { +; GFX7-LABEL: flat_atomic_sub_i64_noret_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: v_mov_b32_e32 v3, s5 +; GFX7-NEXT: flat_atomic_sub_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_sub_i64_noret_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_mov_b32_e32 v3, s5 +; GFX8-NEXT: flat_atomic_sub_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_sub_i64_noret_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: flat_atomic_sub_x2 v[2:3], v[0:1] +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %tmp0 = atomicrmw sub ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define amdgpu_gfx void @flat_atomic_sub_i64_noret_offset_scalar(ptr inreg %out, i64 inreg %in) { +; GFX7-LABEL: flat_atomic_sub_i64_noret_offset_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s34, s4, 32 +; GFX7-NEXT: s_addc_u32 s35, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v2, s34 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v3, s35 +; GFX7-NEXT: flat_atomic_sub_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_sub_i64_noret_offset_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_add_u32 s34, s4, 32 +; GFX8-NEXT: s_addc_u32 s35, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s34 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v3, s35 +; GFX8-NEXT: flat_atomic_sub_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_sub_i64_noret_offset_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: 
v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: flat_atomic_sub_x2 v[2:3], v[0:1] offset:32 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw sub ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define amdgpu_gfx i64 @flat_atomic_sub_i64_ret_scalar(ptr inreg %ptr, i64 inreg %in) { +; GFX7-LABEL: flat_atomic_sub_i64_ret_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: v_mov_b32_e32 v3, s5 +; GFX7-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_sub_i64_ret_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_mov_b32_e32 v3, s5 +; GFX8-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_sub_i64_ret_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %result = atomicrmw sub ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret i64 %result +} + +define amdgpu_gfx i64 @flat_atomic_sub_i64_ret_offset_scalar(ptr inreg %out, i64 inreg %in) { +; GFX7-LABEL: flat_atomic_sub_i64_ret_offset_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s34, s4, 32 +; GFX7-NEXT: s_addc_u32 s35, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v2, s34 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v3, s35 +; GFX7-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_sub_i64_ret_offset_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_add_u32 s34, s4, 32 +; GFX8-NEXT: s_addc_u32 s35, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s34 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v3, s35 +; GFX8-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_sub_i64_ret_offset_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %result = atomicrmw sub ptr %gep, i64 %in seq_cst, 
!noalias.addrspace !1 + ret i64 %result +} + +define void @flat_atomic_sub_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) { +; GFX7-LABEL: flat_atomic_sub_i64_noret_offset__amdgpu_no_remote_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_sub_i64_noret_offset__amdgpu_no_remote_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_sub_i64_noret_offset__amdgpu_no_remote_memory: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] offset:32 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw sub ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 + ret void +} + +define i64 @flat_atomic_sub_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) { +; GFX7-LABEL: flat_atomic_sub_i64_ret_offset__amdgpu_no_remote_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_sub_i64_ret_offset__amdgpu_no_remote_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_sub_i64_ret_offset__amdgpu_no_remote_memory: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %result = atomicrmw sub ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 + ret i64 %result +} + +; --------------------------------------------------------------------- +; atomicrmw and +; --------------------------------------------------------------------- + +define void @flat_atomic_and_i64_noret(ptr %ptr, i64 %in) { +; GFX7-LABEL: flat_atomic_and_i64_noret: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_and_i64_noret: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: 
buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_and_i64_noret: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %tmp0 = atomicrmw and ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define void @flat_atomic_and_i64_noret_offset(ptr %out, i64 %in) { +; GFX7-LABEL: flat_atomic_and_i64_noret_offset: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_and_i64_noret_offset: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_and_i64_noret_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] offset:32 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw and ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define i64 @flat_atomic_and_i64_ret(ptr %ptr, i64 %in) { +; GFX7-LABEL: flat_atomic_and_i64_ret: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_and_i64_ret: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_and_i64_ret: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %result = atomicrmw and ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret i64 %result +} + +define i64 @flat_atomic_and_i64_ret_offset(ptr %out, i64 %in) { +; GFX7-LABEL: flat_atomic_and_i64_ret_offset: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_and_i64_ret_offset: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: 
flat_atomic_and_i64_ret_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %result = atomicrmw and ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret i64 %result +} + +define amdgpu_gfx void @flat_atomic_and_i64_noret_scalar(ptr inreg %ptr, i64 inreg %in) { +; GFX7-LABEL: flat_atomic_and_i64_noret_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: v_mov_b32_e32 v3, s5 +; GFX7-NEXT: flat_atomic_and_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_and_i64_noret_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_mov_b32_e32 v3, s5 +; GFX8-NEXT: flat_atomic_and_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_and_i64_noret_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: flat_atomic_and_x2 v[2:3], v[0:1] +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %tmp0 = atomicrmw and ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define amdgpu_gfx void @flat_atomic_and_i64_noret_offset_scalar(ptr inreg %out, i64 inreg %in) { +; GFX7-LABEL: flat_atomic_and_i64_noret_offset_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s34, s4, 32 +; GFX7-NEXT: s_addc_u32 s35, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v2, s34 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v3, s35 +; GFX7-NEXT: flat_atomic_and_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_and_i64_noret_offset_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_add_u32 s34, s4, 32 +; GFX8-NEXT: s_addc_u32 s35, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s34 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v3, s35 +; GFX8-NEXT: flat_atomic_and_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_and_i64_noret_offset_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: flat_atomic_and_x2 v[2:3], v[0:1] offset:32 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw and ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} 
+ +define amdgpu_gfx i64 @flat_atomic_and_i64_ret_scalar(ptr inreg %ptr, i64 inreg %in) { +; GFX7-LABEL: flat_atomic_and_i64_ret_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: v_mov_b32_e32 v3, s5 +; GFX7-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_and_i64_ret_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_mov_b32_e32 v3, s5 +; GFX8-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_and_i64_ret_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %result = atomicrmw and ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret i64 %result +} + +define amdgpu_gfx i64 @flat_atomic_and_i64_ret_offset_scalar(ptr inreg %out, i64 inreg %in) { +; GFX7-LABEL: flat_atomic_and_i64_ret_offset_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s34, s4, 32 +; GFX7-NEXT: s_addc_u32 s35, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v2, s34 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v3, s35 +; GFX7-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_and_i64_ret_offset_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_add_u32 s34, s4, 32 +; GFX8-NEXT: s_addc_u32 s35, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s34 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v3, s35 +; GFX8-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_and_i64_ret_offset_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %result = atomicrmw and ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret i64 %result +} + +define void @flat_atomic_and_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) { +; GFX7-LABEL: flat_atomic_and_i64_noret_offset__amdgpu_no_remote_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: 
flat_atomic_and_x2 v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_and_i64_noret_offset__amdgpu_no_remote_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_and_i64_noret_offset__amdgpu_no_remote_memory: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] offset:32 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw and ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 + ret void +} + +define i64 @flat_atomic_and_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) { +; GFX7-LABEL: flat_atomic_and_i64_ret_offset__amdgpu_no_remote_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_and_i64_ret_offset__amdgpu_no_remote_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_and_i64_ret_offset__amdgpu_no_remote_memory: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %result = atomicrmw and ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 + ret i64 %result +} + +; --------------------------------------------------------------------- +; atomicrmw nand +; --------------------------------------------------------------------- + +define void @flat_atomic_nand_i64_noret(ptr %ptr, i64 %in) { +; GFX7-LABEL: flat_atomic_nand_i64_noret: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 4, v0 +; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v6, v[0:1] +; GFX7-NEXT: flat_load_dword v7, v[4:5] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v4, v7, v3 +; GFX7-NEXT: v_and_b32_e32 v8, v6, v2 +; GFX7-NEXT: v_not_b32_e32 v5, v4 +; GFX7-NEXT: v_not_b32_e32 v4, v8 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; 
GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB50_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_nand_i64_noret: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v6, v[0:1] +; GFX8-NEXT: flat_load_dword v7, v[4:5] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_and_b32_e32 v4, v7, v3 +; GFX8-NEXT: v_and_b32_e32 v8, v6, v2 +; GFX8-NEXT: v_not_b32_e32 v5, v4 +; GFX8-NEXT: v_not_b32_e32 v4, v8 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX8-NEXT: v_mov_b32_e32 v7, v5 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB50_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_nand_i64_noret: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v4, v7, v3 +; GFX9-NEXT: v_and_b32_e32 v8, v6, v2 +; GFX9-NEXT: v_not_b32_e32 v5, v4 +; GFX9-NEXT: v_not_b32_e32 v4, v8 +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB50_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] + %tmp0 = atomicrmw nand ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define void @flat_atomic_nand_i64_noret_offset(ptr %out, i64 %in) { +; GFX7-LABEL: flat_atomic_nand_i64_noret_offset: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v8, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v7, v[0:1] +; GFX7-NEXT: flat_load_dword v6, v[8:9] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v0, v7, v3 +; GFX7-NEXT: v_and_b32_e32 v1, v6, v2 +; GFX7-NEXT: v_not_b32_e32 v5, v0 +; GFX7-NEXT: v_not_b32_e32 v4, v1 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX7-NEXT: v_mov_b32_e32 v7, v1 +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v6, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, 
s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB51_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_nand_i64_noret_offset: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v7, v[0:1] +; GFX8-NEXT: flat_load_dword v6, v[8:9] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_and_b32_e32 v0, v7, v3 +; GFX8-NEXT: v_and_b32_e32 v1, v6, v2 +; GFX8-NEXT: v_not_b32_e32 v5, v0 +; GFX8-NEXT: v_not_b32_e32 v4, v1 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX8-NEXT: v_mov_b32_e32 v7, v1 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB51_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_nand_i64_noret_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v4, v7, v3 +; GFX9-NEXT: v_and_b32_e32 v8, v6, v2 +; GFX9-NEXT: v_not_b32_e32 v5, v4 +; GFX9-NEXT: v_not_b32_e32 v4, v8 +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB51_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw nand ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define i64 @flat_atomic_nand_i64_ret(ptr %ptr, i64 %in) { +; GFX7-LABEL: flat_atomic_nand_i64_ret: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v5, vcc, 4, v0 +; GFX7-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v4, v[0:1] +; GFX7-NEXT: flat_load_dword v5, v[5:6] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: v_and_b32_e32 v4, v7, v3 +; GFX7-NEXT: v_and_b32_e32 v8, v6, v2 +; GFX7-NEXT: v_not_b32_e32 v5, v4 +; GFX7-NEXT: v_not_b32_e32 v4, v8 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; 
GFX7-NEXT: s_cbranch_execnz .LBB52_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v0, v4 +; GFX7-NEXT: v_mov_b32_e32 v1, v5 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_nand_i64_ret: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v0 +; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v5, v[5:6] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v7, v5 +; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: v_and_b32_e32 v4, v7, v3 +; GFX8-NEXT: v_and_b32_e32 v8, v6, v2 +; GFX8-NEXT: v_not_b32_e32 v5, v4 +; GFX8-NEXT: v_not_b32_e32 v4, v8 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB52_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_nand_i64_ret: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_and_b32_e32 v4, v7, v3 +; GFX9-NEXT: v_and_b32_e32 v8, v6, v2 +; GFX9-NEXT: v_not_b32_e32 v5, v4 +; GFX9-NEXT: v_not_b32_e32 v4, v8 +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB52_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v5 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %result = atomicrmw nand ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret i64 %result +} + +define i64 @flat_atomic_nand_i64_ret_offset(ptr %out, i64 %in) { +; GFX7-LABEL: flat_atomic_nand_i64_ret_offset: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v1, v[0:1] +; GFX7-NEXT: flat_load_dword v0, v[4:5] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v9, v1 +; GFX7-NEXT: v_mov_b32_e32 v8, v0 +; GFX7-NEXT: v_and_b32_e32 v0, v9, v3 +; GFX7-NEXT: v_and_b32_e32 v1, v8, v2 +; GFX7-NEXT: v_not_b32_e32 v7, v0 +; GFX7-NEXT: v_not_b32_e32 v6, v1 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; 
GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB53_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_nand_i64_ret_offset: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v1, v[0:1] +; GFX8-NEXT: flat_load_dword v0, v[4:5] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v9, v1 +; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_and_b32_e32 v0, v9, v3 +; GFX8-NEXT: v_and_b32_e32 v1, v8, v2 +; GFX8-NEXT: v_not_b32_e32 v7, v0 +; GFX8-NEXT: v_not_b32_e32 v6, v1 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB53_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_nand_i64_ret_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_and_b32_e32 v4, v7, v3 +; GFX9-NEXT: v_and_b32_e32 v8, v6, v2 +; GFX9-NEXT: v_not_b32_e32 v5, v4 +; GFX9-NEXT: v_not_b32_e32 v4, v8 +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB53_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v5 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %result = atomicrmw nand ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret i64 %result +} + +define amdgpu_gfx void @flat_atomic_nand_i64_noret_scalar(ptr inreg %ptr, i64 inreg %in) { +; GFX7-LABEL: flat_atomic_nand_i64_noret_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: s_add_u32 s34, s4, 4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_addc_u32 s35, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s34 +; GFX7-NEXT: v_mov_b32_e32 v4, s35 +; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: flat_load_dword v3, v[3:4] +; GFX7-NEXT: v_mov_b32_e32 v4, s4 +; GFX7-NEXT: s_mov_b64 s[34:35], 0 +; GFX7-NEXT: v_mov_b32_e32 v5, s5 +; GFX7-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v0, s7, v3 +; GFX7-NEXT: 
v_and_b32_e32 v6, s6, v2 +; GFX7-NEXT: v_not_b32_e32 v1, v0 +; GFX7-NEXT: v_not_b32_e32 v0, v6 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_cbranch_execnz .LBB54_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_nand_i64_noret_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_add_u32 s34, s4, 4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: s_addc_u32 s35, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s34 +; GFX8-NEXT: v_mov_b32_e32 v4, s35 +; GFX8-NEXT: flat_load_dword v2, v[0:1] +; GFX8-NEXT: flat_load_dword v3, v[3:4] +; GFX8-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NEXT: s_mov_b64 s[34:35], 0 +; GFX8-NEXT: v_mov_b32_e32 v5, s5 +; GFX8-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_and_b32_e32 v0, s7, v3 +; GFX8-NEXT: v_and_b32_e32 v6, s6, v2 +; GFX8-NEXT: v_not_b32_e32 v1, v0 +; GFX8-NEXT: v_not_b32_e32 v0, v6 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_cbranch_execnz .LBB54_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_nand_i64_noret_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v0, s7, v3 +; GFX9-NEXT: v_and_b32_e32 v6, s6, v2 +; GFX9-NEXT: v_not_b32_e32 v1, v0 +; GFX9-NEXT: v_not_b32_e32 v0, v6 +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB54_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_setpc_b64 s[30:31] + %tmp0 = atomicrmw nand ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define amdgpu_gfx void @flat_atomic_nand_i64_noret_offset_scalar(ptr inreg %out, i64 inreg %in) { +; GFX7-LABEL: flat_atomic_nand_i64_noret_offset_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s34, s4, 32 +; GFX7-NEXT: s_addc_u32 s35, s5, 0 +; GFX7-NEXT: s_add_u32 s36, s4, 36 +; GFX7-NEXT: s_addc_u32 s37, s5, 0 +; 
GFX7-NEXT: v_mov_b32_e32 v0, s36 +; GFX7-NEXT: v_mov_b32_e32 v1, s37 +; GFX7-NEXT: v_mov_b32_e32 v4, s34 +; GFX7-NEXT: v_mov_b32_e32 v5, s35 +; GFX7-NEXT: flat_load_dword v3, v[0:1] +; GFX7-NEXT: flat_load_dword v2, v[4:5] +; GFX7-NEXT: s_mov_b64 s[34:35], 0 +; GFX7-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v0, s7, v3 +; GFX7-NEXT: v_and_b32_e32 v6, s6, v2 +; GFX7-NEXT: v_not_b32_e32 v1, v0 +; GFX7-NEXT: v_not_b32_e32 v0, v6 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_cbranch_execnz .LBB55_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_nand_i64_noret_offset_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_add_u32 s34, s4, 32 +; GFX8-NEXT: s_addc_u32 s35, s5, 0 +; GFX8-NEXT: s_add_u32 s36, s4, 36 +; GFX8-NEXT: s_addc_u32 s37, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s36 +; GFX8-NEXT: v_mov_b32_e32 v1, s37 +; GFX8-NEXT: v_mov_b32_e32 v4, s34 +; GFX8-NEXT: v_mov_b32_e32 v5, s35 +; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v2, v[4:5] +; GFX8-NEXT: s_mov_b64 s[34:35], 0 +; GFX8-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_and_b32_e32 v0, s7, v3 +; GFX8-NEXT: v_and_b32_e32 v6, s6, v2 +; GFX8-NEXT: v_not_b32_e32 v1, v0 +; GFX8-NEXT: v_not_b32_e32 v0, v6 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_cbranch_execnz .LBB55_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_nand_i64_noret_offset_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32 +; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v0, s7, v3 +; GFX9-NEXT: v_and_b32_e32 v6, s6, v2 +; GFX9-NEXT: v_not_b32_e32 v1, v0 +; GFX9-NEXT: v_not_b32_e32 v0, v6 +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB55_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX9-NEXT: 
s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw nand ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_scalar(ptr inreg %ptr, i64 inreg %in) { +; GFX7-LABEL: flat_atomic_nand_i64_ret_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: s_add_u32 s34, s4, 4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_addc_u32 s35, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v2, s34 +; GFX7-NEXT: v_mov_b32_e32 v3, s35 +; GFX7-NEXT: flat_load_dword v0, v[0:1] +; GFX7-NEXT: flat_load_dword v1, v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_mov_b64 s[34:35], 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s5 +; GFX7-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v7, v1 +; GFX7-NEXT: v_mov_b32_e32 v6, v0 +; GFX7-NEXT: v_and_b32_e32 v0, s7, v7 +; GFX7-NEXT: v_and_b32_e32 v1, s6, v6 +; GFX7-NEXT: v_not_b32_e32 v5, v0 +; GFX7-NEXT: v_not_b32_e32 v4, v1 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_cbranch_execnz .LBB56_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_nand_i64_ret_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_add_u32 s34, s4, 4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: s_addc_u32 s35, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s34 +; GFX8-NEXT: v_mov_b32_e32 v3, s35 +; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: flat_load_dword v1, v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: s_mov_b64 s[34:35], 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s5 +; GFX8-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v7, v1 +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_and_b32_e32 v0, s7, v7 +; GFX8-NEXT: v_and_b32_e32 v1, s6, v6 +; GFX8-NEXT: v_not_b32_e32 v5, v0 +; GFX8-NEXT: v_not_b32_e32 v4, v1 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_cbranch_execnz .LBB56_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_nand_i64_ret_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v1 +; GFX9-NEXT: v_mov_b32_e32 v6, v0 +; GFX9-NEXT: v_and_b32_e32 v0, s7, v7 +; GFX9-NEXT: v_and_b32_e32 v1, s6, v6 +; GFX9-NEXT: v_not_b32_e32 v5, v0 
+; GFX9-NEXT: v_not_b32_e32 v4, v1 +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB56_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_setpc_b64 s[30:31] + %result = atomicrmw nand ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret i64 %result +} + +define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_offset_scalar(ptr inreg %out, i64 inreg %in) { +; GFX7-LABEL: flat_atomic_nand_i64_ret_offset_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s34, s4, 32 +; GFX7-NEXT: s_addc_u32 s35, s5, 0 +; GFX7-NEXT: s_add_u32 s36, s4, 36 +; GFX7-NEXT: s_addc_u32 s37, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s36 +; GFX7-NEXT: v_mov_b32_e32 v1, s37 +; GFX7-NEXT: v_mov_b32_e32 v2, s34 +; GFX7-NEXT: v_mov_b32_e32 v3, s35 +; GFX7-NEXT: flat_load_dword v1, v[0:1] +; GFX7-NEXT: flat_load_dword v0, v[2:3] +; GFX7-NEXT: s_mov_b64 s[34:35], 0 +; GFX7-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v7, v1 +; GFX7-NEXT: v_mov_b32_e32 v6, v0 +; GFX7-NEXT: v_and_b32_e32 v0, s7, v7 +; GFX7-NEXT: v_and_b32_e32 v1, s6, v6 +; GFX7-NEXT: v_not_b32_e32 v5, v0 +; GFX7-NEXT: v_not_b32_e32 v4, v1 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_cbranch_execnz .LBB57_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_nand_i64_ret_offset_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_add_u32 s34, s4, 32 +; GFX8-NEXT: s_addc_u32 s35, s5, 0 +; GFX8-NEXT: s_add_u32 s36, s4, 36 +; GFX8-NEXT: s_addc_u32 s37, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s36 +; GFX8-NEXT: v_mov_b32_e32 v1, s37 +; GFX8-NEXT: v_mov_b32_e32 v2, s34 +; GFX8-NEXT: v_mov_b32_e32 v3, s35 +; GFX8-NEXT: flat_load_dword v1, v[0:1] +; GFX8-NEXT: flat_load_dword v0, v[2:3] +; GFX8-NEXT: s_mov_b64 s[34:35], 0 +; GFX8-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v7, v1 +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_and_b32_e32 v0, s7, v7 +; GFX8-NEXT: v_and_b32_e32 v1, s6, v6 +; GFX8-NEXT: v_not_b32_e32 v5, v0 +; GFX8-NEXT: v_not_b32_e32 v4, v1 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_cbranch_execnz .LBB57_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_nand_i64_ret_offset_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; 
GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] offset:32 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v1 +; GFX9-NEXT: v_mov_b32_e32 v6, v0 +; GFX9-NEXT: v_and_b32_e32 v0, s7, v7 +; GFX9-NEXT: v_and_b32_e32 v1, s6, v6 +; GFX9-NEXT: v_not_b32_e32 v5, v0 +; GFX9-NEXT: v_not_b32_e32 v4, v1 +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB57_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %result = atomicrmw nand ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret i64 %result +} + +define void @flat_atomic_nand_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) { +; GFX7-LABEL: flat_atomic_nand_i64_noret_offset__amdgpu_no_remote_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v8, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v7, v[0:1] +; GFX7-NEXT: flat_load_dword v6, v[8:9] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB58_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v0, v7, v3 +; GFX7-NEXT: v_and_b32_e32 v1, v6, v2 +; GFX7-NEXT: v_not_b32_e32 v5, v0 +; GFX7-NEXT: v_not_b32_e32 v4, v1 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX7-NEXT: v_mov_b32_e32 v7, v1 +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v6, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB58_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_nand_i64_noret_offset__amdgpu_no_remote_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v7, v[0:1] +; GFX8-NEXT: flat_load_dword v6, v[8:9] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB58_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_and_b32_e32 v0, v7, v3 +; GFX8-NEXT: v_and_b32_e32 v1, v6, v2 +; GFX8-NEXT: v_not_b32_e32 v5, v0 +; GFX8-NEXT: v_not_b32_e32 v4, v1 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX8-NEXT: v_mov_b32_e32 v7, v1 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: 
s_cbranch_execnz .LBB58_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_nand_i64_noret_offset__amdgpu_no_remote_memory: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB58_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v4, v7, v3 +; GFX9-NEXT: v_and_b32_e32 v8, v6, v2 +; GFX9-NEXT: v_not_b32_e32 v5, v4 +; GFX9-NEXT: v_not_b32_e32 v4, v8 +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB58_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw nand ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 + ret void +} + +define i64 @flat_atomic_nand_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) { +; GFX7-LABEL: flat_atomic_nand_i64_ret_offset__amdgpu_no_remote_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v1, v[0:1] +; GFX7-NEXT: flat_load_dword v0, v[4:5] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB59_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v9, v1 +; GFX7-NEXT: v_mov_b32_e32 v8, v0 +; GFX7-NEXT: v_and_b32_e32 v0, v9, v3 +; GFX7-NEXT: v_and_b32_e32 v1, v8, v2 +; GFX7-NEXT: v_not_b32_e32 v7, v0 +; GFX7-NEXT: v_not_b32_e32 v6, v1 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB59_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_nand_i64_ret_offset__amdgpu_no_remote_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v1, v[0:1] +; GFX8-NEXT: flat_load_dword v0, v[4:5] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB59_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v9, v1 +; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_and_b32_e32 v0, v9, v3 +; GFX8-NEXT: v_and_b32_e32 v1, v8, v2 +; GFX8-NEXT: v_not_b32_e32 v7, v0 +; GFX8-NEXT: v_not_b32_e32 v6, v1 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc +; 
GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB59_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_nand_i64_ret_offset__amdgpu_no_remote_memory: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB59_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_and_b32_e32 v4, v7, v3 +; GFX9-NEXT: v_and_b32_e32 v8, v6, v2 +; GFX9-NEXT: v_not_b32_e32 v5, v4 +; GFX9-NEXT: v_not_b32_e32 v4, v8 +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB59_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v5 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %result = atomicrmw nand ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 + ret i64 %result +} + +; --------------------------------------------------------------------- +; atomicrmw or +; --------------------------------------------------------------------- + +define void @flat_atomic_or_i64_noret(ptr %ptr, i64 %in) { +; GFX7-LABEL: flat_atomic_or_i64_noret: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_or_i64_noret: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_or_i64_noret: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %tmp0 = atomicrmw or ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define void @flat_atomic_or_i64_noret_offset(ptr %out, i64 %in) { +; GFX7-LABEL: flat_atomic_or_i64_noret_offset: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_or_i64_noret_offset: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: 
buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_or_i64_noret_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] offset:32 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw or ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define i64 @flat_atomic_or_i64_ret(ptr %ptr, i64 %in) { +; GFX7-LABEL: flat_atomic_or_i64_ret: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_or_i64_ret: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_or_i64_ret: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %result = atomicrmw or ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret i64 %result +} + +define i64 @flat_atomic_or_i64_ret_offset(ptr %out, i64 %in) { +; GFX7-LABEL: flat_atomic_or_i64_ret_offset: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_or_i64_ret_offset: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_or_i64_ret_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %result = atomicrmw or ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret i64 %result +} + +define amdgpu_gfx void @flat_atomic_or_i64_noret_scalar(ptr inreg %ptr, i64 inreg %in) { +; GFX7-LABEL: flat_atomic_or_i64_noret_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: v_mov_b32_e32 v3, s5 +; GFX7-NEXT: flat_atomic_or_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_or_i64_noret_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_mov_b32_e32 v3, s5 +; GFX8-NEXT: 
flat_atomic_or_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_or_i64_noret_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: flat_atomic_or_x2 v[2:3], v[0:1] +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %tmp0 = atomicrmw or ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define amdgpu_gfx void @flat_atomic_or_i64_noret_offset_scalar(ptr inreg %out, i64 inreg %in) { +; GFX7-LABEL: flat_atomic_or_i64_noret_offset_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s34, s4, 32 +; GFX7-NEXT: s_addc_u32 s35, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v2, s34 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v3, s35 +; GFX7-NEXT: flat_atomic_or_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_or_i64_noret_offset_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_add_u32 s34, s4, 32 +; GFX8-NEXT: s_addc_u32 s35, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s34 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v3, s35 +; GFX8-NEXT: flat_atomic_or_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_or_i64_noret_offset_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: flat_atomic_or_x2 v[2:3], v[0:1] offset:32 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw or ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define amdgpu_gfx i64 @flat_atomic_or_i64_ret_scalar(ptr inreg %ptr, i64 inreg %in) { +; GFX7-LABEL: flat_atomic_or_i64_ret_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: v_mov_b32_e32 v3, s5 +; GFX7-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_or_i64_ret_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_mov_b32_e32 v3, s5 +; GFX8-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_or_i64_ret_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: 
flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %result = atomicrmw or ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret i64 %result +} + +define amdgpu_gfx i64 @flat_atomic_or_i64_ret_offset_scalar(ptr inreg %out, i64 inreg %in) { +; GFX7-LABEL: flat_atomic_or_i64_ret_offset_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s34, s4, 32 +; GFX7-NEXT: s_addc_u32 s35, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v2, s34 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v3, s35 +; GFX7-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_or_i64_ret_offset_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_add_u32 s34, s4, 32 +; GFX8-NEXT: s_addc_u32 s35, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s34 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v3, s35 +; GFX8-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_or_i64_ret_offset_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %result = atomicrmw or ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret i64 %result +} + +define void @flat_atomic_or_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) { +; GFX7-LABEL: flat_atomic_or_i64_noret_offset__amdgpu_no_remote_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_or_i64_noret_offset__amdgpu_no_remote_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_or_i64_noret_offset__amdgpu_no_remote_memory: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] offset:32 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw or ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 + ret void +} + +define i64 @flat_atomic_or_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) { +; GFX7-LABEL: flat_atomic_or_i64_ret_offset__amdgpu_no_remote_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_or_i64_ret_offset__amdgpu_no_remote_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_or_i64_ret_offset__amdgpu_no_remote_memory: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %result = atomicrmw or ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 + ret i64 %result +} + +; --------------------------------------------------------------------- +; atomicrmw xor +; --------------------------------------------------------------------- + +define void @flat_atomic_xor_i64_noret(ptr %ptr, i64 %in) { +; GFX7-LABEL: flat_atomic_xor_i64_noret: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_xor_i64_noret: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_xor_i64_noret: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %tmp0 = atomicrmw xor ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define void @flat_atomic_xor_i64_noret_offset(ptr %out, i64 %in) { +; GFX7-LABEL: flat_atomic_xor_i64_noret_offset: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_xor_i64_noret_offset: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_xor_i64_noret_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] offset:32 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw xor ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 
+ ret void +} + +define i64 @flat_atomic_xor_i64_ret(ptr %ptr, i64 %in) { +; GFX7-LABEL: flat_atomic_xor_i64_ret: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_xor_i64_ret: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_xor_i64_ret: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %result = atomicrmw xor ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret i64 %result +} + +define i64 @flat_atomic_xor_i64_ret_offset(ptr %out, i64 %in) { +; GFX7-LABEL: flat_atomic_xor_i64_ret_offset: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_xor_i64_ret_offset: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_xor_i64_ret_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %result = atomicrmw xor ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret i64 %result +} + +define amdgpu_gfx void @flat_atomic_xor_i64_noret_scalar(ptr inreg %ptr, i64 inreg %in) { +; GFX7-LABEL: flat_atomic_xor_i64_noret_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: v_mov_b32_e32 v3, s5 +; GFX7-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_xor_i64_noret_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_mov_b32_e32 v3, s5 +; GFX8-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_xor_i64_noret_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: flat_atomic_xor_x2 v[2:3], 
v[0:1] +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %tmp0 = atomicrmw xor ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define amdgpu_gfx void @flat_atomic_xor_i64_noret_offset_scalar(ptr inreg %out, i64 inreg %in) { +; GFX7-LABEL: flat_atomic_xor_i64_noret_offset_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s34, s4, 32 +; GFX7-NEXT: s_addc_u32 s35, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v2, s34 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v3, s35 +; GFX7-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_xor_i64_noret_offset_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_add_u32 s34, s4, 32 +; GFX8-NEXT: s_addc_u32 s35, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s34 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v3, s35 +; GFX8-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_xor_i64_noret_offset_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1] offset:32 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw xor ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_scalar(ptr inreg %ptr, i64 inreg %in) { +; GFX7-LABEL: flat_atomic_xor_i64_ret_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: v_mov_b32_e32 v3, s5 +; GFX7-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_xor_i64_ret_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_mov_b32_e32 v3, s5 +; GFX8-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_xor_i64_ret_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %result = atomicrmw xor ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret i64 %result +} + +define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_offset_scalar(ptr inreg %out, i64 inreg %in) { +; GFX7-LABEL: flat_atomic_xor_i64_ret_offset_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s34, s4, 32 +; GFX7-NEXT: s_addc_u32 s35, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v2, s34 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v3, s35 +; GFX7-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_xor_i64_ret_offset_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_add_u32 s34, s4, 32 +; GFX8-NEXT: s_addc_u32 s35, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s34 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v3, s35 +; GFX8-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_xor_i64_ret_offset_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %result = atomicrmw xor ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret i64 %result +} + +define void @flat_atomic_xor_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) { +; GFX7-LABEL: flat_atomic_xor_i64_noret_offset__amdgpu_no_remote_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_xor_i64_noret_offset__amdgpu_no_remote_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_xor_i64_noret_offset__amdgpu_no_remote_memory: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] offset:32 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw xor ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 + ret void +} + +define i64 @flat_atomic_xor_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) { +; GFX7-LABEL: flat_atomic_xor_i64_ret_offset__amdgpu_no_remote_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_xor_i64_ret_offset__amdgpu_no_remote_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_xor_i64_ret_offset__amdgpu_no_remote_memory: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %result = atomicrmw xor ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 + ret i64 %result +} + +; --------------------------------------------------------------------- +; atomicrmw max +; --------------------------------------------------------------------- + +define void @flat_atomic_max_i64_noret(ptr %ptr, i64 %in) { +; GFX7-LABEL: flat_atomic_max_i64_noret: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 4, v0 +; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v6, v[0:1] +; GFX7-NEXT: flat_load_dword v7, v[4:5] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB80_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB80_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_max_i64_noret: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v6, v[0:1] +; GFX8-NEXT: flat_load_dword v7, v[4:5] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB80_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX8-NEXT: v_mov_b32_e32 v7, v5 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB80_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_max_i64_noret: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB80_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 
+; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB80_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] + %tmp0 = atomicrmw max ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define void @flat_atomic_max_i64_noret_offset(ptr %out, i64 %in) { +; GFX7-LABEL: flat_atomic_max_i64_noret_offset: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v8, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v7, v[0:1] +; GFX7-NEXT: flat_load_dword v6, v[8:9] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB81_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX7-NEXT: v_mov_b32_e32 v7, v1 +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v6, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB81_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_max_i64_noret_offset: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v7, v[0:1] +; GFX8-NEXT: flat_load_dword v6, v[8:9] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB81_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX8-NEXT: v_mov_b32_e32 v7, v1 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB81_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_max_i64_noret_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB81_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; 
GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB81_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw max ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define i64 @flat_atomic_max_i64_ret(ptr %ptr, i64 %in) { +; GFX7-LABEL: flat_atomic_max_i64_ret: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v5, vcc, 4, v0 +; GFX7-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v4, v[0:1] +; GFX7-NEXT: flat_load_dword v5, v[5:6] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB82_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB82_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v0, v4 +; GFX7-NEXT: v_mov_b32_e32 v1, v5 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_max_i64_ret: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v0 +; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v5, v[5:6] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB82_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v7, v5 +; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB82_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_max_i64_ret: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB82_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; 
GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB82_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v5 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %result = atomicrmw max ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret i64 %result +} + +define i64 @flat_atomic_max_i64_ret_offset(ptr %out, i64 %in) { +; GFX7-LABEL: flat_atomic_max_i64_ret_offset: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v1, v[0:1] +; GFX7-NEXT: flat_load_dword v0, v[4:5] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB83_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v9, v1 +; GFX7-NEXT: v_mov_b32_e32 v8, v0 +; GFX7-NEXT: v_cmp_gt_i64_e32 vcc, v[8:9], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB83_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_max_i64_ret_offset: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v1, v[0:1] +; GFX8-NEXT: flat_load_dword v0, v[4:5] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB83_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v9, v1 +; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_cmp_gt_i64_e32 vcc, v[8:9], v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB83_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_max_i64_ret_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32 +; GFX9-NEXT: 
s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB83_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB83_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v5 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %result = atomicrmw max ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret i64 %result +} + +define amdgpu_gfx void @flat_atomic_max_i64_noret_scalar(ptr inreg %ptr, i64 inreg %in) { +; GFX7-LABEL: flat_atomic_max_i64_noret_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: s_add_u32 s34, s4, 4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_addc_u32 s35, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s34 +; GFX7-NEXT: v_mov_b32_e32 v4, s35 +; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: flat_load_dword v3, v[3:4] +; GFX7-NEXT: v_mov_b32_e32 v4, s4 +; GFX7-NEXT: s_mov_b64 s[34:35], 0 +; GFX7-NEXT: v_mov_b32_e32 v6, s7 +; GFX7-NEXT: v_mov_b32_e32 v7, s6 +; GFX7-NEXT: v_mov_b32_e32 v5, s5 +; GFX7-NEXT: .LBB84_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_cbranch_execnz .LBB84_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_max_i64_noret_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_add_u32 s34, s4, 4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: s_addc_u32 s35, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s34 +; GFX8-NEXT: v_mov_b32_e32 v4, s35 +; GFX8-NEXT: flat_load_dword v2, v[0:1] +; GFX8-NEXT: flat_load_dword v3, v[3:4] +; GFX8-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NEXT: s_mov_b64 s[34:35], 0 +; GFX8-NEXT: v_mov_b32_e32 v6, s7 +; GFX8-NEXT: v_mov_b32_e32 v7, s6 +; GFX8-NEXT: v_mov_b32_e32 v5, s5 +; GFX8-NEXT: .LBB84_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: 
v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_cbranch_execnz .LBB84_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_max_i64_noret_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v6, s7 +; GFX9-NEXT: v_mov_b32_e32 v7, s6 +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: .LBB84_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB84_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_setpc_b64 s[30:31] + %tmp0 = atomicrmw max ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define amdgpu_gfx void @flat_atomic_max_i64_noret_offset_scalar(ptr inreg %out, i64 inreg %in) { +; GFX7-LABEL: flat_atomic_max_i64_noret_offset_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s34, s4, 32 +; GFX7-NEXT: s_addc_u32 s35, s5, 0 +; GFX7-NEXT: s_add_u32 s36, s4, 36 +; GFX7-NEXT: s_addc_u32 s37, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s36 +; GFX7-NEXT: v_mov_b32_e32 v1, s37 +; GFX7-NEXT: v_mov_b32_e32 v4, s34 +; GFX7-NEXT: v_mov_b32_e32 v5, s35 +; GFX7-NEXT: flat_load_dword v3, v[0:1] +; GFX7-NEXT: flat_load_dword v2, v[4:5] +; GFX7-NEXT: s_mov_b64 s[34:35], 0 +; GFX7-NEXT: v_mov_b32_e32 v6, s7 +; GFX7-NEXT: v_mov_b32_e32 v7, s6 +; GFX7-NEXT: .LBB85_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_cbranch_execnz .LBB85_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_max_i64_noret_offset_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_add_u32 s34, s4, 32 +; GFX8-NEXT: s_addc_u32 s35, s5, 0 +; GFX8-NEXT: s_add_u32 s36, s4, 36 +; GFX8-NEXT: s_addc_u32 s37, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s36 +; GFX8-NEXT: v_mov_b32_e32 v1, s37 +; GFX8-NEXT: v_mov_b32_e32 v4, s34 +; 
GFX8-NEXT: v_mov_b32_e32 v5, s35 +; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v2, v[4:5] +; GFX8-NEXT: s_mov_b64 s[34:35], 0 +; GFX8-NEXT: v_mov_b32_e32 v6, s7 +; GFX8-NEXT: v_mov_b32_e32 v7, s6 +; GFX8-NEXT: .LBB85_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_cbranch_execnz .LBB85_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_max_i64_noret_offset_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32 +; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v6, s7 +; GFX9-NEXT: v_mov_b32_e32 v7, s6 +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: .LBB85_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB85_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw max ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define amdgpu_gfx i64 @flat_atomic_max_i64_ret_scalar(ptr inreg %ptr, i64 inreg %in) { +; GFX7-LABEL: flat_atomic_max_i64_ret_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: s_add_u32 s34, s4, 4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_addc_u32 s35, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v2, s34 +; GFX7-NEXT: v_mov_b32_e32 v3, s35 +; GFX7-NEXT: flat_load_dword v0, v[0:1] +; GFX7-NEXT: flat_load_dword v1, v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_mov_b64 s[34:35], 0 +; GFX7-NEXT: v_mov_b32_e32 v4, s7 +; GFX7-NEXT: v_mov_b32_e32 v5, s6 +; GFX7-NEXT: v_mov_b32_e32 v3, s5 +; GFX7-NEXT: .LBB86_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v9, v1 +; GFX7-NEXT: v_mov_b32_e32 v8, v0 +; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[8:9] +; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: 
buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_cbranch_execnz .LBB86_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_max_i64_ret_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_add_u32 s34, s4, 4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: s_addc_u32 s35, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s34 +; GFX8-NEXT: v_mov_b32_e32 v3, s35 +; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: flat_load_dword v1, v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: s_mov_b64 s[34:35], 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s7 +; GFX8-NEXT: v_mov_b32_e32 v5, s6 +; GFX8-NEXT: v_mov_b32_e32 v3, s5 +; GFX8-NEXT: .LBB86_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v9, v1 +; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[8:9] +; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_cbranch_execnz .LBB86_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_max_i64_ret_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v4, s7 +; GFX9-NEXT: v_mov_b32_e32 v5, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: .LBB86_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v9, v1 +; GFX9-NEXT: v_mov_b32_e32 v8, v0 +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[8:9] +; GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB86_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_setpc_b64 s[30:31] + %result = atomicrmw max ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret i64 %result +} + +define amdgpu_gfx i64 @flat_atomic_max_i64_ret_offset_scalar(ptr inreg %out, i64 inreg %in) { +; GFX7-LABEL: flat_atomic_max_i64_ret_offset_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s34, s4, 32 +; GFX7-NEXT: s_addc_u32 s35, s5, 0 +; GFX7-NEXT: s_add_u32 s36, s4, 36 +; GFX7-NEXT: s_addc_u32 s37, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s36 +; GFX7-NEXT: v_mov_b32_e32 v1, s37 +; GFX7-NEXT: v_mov_b32_e32 v2, s34 +; GFX7-NEXT: v_mov_b32_e32 v3, s35 +; GFX7-NEXT: 
flat_load_dword v1, v[0:1] +; GFX7-NEXT: flat_load_dword v0, v[2:3] +; GFX7-NEXT: s_mov_b64 s[34:35], 0 +; GFX7-NEXT: v_mov_b32_e32 v4, s7 +; GFX7-NEXT: v_mov_b32_e32 v5, s6 +; GFX7-NEXT: .LBB87_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v9, v1 +; GFX7-NEXT: v_mov_b32_e32 v8, v0 +; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[8:9] +; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_cbranch_execnz .LBB87_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_max_i64_ret_offset_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_add_u32 s34, s4, 32 +; GFX8-NEXT: s_addc_u32 s35, s5, 0 +; GFX8-NEXT: s_add_u32 s36, s4, 36 +; GFX8-NEXT: s_addc_u32 s37, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s36 +; GFX8-NEXT: v_mov_b32_e32 v1, s37 +; GFX8-NEXT: v_mov_b32_e32 v2, s34 +; GFX8-NEXT: v_mov_b32_e32 v3, s35 +; GFX8-NEXT: flat_load_dword v1, v[0:1] +; GFX8-NEXT: flat_load_dword v0, v[2:3] +; GFX8-NEXT: s_mov_b64 s[34:35], 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s7 +; GFX8-NEXT: v_mov_b32_e32 v5, s6 +; GFX8-NEXT: .LBB87_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v9, v1 +; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[8:9] +; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_cbranch_execnz .LBB87_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_max_i64_ret_offset_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] offset:32 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v4, s7 +; GFX9-NEXT: v_mov_b32_e32 v5, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: .LBB87_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v9, v1 +; GFX9-NEXT: v_mov_b32_e32 v8, v0 +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[8:9] +; GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB87_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: 
s_or_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %result = atomicrmw max ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret i64 %result +} + +define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_max_i64_addr64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s4 +; GFX7-NEXT: s_addc_u32 s1, s1, s5 +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: v_mov_b32_e32 v6, s3 +; GFX7-NEXT: v_mov_b32_e32 v7, s2 +; GFX7-NEXT: .LBB88_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7-NEXT: s_cbranch_execnz .LBB88_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_max_i64_addr64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s4 +; GFX8-NEXT: s_addc_u32 s1, s1, s5 +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX8-NEXT: s_mov_b64 s[0:1], 0 +; GFX8-NEXT: v_mov_b32_e32 v6, s3 +; GFX8-NEXT: v_mov_b32_e32 v7, s2 +; GFX8-NEXT: .LBB88_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_cbranch_execnz .LBB88_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: atomic_max_i64_addr64_offset: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX9-NEXT: s_add_u32 s0, s4, s0 +; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: v_mov_b32_e32 v5, s1 +; GFX9-NEXT: v_mov_b32_e32 v4, s0 +; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[4:5] offset:32 +; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: v_mov_b32_e32 v6, s7 +; GFX9-NEXT: v_mov_b32_e32 v7, s6 +; 
GFX9-NEXT: .LBB88_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_cbranch_execnz .LBB88_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 + %tmp0 = atomicrmw max ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_max_i64_ret_addr64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s6 +; GFX7-NEXT: s_addc_u32 s1, s1, s7 +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: v_mov_b32_e32 v4, s5 +; GFX7-NEXT: v_mov_b32_e32 v5, s4 +; GFX7-NEXT: .LBB89_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v9, v3 +; GFX7-NEXT: v_mov_b32_e32 v8, v2 +; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[8:9] +; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7-NEXT: s_cbranch_execnz .LBB89_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_max_i64_ret_addr64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s6 +; GFX8-NEXT: s_addc_u32 s1, s1, s7 +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_mov_b64 s[0:1], 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s5 +; GFX8-NEXT: v_mov_b32_e32 v5, s4 +; GFX8-NEXT: .LBB89_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v9, v3 +; GFX8-NEXT: v_mov_b32_e32 v8, v2 +; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[8:9] +; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: 
buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_cbranch_execnz .LBB89_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: atomic_max_i64_ret_addr64_offset: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 +; GFX9-NEXT: s_add_u32 s0, s4, s0 +; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32 +; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: v_mov_b32_e32 v4, s9 +; GFX9-NEXT: v_mov_b32_e32 v5, s8 +; GFX9-NEXT: .LBB89_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v9, v3 +; GFX9-NEXT: v_mov_b32_e32 v8, v2 +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[8:9] +; GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_cbranch_execnz .LBB89_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX9-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 + %tmp0 = atomicrmw max ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + store i64 %tmp0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_max_i64_addr64: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s4 +; GFX7-NEXT: s_addc_u32 s1, s1, s5 +; GFX7-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: v_mov_b32_e32 v6, s3 +; GFX7-NEXT: v_mov_b32_e32 v7, s2 +; GFX7-NEXT: .LBB90_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7-NEXT: s_cbranch_execnz .LBB90_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_max_i64_addr64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; GFX8-NEXT: 
s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s4 +; GFX8-NEXT: s_addc_u32 s1, s1, s5 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX8-NEXT: s_mov_b64 s[0:1], 0 +; GFX8-NEXT: v_mov_b32_e32 v6, s3 +; GFX8-NEXT: v_mov_b32_e32 v7, s2 +; GFX8-NEXT: .LBB90_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_cbranch_execnz .LBB90_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: atomic_max_i64_addr64: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX9-NEXT: s_add_u32 s0, s4, s0 +; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: v_mov_b32_e32 v5, s1 +; GFX9-NEXT: v_mov_b32_e32 v4, s0 +; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: v_mov_b32_e32 v6, s7 +; GFX9-NEXT: v_mov_b32_e32 v7, s6 +; GFX9-NEXT: .LBB90_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_cbranch_execnz .LBB90_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %tmp0 = atomicrmw max ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_max_i64_ret_addr64: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s6 +; GFX7-NEXT: s_addc_u32 s1, s1, s7 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: v_mov_b32_e32 v4, s5 +; GFX7-NEXT: v_mov_b32_e32 v5, s4 +; GFX7-NEXT: .LBB91_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v9, v3 +; GFX7-NEXT: v_mov_b32_e32 v8, v2 +; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[8:9] +; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], 
v[0:1], v[6:9] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7-NEXT: s_cbranch_execnz .LBB91_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_max_i64_ret_addr64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s6 +; GFX8-NEXT: s_addc_u32 s1, s1, s7 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_mov_b64 s[0:1], 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s5 +; GFX8-NEXT: v_mov_b32_e32 v5, s4 +; GFX8-NEXT: .LBB91_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v9, v3 +; GFX8-NEXT: v_mov_b32_e32 v8, v2 +; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[8:9] +; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_cbranch_execnz .LBB91_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: atomic_max_i64_ret_addr64: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 +; GFX9-NEXT: s_add_u32 s0, s4, s0 +; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: v_mov_b32_e32 v4, s9 +; GFX9-NEXT: v_mov_b32_e32 v5, s8 +; GFX9-NEXT: .LBB91_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v9, v3 +; GFX9-NEXT: v_mov_b32_e32 v8, v2 +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[8:9] +; GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_cbranch_execnz .LBB91_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX9-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %tmp0 = atomicrmw max ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + store i64 %tmp0, ptr %out2 + ret void +} + +define void @flat_atomic_max_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) { +; 
GFX7-LABEL: flat_atomic_max_i64_noret_offset__amdgpu_no_remote_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v8, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v7, v[0:1] +; GFX7-NEXT: flat_load_dword v6, v[8:9] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB92_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX7-NEXT: v_mov_b32_e32 v7, v1 +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v6, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB92_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_max_i64_noret_offset__amdgpu_no_remote_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v7, v[0:1] +; GFX8-NEXT: flat_load_dword v6, v[8:9] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB92_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX8-NEXT: v_mov_b32_e32 v7, v1 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB92_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_max_i64_noret_offset__amdgpu_no_remote_memory: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB92_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB92_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr 
%out, i64 4 + %tmp0 = atomicrmw max ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 + ret void +} + +define i64 @flat_atomic_max_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) { +; GFX7-LABEL: flat_atomic_max_i64_ret_offset__amdgpu_no_remote_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v1, v[0:1] +; GFX7-NEXT: flat_load_dword v0, v[4:5] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB93_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v9, v1 +; GFX7-NEXT: v_mov_b32_e32 v8, v0 +; GFX7-NEXT: v_cmp_gt_i64_e32 vcc, v[8:9], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB93_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_max_i64_ret_offset__amdgpu_no_remote_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v1, v[0:1] +; GFX8-NEXT: flat_load_dword v0, v[4:5] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB93_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v9, v1 +; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_cmp_gt_i64_e32 vcc, v[8:9], v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB93_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_max_i64_ret_offset__amdgpu_no_remote_memory: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB93_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: 
s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB93_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v5 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %result = atomicrmw max ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 + ret i64 %result +} + +; --------------------------------------------------------------------- +; atomicrmw umax +; --------------------------------------------------------------------- + +define void @flat_atomic_umax_i64_noret(ptr %ptr, i64 %in) { +; GFX7-LABEL: flat_atomic_umax_i64_noret: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 4, v0 +; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v6, v[0:1] +; GFX7-NEXT: flat_load_dword v7, v[4:5] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB94_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB94_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_umax_i64_noret: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v6, v[0:1] +; GFX8-NEXT: flat_load_dword v7, v[4:5] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB94_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX8-NEXT: v_mov_b32_e32 v7, v5 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB94_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_umax_i64_noret: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB94_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 
v[4:5], v[6:7] +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB94_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] + %tmp0 = atomicrmw umax ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define void @flat_atomic_umax_i64_noret_offset(ptr %out, i64 %in) { +; GFX7-LABEL: flat_atomic_umax_i64_noret_offset: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v8, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v7, v[0:1] +; GFX7-NEXT: flat_load_dword v6, v[8:9] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB95_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX7-NEXT: v_mov_b32_e32 v7, v1 +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v6, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB95_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_umax_i64_noret_offset: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v7, v[0:1] +; GFX8-NEXT: flat_load_dword v6, v[8:9] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB95_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX8-NEXT: v_mov_b32_e32 v7, v1 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB95_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_umax_i64_noret_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB95_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: 
buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB95_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw umax ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define i64 @flat_atomic_umax_i64_ret(ptr %ptr, i64 %in) { +; GFX7-LABEL: flat_atomic_umax_i64_ret: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v5, vcc, 4, v0 +; GFX7-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v4, v[0:1] +; GFX7-NEXT: flat_load_dword v5, v[5:6] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB96_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB96_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v0, v4 +; GFX7-NEXT: v_mov_b32_e32 v1, v5 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_umax_i64_ret: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v0 +; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v5, v[5:6] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB96_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v7, v5 +; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB96_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_umax_i64_ret: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB96_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], 
v[4:7] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB96_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v5 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %result = atomicrmw umax ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret i64 %result +} + +define i64 @flat_atomic_umax_i64_ret_offset(ptr %out, i64 %in) { +; GFX7-LABEL: flat_atomic_umax_i64_ret_offset: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v1, v[0:1] +; GFX7-NEXT: flat_load_dword v0, v[4:5] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB97_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v9, v1 +; GFX7-NEXT: v_mov_b32_e32 v8, v0 +; GFX7-NEXT: v_cmp_gt_u64_e32 vcc, v[8:9], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB97_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_umax_i64_ret_offset: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v1, v[0:1] +; GFX8-NEXT: flat_load_dword v0, v[4:5] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB97_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v9, v1 +; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, v[8:9], v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB97_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_umax_i64_ret_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB97_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 
v5, v3, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB97_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v5 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %result = atomicrmw umax ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret i64 %result +} + +define amdgpu_gfx void @flat_atomic_umax_i64_noret_scalar(ptr inreg %ptr, i64 inreg %in) { +; GFX7-LABEL: flat_atomic_umax_i64_noret_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: s_add_u32 s34, s4, 4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_addc_u32 s35, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s34 +; GFX7-NEXT: v_mov_b32_e32 v4, s35 +; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: flat_load_dword v3, v[3:4] +; GFX7-NEXT: v_mov_b32_e32 v4, s4 +; GFX7-NEXT: s_mov_b64 s[34:35], 0 +; GFX7-NEXT: v_mov_b32_e32 v6, s7 +; GFX7-NEXT: v_mov_b32_e32 v7, s6 +; GFX7-NEXT: v_mov_b32_e32 v5, s5 +; GFX7-NEXT: .LBB98_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_cbranch_execnz .LBB98_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_umax_i64_noret_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_add_u32 s34, s4, 4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: s_addc_u32 s35, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s34 +; GFX8-NEXT: v_mov_b32_e32 v4, s35 +; GFX8-NEXT: flat_load_dword v2, v[0:1] +; GFX8-NEXT: flat_load_dword v3, v[3:4] +; GFX8-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NEXT: s_mov_b64 s[34:35], 0 +; GFX8-NEXT: v_mov_b32_e32 v6, s7 +; GFX8-NEXT: v_mov_b32_e32 v7, s6 +; GFX8-NEXT: v_mov_b32_e32 v5, s5 +; GFX8-NEXT: .LBB98_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_cbranch_execnz .LBB98_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, 
exec, s[34:35] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_umax_i64_noret_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v6, s7 +; GFX9-NEXT: v_mov_b32_e32 v7, s6 +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: .LBB98_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB98_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_setpc_b64 s[30:31] + %tmp0 = atomicrmw umax ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define amdgpu_gfx void @flat_atomic_umax_i64_noret_offset_scalar(ptr inreg %out, i64 inreg %in) { +; GFX7-LABEL: flat_atomic_umax_i64_noret_offset_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s34, s4, 32 +; GFX7-NEXT: s_addc_u32 s35, s5, 0 +; GFX7-NEXT: s_add_u32 s36, s4, 36 +; GFX7-NEXT: s_addc_u32 s37, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s36 +; GFX7-NEXT: v_mov_b32_e32 v1, s37 +; GFX7-NEXT: v_mov_b32_e32 v4, s34 +; GFX7-NEXT: v_mov_b32_e32 v5, s35 +; GFX7-NEXT: flat_load_dword v3, v[0:1] +; GFX7-NEXT: flat_load_dword v2, v[4:5] +; GFX7-NEXT: s_mov_b64 s[34:35], 0 +; GFX7-NEXT: v_mov_b32_e32 v6, s7 +; GFX7-NEXT: v_mov_b32_e32 v7, s6 +; GFX7-NEXT: .LBB99_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_cbranch_execnz .LBB99_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_umax_i64_noret_offset_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_add_u32 s34, s4, 32 +; GFX8-NEXT: s_addc_u32 s35, s5, 0 +; GFX8-NEXT: s_add_u32 s36, s4, 36 +; GFX8-NEXT: s_addc_u32 s37, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s36 +; GFX8-NEXT: v_mov_b32_e32 v1, s37 +; GFX8-NEXT: v_mov_b32_e32 v4, s34 +; GFX8-NEXT: v_mov_b32_e32 v5, s35 +; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v2, v[4:5] +; GFX8-NEXT: s_mov_b64 s[34:35], 0 +; GFX8-NEXT: v_mov_b32_e32 v6, s7 +; GFX8-NEXT: v_mov_b32_e32 v7, s6 +; GFX8-NEXT: .LBB99_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: 
Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_cbranch_execnz .LBB99_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_umax_i64_noret_offset_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32 +; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v6, s7 +; GFX9-NEXT: v_mov_b32_e32 v7, s6 +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: .LBB99_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB99_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw umax ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_scalar(ptr inreg %ptr, i64 inreg %in) { +; GFX7-LABEL: flat_atomic_umax_i64_ret_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: s_add_u32 s34, s4, 4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_addc_u32 s35, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v2, s34 +; GFX7-NEXT: v_mov_b32_e32 v3, s35 +; GFX7-NEXT: flat_load_dword v0, v[0:1] +; GFX7-NEXT: flat_load_dword v1, v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_mov_b64 s[34:35], 0 +; GFX7-NEXT: v_mov_b32_e32 v4, s7 +; GFX7-NEXT: v_mov_b32_e32 v5, s6 +; GFX7-NEXT: v_mov_b32_e32 v3, s5 +; GFX7-NEXT: .LBB100_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v9, v1 +; GFX7-NEXT: v_mov_b32_e32 v8, v0 +; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[8:9] +; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_cbranch_execnz .LBB100_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX7-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_umax_i64_ret_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_add_u32 s34, s4, 4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: s_addc_u32 s35, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s34 +; GFX8-NEXT: v_mov_b32_e32 v3, s35 +; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: flat_load_dword v1, v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: s_mov_b64 s[34:35], 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s7 +; GFX8-NEXT: v_mov_b32_e32 v5, s6 +; GFX8-NEXT: v_mov_b32_e32 v3, s5 +; GFX8-NEXT: .LBB100_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v9, v1 +; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[8:9] +; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_cbranch_execnz .LBB100_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_umax_i64_ret_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v4, s7 +; GFX9-NEXT: v_mov_b32_e32 v5, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: .LBB100_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v9, v1 +; GFX9-NEXT: v_mov_b32_e32 v8, v0 +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[8:9] +; GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB100_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_setpc_b64 s[30:31] + %result = atomicrmw umax ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret i64 %result +} + +define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_offset_scalar(ptr inreg %out, i64 inreg %in) { +; GFX7-LABEL: flat_atomic_umax_i64_ret_offset_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s34, s4, 32 +; GFX7-NEXT: s_addc_u32 s35, s5, 0 +; GFX7-NEXT: s_add_u32 s36, s4, 36 +; GFX7-NEXT: s_addc_u32 s37, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s36 +; GFX7-NEXT: v_mov_b32_e32 v1, s37 +; GFX7-NEXT: v_mov_b32_e32 v2, s34 +; GFX7-NEXT: v_mov_b32_e32 v3, s35 +; GFX7-NEXT: flat_load_dword v1, v[0:1] +; GFX7-NEXT: flat_load_dword v0, v[2:3] +; GFX7-NEXT: s_mov_b64 s[34:35], 0 +; GFX7-NEXT: v_mov_b32_e32 v4, s7 +; GFX7-NEXT: v_mov_b32_e32 v5, s6 +; GFX7-NEXT: .LBB101_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v9, v1 +; GFX7-NEXT: v_mov_b32_e32 v8, v0 +; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[8:9] +; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_cbranch_execnz .LBB101_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_umax_i64_ret_offset_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_add_u32 s34, s4, 32 +; GFX8-NEXT: s_addc_u32 s35, s5, 0 +; GFX8-NEXT: s_add_u32 s36, s4, 36 +; GFX8-NEXT: s_addc_u32 s37, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s36 +; GFX8-NEXT: v_mov_b32_e32 v1, s37 +; GFX8-NEXT: v_mov_b32_e32 v2, s34 +; GFX8-NEXT: v_mov_b32_e32 v3, s35 +; GFX8-NEXT: flat_load_dword v1, v[0:1] +; GFX8-NEXT: flat_load_dword v0, v[2:3] +; GFX8-NEXT: s_mov_b64 s[34:35], 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s7 +; GFX8-NEXT: v_mov_b32_e32 v5, s6 +; GFX8-NEXT: .LBB101_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v9, v1 +; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[8:9] +; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_cbranch_execnz .LBB101_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_umax_i64_ret_offset_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] offset:32 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v4, s7 +; GFX9-NEXT: v_mov_b32_e32 v5, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: .LBB101_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v9, v1 +; GFX9-NEXT: v_mov_b32_e32 v8, v0 +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[8:9] +; GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB101_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %result = atomicrmw umax ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret i64 %result +} + +define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, 
i64 %index) { +; GFX7-LABEL: atomic_umax_i64_addr64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s4 +; GFX7-NEXT: s_addc_u32 s1, s1, s5 +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: v_mov_b32_e32 v6, s3 +; GFX7-NEXT: v_mov_b32_e32 v7, s2 +; GFX7-NEXT: .LBB102_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7-NEXT: s_cbranch_execnz .LBB102_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_umax_i64_addr64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s4 +; GFX8-NEXT: s_addc_u32 s1, s1, s5 +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX8-NEXT: s_mov_b64 s[0:1], 0 +; GFX8-NEXT: v_mov_b32_e32 v6, s3 +; GFX8-NEXT: v_mov_b32_e32 v7, s2 +; GFX8-NEXT: .LBB102_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_cbranch_execnz .LBB102_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: atomic_umax_i64_addr64_offset: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX9-NEXT: s_add_u32 s0, s4, s0 +; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: v_mov_b32_e32 v5, s1 +; GFX9-NEXT: v_mov_b32_e32 v4, s0 +; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[4:5] offset:32 +; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: v_mov_b32_e32 v6, s7 +; GFX9-NEXT: v_mov_b32_e32 v7, s6 +; GFX9-NEXT: .LBB102_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, 
v2, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_cbranch_execnz .LBB102_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 + %tmp0 = atomicrmw umax ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_umax_i64_ret_addr64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s6 +; GFX7-NEXT: s_addc_u32 s1, s1, s7 +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: v_mov_b32_e32 v4, s5 +; GFX7-NEXT: v_mov_b32_e32 v5, s4 +; GFX7-NEXT: .LBB103_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v9, v3 +; GFX7-NEXT: v_mov_b32_e32 v8, v2 +; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[8:9] +; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7-NEXT: s_cbranch_execnz .LBB103_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_umax_i64_ret_addr64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s6 +; GFX8-NEXT: s_addc_u32 s1, s1, s7 +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_mov_b64 s[0:1], 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s5 +; GFX8-NEXT: v_mov_b32_e32 v5, s4 +; GFX8-NEXT: .LBB103_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v9, v3 +; GFX8-NEXT: v_mov_b32_e32 v8, v2 +; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[8:9] +; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_cbranch_execnz .LBB103_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, 
s[0:1] +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: atomic_umax_i64_ret_addr64_offset: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 +; GFX9-NEXT: s_add_u32 s0, s4, s0 +; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32 +; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: v_mov_b32_e32 v4, s9 +; GFX9-NEXT: v_mov_b32_e32 v5, s8 +; GFX9-NEXT: .LBB103_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v9, v3 +; GFX9-NEXT: v_mov_b32_e32 v8, v2 +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[8:9] +; GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_cbranch_execnz .LBB103_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX9-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 + %tmp0 = atomicrmw umax ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + store i64 %tmp0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_umax_i64_ret_addr64: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s6 +; GFX7-NEXT: s_addc_u32 s1, s1, s7 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: v_mov_b32_e32 v4, s5 +; GFX7-NEXT: v_mov_b32_e32 v5, s4 +; GFX7-NEXT: .LBB104_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v9, v3 +; GFX7-NEXT: v_mov_b32_e32 v8, v2 +; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[8:9] +; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7-NEXT: s_cbranch_execnz .LBB104_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_umax_i64_ret_addr64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s6 +; GFX8-NEXT: s_addc_u32 s1, s1, s7 +; 
GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_mov_b64 s[0:1], 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s5 +; GFX8-NEXT: v_mov_b32_e32 v5, s4 +; GFX8-NEXT: .LBB104_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v9, v3 +; GFX8-NEXT: v_mov_b32_e32 v8, v2 +; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[8:9] +; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_cbranch_execnz .LBB104_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: atomic_umax_i64_ret_addr64: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 +; GFX9-NEXT: s_add_u32 s0, s4, s0 +; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: v_mov_b32_e32 v4, s9 +; GFX9-NEXT: v_mov_b32_e32 v5, s8 +; GFX9-NEXT: .LBB104_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v9, v3 +; GFX9-NEXT: v_mov_b32_e32 v8, v2 +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[8:9] +; GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_cbranch_execnz .LBB104_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX9-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %tmp0 = atomicrmw umax ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + store i64 %tmp0, ptr %out2 + ret void +} + +define void @flat_atomic_umax_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) { +; GFX7-LABEL: flat_atomic_umax_i64_noret_offset__amdgpu_no_remote_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v8, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v7, v[0:1] +; GFX7-NEXT: flat_load_dword v6, v[8:9] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB105_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 
v[0:1], v[8:9], v[4:7] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX7-NEXT: v_mov_b32_e32 v7, v1 +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v6, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB105_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_umax_i64_noret_offset__amdgpu_no_remote_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v7, v[0:1] +; GFX8-NEXT: flat_load_dword v6, v[8:9] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB105_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX8-NEXT: v_mov_b32_e32 v7, v1 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB105_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_umax_i64_noret_offset__amdgpu_no_remote_memory: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB105_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB105_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw umax ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 + ret void +} + +define i64 @flat_atomic_umax_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) { +; GFX7-LABEL: flat_atomic_umax_i64_ret_offset__amdgpu_no_remote_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v1, v[0:1] +; GFX7-NEXT: flat_load_dword v0, v[4:5] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB106_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v9, v1 +; GFX7-NEXT: v_mov_b32_e32 v8, v0 +; GFX7-NEXT: v_cmp_gt_u64_e32 vcc, v[8:9], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB106_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_umax_i64_ret_offset__amdgpu_no_remote_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v1, v[0:1] +; GFX8-NEXT: flat_load_dword v0, v[4:5] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB106_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v9, v1 +; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, v[8:9], v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB106_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_umax_i64_ret_offset__amdgpu_no_remote_memory: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB106_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB106_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v5 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %result = atomicrmw umax ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 + ret i64 %result +} + +; --------------------------------------------------------------------- +; atomicrmw umin +; --------------------------------------------------------------------- + +define void @flat_atomic_umin_i64_noret(ptr %ptr, i64 %in) { +; GFX7-LABEL: flat_atomic_umin_i64_noret: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 4, v0 +; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v6, v[0:1] +; GFX7-NEXT: flat_load_dword v7, v[4:5] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB107_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB107_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_umin_i64_noret: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v6, v[0:1] +; GFX8-NEXT: flat_load_dword v7, v[4:5] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB107_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX8-NEXT: v_mov_b32_e32 v7, v5 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB107_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_umin_i64_noret: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB107_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB107_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] + %tmp0 = atomicrmw umin ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define void @flat_atomic_umin_i64_noret_offset(ptr %out, i64 %in) { +; GFX7-LABEL: flat_atomic_umin_i64_noret_offset: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v8, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0 +; 
GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v7, v[0:1] +; GFX7-NEXT: flat_load_dword v6, v[8:9] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB108_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX7-NEXT: v_mov_b32_e32 v7, v1 +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v6, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB108_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_umin_i64_noret_offset: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v7, v[0:1] +; GFX8-NEXT: flat_load_dword v6, v[8:9] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB108_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX8-NEXT: v_mov_b32_e32 v7, v1 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB108_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_umin_i64_noret_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB108_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB108_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw umin ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define i64 @flat_atomic_umin_i64_ret(ptr %ptr, i64 %in) { +; GFX7-LABEL: flat_atomic_umin_i64_ret: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v5, vcc, 4, v0 +; 
GFX7-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v4, v[0:1] +; GFX7-NEXT: flat_load_dword v5, v[5:6] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB109_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB109_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v0, v4 +; GFX7-NEXT: v_mov_b32_e32 v1, v5 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_umin_i64_ret: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v0 +; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v5, v[5:6] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB109_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v7, v5 +; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB109_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_umin_i64_ret: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB109_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB109_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v5 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %result = atomicrmw umin ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret i64 %result +} + +define i64 @flat_atomic_umin_i64_ret_offset(ptr %out, i64 %in) { +; GFX7-LABEL: flat_atomic_umin_i64_ret_offset: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v1, v[0:1] +; GFX7-NEXT: flat_load_dword v0, v[4:5] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB110_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v9, v1 +; GFX7-NEXT: v_mov_b32_e32 v8, v0 +; GFX7-NEXT: v_cmp_le_u64_e32 vcc, v[8:9], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB110_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_umin_i64_ret_offset: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v1, v[0:1] +; GFX8-NEXT: flat_load_dword v0, v[4:5] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB110_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v9, v1 +; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_cmp_le_u64_e32 vcc, v[8:9], v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB110_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_umin_i64_ret_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB110_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB110_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v5 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %result = atomicrmw umin ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret i64 %result +} + 
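Each of the atomicrmw umax/umin cases in this file is expanded into a compare-and-swap loop rather than a single native atomic: the checks all follow the same %atomicrmw.start / %atomicrmw.end shape (an initial flat load, a v_cmp + v_cndmask pair to compute the new value, flat_atomic_cmpswap_x2 ... glc, then loop until the swap observed the expected value). A minimal IR-level sketch of that loop follows; the function name is illustrative, and the umin variants simply flip the comparison.

define i64 @umax_cmpxchg_loop_sketch(ptr %ptr, i64 %in) {
entry:
  ; plain load of the current value (the flat_load_dword/dwordx2 in the checks)
  %init = load i64, ptr %ptr, align 8
  br label %atomicrmw.start

atomicrmw.start:                                  ; the .LBBn_1 loop above
  %loaded = phi i64 [ %init, %entry ], [ %newloaded, %atomicrmw.start ]
  ; v_cmp_gt_u64 + v_cndmask pair: %new = umax(%loaded, %in)
  %cmp = icmp ugt i64 %loaded, %in
  %new = select i1 %cmp, i64 %loaded, i64 %in
  ; flat_atomic_cmpswap_x2 ... glc: publish %new only if memory still holds %loaded
  %pair = cmpxchg ptr %ptr, i64 %loaded, i64 %new seq_cst seq_cst
  %newloaded = extractvalue { i64, i1 } %pair, 0
  %success = extractvalue { i64, i1 } %pair, 1
  br i1 %success, label %atomicrmw.end, label %atomicrmw.start

atomicrmw.end:
  ret i64 %newloaded
}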
+define amdgpu_gfx void @flat_atomic_umin_i64_noret_scalar(ptr inreg %ptr, i64 inreg %in) { +; GFX7-LABEL: flat_atomic_umin_i64_noret_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: s_add_u32 s34, s4, 4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_addc_u32 s35, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s34 +; GFX7-NEXT: v_mov_b32_e32 v4, s35 +; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: flat_load_dword v3, v[3:4] +; GFX7-NEXT: v_mov_b32_e32 v4, s4 +; GFX7-NEXT: s_mov_b64 s[34:35], 0 +; GFX7-NEXT: v_mov_b32_e32 v6, s7 +; GFX7-NEXT: v_mov_b32_e32 v7, s6 +; GFX7-NEXT: v_mov_b32_e32 v5, s5 +; GFX7-NEXT: .LBB111_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_cbranch_execnz .LBB111_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_umin_i64_noret_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_add_u32 s34, s4, 4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: s_addc_u32 s35, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s34 +; GFX8-NEXT: v_mov_b32_e32 v4, s35 +; GFX8-NEXT: flat_load_dword v2, v[0:1] +; GFX8-NEXT: flat_load_dword v3, v[3:4] +; GFX8-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NEXT: s_mov_b64 s[34:35], 0 +; GFX8-NEXT: v_mov_b32_e32 v6, s7 +; GFX8-NEXT: v_mov_b32_e32 v7, s6 +; GFX8-NEXT: v_mov_b32_e32 v5, s5 +; GFX8-NEXT: .LBB111_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_cbranch_execnz .LBB111_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_umin_i64_noret_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v6, s7 +; GFX9-NEXT: v_mov_b32_e32 v7, s6 +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: .LBB111_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX9-NEXT: 
v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB111_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_setpc_b64 s[30:31] + %tmp0 = atomicrmw umin ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define amdgpu_gfx void @flat_atomic_umin_i64_noret_offset_scalar(ptr inreg %out, i64 inreg %in) { +; GFX7-LABEL: flat_atomic_umin_i64_noret_offset_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s34, s4, 32 +; GFX7-NEXT: s_addc_u32 s35, s5, 0 +; GFX7-NEXT: s_add_u32 s36, s4, 36 +; GFX7-NEXT: s_addc_u32 s37, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s36 +; GFX7-NEXT: v_mov_b32_e32 v1, s37 +; GFX7-NEXT: v_mov_b32_e32 v4, s34 +; GFX7-NEXT: v_mov_b32_e32 v5, s35 +; GFX7-NEXT: flat_load_dword v3, v[0:1] +; GFX7-NEXT: flat_load_dword v2, v[4:5] +; GFX7-NEXT: s_mov_b64 s[34:35], 0 +; GFX7-NEXT: v_mov_b32_e32 v6, s7 +; GFX7-NEXT: v_mov_b32_e32 v7, s6 +; GFX7-NEXT: .LBB112_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_cbranch_execnz .LBB112_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_umin_i64_noret_offset_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_add_u32 s34, s4, 32 +; GFX8-NEXT: s_addc_u32 s35, s5, 0 +; GFX8-NEXT: s_add_u32 s36, s4, 36 +; GFX8-NEXT: s_addc_u32 s37, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s36 +; GFX8-NEXT: v_mov_b32_e32 v1, s37 +; GFX8-NEXT: v_mov_b32_e32 v4, s34 +; GFX8-NEXT: v_mov_b32_e32 v5, s35 +; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v2, v[4:5] +; GFX8-NEXT: s_mov_b64 s[34:35], 0 +; GFX8-NEXT: v_mov_b32_e32 v6, s7 +; GFX8-NEXT: v_mov_b32_e32 v7, s6 +; GFX8-NEXT: .LBB112_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_cbranch_execnz .LBB112_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_setpc_b64 s[30:31] 
+; +; GFX9-LABEL: flat_atomic_umin_i64_noret_offset_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32 +; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v6, s7 +; GFX9-NEXT: v_mov_b32_e32 v7, s6 +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: .LBB112_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB112_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw umin ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_scalar(ptr inreg %ptr, i64 inreg %in) { +; GFX7-LABEL: flat_atomic_umin_i64_ret_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: s_add_u32 s34, s4, 4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_addc_u32 s35, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v2, s34 +; GFX7-NEXT: v_mov_b32_e32 v3, s35 +; GFX7-NEXT: flat_load_dword v0, v[0:1] +; GFX7-NEXT: flat_load_dword v1, v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_mov_b64 s[34:35], 0 +; GFX7-NEXT: v_mov_b32_e32 v4, s7 +; GFX7-NEXT: v_mov_b32_e32 v5, s6 +; GFX7-NEXT: v_mov_b32_e32 v3, s5 +; GFX7-NEXT: .LBB113_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v9, v1 +; GFX7-NEXT: v_mov_b32_e32 v8, v0 +; GFX7-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[8:9] +; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_cbranch_execnz .LBB113_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_umin_i64_ret_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_add_u32 s34, s4, 4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: s_addc_u32 s35, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s34 +; GFX8-NEXT: v_mov_b32_e32 v3, s35 +; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: flat_load_dword v1, v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: s_mov_b64 s[34:35], 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s7 +; GFX8-NEXT: v_mov_b32_e32 v5, s6 +; GFX8-NEXT: v_mov_b32_e32 v3, s5 +; GFX8-NEXT: .LBB113_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; 
GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v9, v1 +; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[8:9] +; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_cbranch_execnz .LBB113_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_umin_i64_ret_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v4, s7 +; GFX9-NEXT: v_mov_b32_e32 v5, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: .LBB113_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v9, v1 +; GFX9-NEXT: v_mov_b32_e32 v8, v0 +; GFX9-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[8:9] +; GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB113_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_setpc_b64 s[30:31] + %result = atomicrmw umin ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret i64 %result +} + +define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_offset_scalar(ptr inreg %out, i64 inreg %in) { +; GFX7-LABEL: flat_atomic_umin_i64_ret_offset_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s34, s4, 32 +; GFX7-NEXT: s_addc_u32 s35, s5, 0 +; GFX7-NEXT: s_add_u32 s36, s4, 36 +; GFX7-NEXT: s_addc_u32 s37, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s36 +; GFX7-NEXT: v_mov_b32_e32 v1, s37 +; GFX7-NEXT: v_mov_b32_e32 v2, s34 +; GFX7-NEXT: v_mov_b32_e32 v3, s35 +; GFX7-NEXT: flat_load_dword v1, v[0:1] +; GFX7-NEXT: flat_load_dword v0, v[2:3] +; GFX7-NEXT: s_mov_b64 s[34:35], 0 +; GFX7-NEXT: v_mov_b32_e32 v4, s7 +; GFX7-NEXT: v_mov_b32_e32 v5, s6 +; GFX7-NEXT: .LBB114_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v9, v1 +; GFX7-NEXT: v_mov_b32_e32 v8, v0 +; GFX7-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[8:9] +; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_cbranch_execnz .LBB114_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: 
flat_atomic_umin_i64_ret_offset_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_add_u32 s34, s4, 32 +; GFX8-NEXT: s_addc_u32 s35, s5, 0 +; GFX8-NEXT: s_add_u32 s36, s4, 36 +; GFX8-NEXT: s_addc_u32 s37, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s36 +; GFX8-NEXT: v_mov_b32_e32 v1, s37 +; GFX8-NEXT: v_mov_b32_e32 v2, s34 +; GFX8-NEXT: v_mov_b32_e32 v3, s35 +; GFX8-NEXT: flat_load_dword v1, v[0:1] +; GFX8-NEXT: flat_load_dword v0, v[2:3] +; GFX8-NEXT: s_mov_b64 s[34:35], 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s7 +; GFX8-NEXT: v_mov_b32_e32 v5, s6 +; GFX8-NEXT: .LBB114_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v9, v1 +; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[8:9] +; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_cbranch_execnz .LBB114_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_umin_i64_ret_offset_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] offset:32 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v4, s7 +; GFX9-NEXT: v_mov_b32_e32 v5, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: .LBB114_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v9, v1 +; GFX9-NEXT: v_mov_b32_e32 v8, v0 +; GFX9-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[8:9] +; GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB114_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %result = atomicrmw umin ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret i64 %result +} + +define void @flat_atomic_umin_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) { +; GFX7-LABEL: flat_atomic_umin_i64_noret_offset__amdgpu_no_remote_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v8, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v7, v[0:1] +; GFX7-NEXT: flat_load_dword v6, v[8:9] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB115_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v5, v3, v7, 
vcc +; GFX7-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX7-NEXT: v_mov_b32_e32 v7, v1 +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v6, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB115_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_umin_i64_noret_offset__amdgpu_no_remote_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v7, v[0:1] +; GFX8-NEXT: flat_load_dword v6, v[8:9] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB115_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX8-NEXT: v_mov_b32_e32 v7, v1 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB115_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_umin_i64_noret_offset__amdgpu_no_remote_memory: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB115_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB115_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw umin ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 + ret void +} + +define i64 @flat_atomic_umin_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) { +; GFX7-LABEL: flat_atomic_umin_i64_ret_offset__amdgpu_no_remote_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v1, v[0:1] +; GFX7-NEXT: flat_load_dword v0, v[4:5] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; 
GFX7-NEXT: .LBB116_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v9, v1 +; GFX7-NEXT: v_mov_b32_e32 v8, v0 +; GFX7-NEXT: v_cmp_le_u64_e32 vcc, v[8:9], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB116_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_umin_i64_ret_offset__amdgpu_no_remote_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v1, v[0:1] +; GFX8-NEXT: flat_load_dword v0, v[4:5] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB116_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v9, v1 +; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_cmp_le_u64_e32 vcc, v[8:9], v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB116_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_umin_i64_ret_offset__amdgpu_no_remote_memory: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB116_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB116_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v5 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %result = atomicrmw umin ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 + ret i64 %result +} + +; --------------------------------------------------------------------- +; atomicrmw min +; --------------------------------------------------------------------- + +define void @flat_atomic_min_i64_noret(ptr %ptr, i64 %in) { +; 
GFX7-LABEL: flat_atomic_min_i64_noret: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 4, v0 +; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v6, v[0:1] +; GFX7-NEXT: flat_load_dword v7, v[4:5] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB117_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB117_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_min_i64_noret: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v6, v[0:1] +; GFX8-NEXT: flat_load_dword v7, v[4:5] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB117_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX8-NEXT: v_mov_b32_e32 v7, v5 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB117_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_min_i64_noret: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB117_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB117_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] + %tmp0 = atomicrmw min ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define void @flat_atomic_min_i64_noret_offset(ptr %out, i64 %in) { +; GFX7-LABEL: flat_atomic_min_i64_noret_offset: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v8, vcc, 32, v0 +; 
GFX7-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v7, v[0:1] +; GFX7-NEXT: flat_load_dword v6, v[8:9] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB118_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX7-NEXT: v_mov_b32_e32 v7, v1 +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v6, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB118_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_min_i64_noret_offset: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v7, v[0:1] +; GFX8-NEXT: flat_load_dword v6, v[8:9] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB118_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX8-NEXT: v_mov_b32_e32 v7, v1 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB118_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_min_i64_noret_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB118_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB118_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw min ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define i64 @flat_atomic_min_i64_ret(ptr %ptr, i64 %in) { +; GFX7-LABEL: flat_atomic_min_i64_ret: +; GFX7: ; %bb.0: +; GFX7-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v5, vcc, 4, v0 +; GFX7-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v4, v[0:1] +; GFX7-NEXT: flat_load_dword v5, v[5:6] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB119_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB119_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v0, v4 +; GFX7-NEXT: v_mov_b32_e32 v1, v5 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_min_i64_ret: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v0 +; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v5, v[5:6] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB119_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v7, v5 +; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB119_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_min_i64_ret: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB119_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB119_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v5 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %result = atomicrmw min ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret i64 %result +} + +define i64 @flat_atomic_min_i64_ret_offset(ptr %out, i64 %in) { +; GFX7-LABEL: 
flat_atomic_min_i64_ret_offset: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v1, v[0:1] +; GFX7-NEXT: flat_load_dword v0, v[4:5] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB120_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v9, v1 +; GFX7-NEXT: v_mov_b32_e32 v8, v0 +; GFX7-NEXT: v_cmp_le_i64_e32 vcc, v[8:9], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB120_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_min_i64_ret_offset: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v1, v[0:1] +; GFX8-NEXT: flat_load_dword v0, v[4:5] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB120_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v9, v1 +; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_cmp_le_i64_e32 vcc, v[8:9], v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB120_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_min_i64_ret_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB120_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB120_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v5 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + 
%result = atomicrmw min ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret i64 %result +} + +define amdgpu_gfx void @flat_atomic_min_i64_noret_scalar(ptr inreg %ptr, i64 inreg %in) { +; GFX7-LABEL: flat_atomic_min_i64_noret_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: s_add_u32 s34, s4, 4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_addc_u32 s35, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s34 +; GFX7-NEXT: v_mov_b32_e32 v4, s35 +; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: flat_load_dword v3, v[3:4] +; GFX7-NEXT: v_mov_b32_e32 v4, s4 +; GFX7-NEXT: s_mov_b64 s[34:35], 0 +; GFX7-NEXT: v_mov_b32_e32 v6, s7 +; GFX7-NEXT: v_mov_b32_e32 v7, s6 +; GFX7-NEXT: v_mov_b32_e32 v5, s5 +; GFX7-NEXT: .LBB121_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_cbranch_execnz .LBB121_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_min_i64_noret_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_add_u32 s34, s4, 4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: s_addc_u32 s35, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s34 +; GFX8-NEXT: v_mov_b32_e32 v4, s35 +; GFX8-NEXT: flat_load_dword v2, v[0:1] +; GFX8-NEXT: flat_load_dword v3, v[3:4] +; GFX8-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NEXT: s_mov_b64 s[34:35], 0 +; GFX8-NEXT: v_mov_b32_e32 v6, s7 +; GFX8-NEXT: v_mov_b32_e32 v7, s6 +; GFX8-NEXT: v_mov_b32_e32 v5, s5 +; GFX8-NEXT: .LBB121_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_cbranch_execnz .LBB121_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_min_i64_noret_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v6, s7 +; GFX9-NEXT: v_mov_b32_e32 v7, s6 +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: .LBB121_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: 
v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB121_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_setpc_b64 s[30:31] + %tmp0 = atomicrmw min ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define amdgpu_gfx void @flat_atomic_min_i64_noret_offset_scalar(ptr inreg %out, i64 inreg %in) { +; GFX7-LABEL: flat_atomic_min_i64_noret_offset_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s34, s4, 32 +; GFX7-NEXT: s_addc_u32 s35, s5, 0 +; GFX7-NEXT: s_add_u32 s36, s4, 36 +; GFX7-NEXT: s_addc_u32 s37, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s36 +; GFX7-NEXT: v_mov_b32_e32 v1, s37 +; GFX7-NEXT: v_mov_b32_e32 v4, s34 +; GFX7-NEXT: v_mov_b32_e32 v5, s35 +; GFX7-NEXT: flat_load_dword v3, v[0:1] +; GFX7-NEXT: flat_load_dword v2, v[4:5] +; GFX7-NEXT: s_mov_b64 s[34:35], 0 +; GFX7-NEXT: v_mov_b32_e32 v6, s7 +; GFX7-NEXT: v_mov_b32_e32 v7, s6 +; GFX7-NEXT: .LBB122_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_cbranch_execnz .LBB122_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_min_i64_noret_offset_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_add_u32 s34, s4, 32 +; GFX8-NEXT: s_addc_u32 s35, s5, 0 +; GFX8-NEXT: s_add_u32 s36, s4, 36 +; GFX8-NEXT: s_addc_u32 s37, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s36 +; GFX8-NEXT: v_mov_b32_e32 v1, s37 +; GFX8-NEXT: v_mov_b32_e32 v4, s34 +; GFX8-NEXT: v_mov_b32_e32 v5, s35 +; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v2, v[4:5] +; GFX8-NEXT: s_mov_b64 s[34:35], 0 +; GFX8-NEXT: v_mov_b32_e32 v6, s7 +; GFX8-NEXT: v_mov_b32_e32 v7, s6 +; GFX8-NEXT: .LBB122_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_cbranch_execnz .LBB122_1 +; GFX8-NEXT: ; %bb.2: ; 
%atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_min_i64_noret_offset_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32 +; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v6, s7 +; GFX9-NEXT: v_mov_b32_e32 v7, s6 +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: .LBB122_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB122_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw min ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define amdgpu_gfx i64 @flat_atomic_min_i64_ret_scalar(ptr inreg %ptr, i64 inreg %in) { +; GFX7-LABEL: flat_atomic_min_i64_ret_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: s_add_u32 s34, s4, 4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_addc_u32 s35, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v2, s34 +; GFX7-NEXT: v_mov_b32_e32 v3, s35 +; GFX7-NEXT: flat_load_dword v0, v[0:1] +; GFX7-NEXT: flat_load_dword v1, v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_mov_b64 s[34:35], 0 +; GFX7-NEXT: v_mov_b32_e32 v4, s7 +; GFX7-NEXT: v_mov_b32_e32 v5, s6 +; GFX7-NEXT: v_mov_b32_e32 v3, s5 +; GFX7-NEXT: .LBB123_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v9, v1 +; GFX7-NEXT: v_mov_b32_e32 v8, v0 +; GFX7-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[8:9] +; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_cbranch_execnz .LBB123_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_min_i64_ret_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_add_u32 s34, s4, 4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: s_addc_u32 s35, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s34 +; GFX8-NEXT: v_mov_b32_e32 v3, s35 +; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: flat_load_dword v1, v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: s_mov_b64 s[34:35], 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s7 +; GFX8-NEXT: v_mov_b32_e32 v5, s6 +; GFX8-NEXT: v_mov_b32_e32 v3, s5 +; 
GFX8-NEXT: .LBB123_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v9, v1 +; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[8:9] +; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_cbranch_execnz .LBB123_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_min_i64_ret_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v4, s7 +; GFX9-NEXT: v_mov_b32_e32 v5, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: .LBB123_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v9, v1 +; GFX9-NEXT: v_mov_b32_e32 v8, v0 +; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[8:9] +; GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB123_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_setpc_b64 s[30:31] + %result = atomicrmw min ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret i64 %result +} + +define amdgpu_gfx i64 @flat_atomic_min_i64_ret_offset_scalar(ptr inreg %out, i64 inreg %in) { +; GFX7-LABEL: flat_atomic_min_i64_ret_offset_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s34, s4, 32 +; GFX7-NEXT: s_addc_u32 s35, s5, 0 +; GFX7-NEXT: s_add_u32 s36, s4, 36 +; GFX7-NEXT: s_addc_u32 s37, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s36 +; GFX7-NEXT: v_mov_b32_e32 v1, s37 +; GFX7-NEXT: v_mov_b32_e32 v2, s34 +; GFX7-NEXT: v_mov_b32_e32 v3, s35 +; GFX7-NEXT: flat_load_dword v1, v[0:1] +; GFX7-NEXT: flat_load_dword v0, v[2:3] +; GFX7-NEXT: s_mov_b64 s[34:35], 0 +; GFX7-NEXT: v_mov_b32_e32 v4, s7 +; GFX7-NEXT: v_mov_b32_e32 v5, s6 +; GFX7-NEXT: .LBB124_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v9, v1 +; GFX7-NEXT: v_mov_b32_e32 v8, v0 +; GFX7-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[8:9] +; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_cbranch_execnz .LBB124_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 
exec, exec, s[34:35] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_min_i64_ret_offset_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_add_u32 s34, s4, 32 +; GFX8-NEXT: s_addc_u32 s35, s5, 0 +; GFX8-NEXT: s_add_u32 s36, s4, 36 +; GFX8-NEXT: s_addc_u32 s37, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s36 +; GFX8-NEXT: v_mov_b32_e32 v1, s37 +; GFX8-NEXT: v_mov_b32_e32 v2, s34 +; GFX8-NEXT: v_mov_b32_e32 v3, s35 +; GFX8-NEXT: flat_load_dword v1, v[0:1] +; GFX8-NEXT: flat_load_dword v0, v[2:3] +; GFX8-NEXT: s_mov_b64 s[34:35], 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s7 +; GFX8-NEXT: v_mov_b32_e32 v5, s6 +; GFX8-NEXT: .LBB124_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v9, v1 +; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[8:9] +; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_cbranch_execnz .LBB124_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_min_i64_ret_offset_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] offset:32 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v4, s7 +; GFX9-NEXT: v_mov_b32_e32 v5, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: .LBB124_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v9, v1 +; GFX9-NEXT: v_mov_b32_e32 v8, v0 +; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[8:9] +; GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB124_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %result = atomicrmw min ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret i64 %result +} + +define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_min_i64_addr64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s4 +; GFX7-NEXT: s_addc_u32 s1, s1, s5 +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: v_mov_b32_e32 v6, s3 +; GFX7-NEXT: v_mov_b32_e32 v7, s2 +; 
GFX7-NEXT: .LBB125_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7-NEXT: s_cbranch_execnz .LBB125_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_min_i64_addr64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s4 +; GFX8-NEXT: s_addc_u32 s1, s1, s5 +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX8-NEXT: s_mov_b64 s[0:1], 0 +; GFX8-NEXT: v_mov_b32_e32 v6, s3 +; GFX8-NEXT: v_mov_b32_e32 v7, s2 +; GFX8-NEXT: .LBB125_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_cbranch_execnz .LBB125_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: atomic_min_i64_addr64_offset: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX9-NEXT: s_add_u32 s0, s4, s0 +; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: v_mov_b32_e32 v5, s1 +; GFX9-NEXT: v_mov_b32_e32 v4, s0 +; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[4:5] offset:32 +; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: v_mov_b32_e32 v6, s7 +; GFX9-NEXT: v_mov_b32_e32 v7, s6 +; GFX9-NEXT: .LBB125_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_cbranch_execnz .LBB125_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 + %tmp0 = atomicrmw min ptr %gep, i64 %in seq_cst, 
!noalias.addrspace !1 + ret void +} + +define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_min_i64_ret_addr64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s6 +; GFX7-NEXT: s_addc_u32 s1, s1, s7 +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: v_mov_b32_e32 v4, s5 +; GFX7-NEXT: v_mov_b32_e32 v5, s4 +; GFX7-NEXT: .LBB126_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v9, v3 +; GFX7-NEXT: v_mov_b32_e32 v8, v2 +; GFX7-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[8:9] +; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7-NEXT: s_cbranch_execnz .LBB126_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_min_i64_ret_addr64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s6 +; GFX8-NEXT: s_addc_u32 s1, s1, s7 +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_mov_b64 s[0:1], 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s5 +; GFX8-NEXT: v_mov_b32_e32 v5, s4 +; GFX8-NEXT: .LBB126_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v9, v3 +; GFX8-NEXT: v_mov_b32_e32 v8, v2 +; GFX8-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[8:9] +; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_cbranch_execnz .LBB126_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: atomic_min_i64_ret_addr64_offset: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 +; GFX9-NEXT: s_add_u32 s0, s4, s0 +; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32 +; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: v_mov_b32_e32 v4, s9 +; 
GFX9-NEXT: v_mov_b32_e32 v5, s8 +; GFX9-NEXT: .LBB126_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v9, v3 +; GFX9-NEXT: v_mov_b32_e32 v8, v2 +; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[8:9], v[8:9] +; GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_cbranch_execnz .LBB126_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX9-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 + %tmp0 = atomicrmw min ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + store i64 %tmp0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) { +; GFX7-LABEL: atomic_min_i64: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-NEXT: v_mov_b32_e32 v6, s3 +; GFX7-NEXT: v_mov_b32_e32 v7, s2 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-NEXT: .LBB127_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB127_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_min_i64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v6, s3 +; GFX8-NEXT: v_mov_b32_e32 v7, s2 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: .LBB127_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB127_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; 
GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: atomic_min_i64: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: v_mov_b32_e32 v6, s7 +; GFX9-NEXT: v_mov_b32_e32 v7, s6 +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: .LBB127_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_cbranch_execnz .LBB127_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_endpgm +entry: + %tmp0 = atomicrmw min ptr %out, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_min_i64_ret_addr64: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s6 +; GFX7-NEXT: s_addc_u32 s1, s1, s7 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: v_mov_b32_e32 v4, s5 +; GFX7-NEXT: v_mov_b32_e32 v5, s4 +; GFX7-NEXT: .LBB128_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v9, v3 +; GFX7-NEXT: v_mov_b32_e32 v8, v2 +; GFX7-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[8:9] +; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7-NEXT: s_cbranch_execnz .LBB128_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_min_i64_ret_addr64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s6 +; GFX8-NEXT: s_addc_u32 s1, s1, s7 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_mov_b64 s[0:1], 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s5 +; GFX8-NEXT: v_mov_b32_e32 v5, s4 +; GFX8-NEXT: .LBB128_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v9, v3 +; GFX8-NEXT: v_mov_b32_e32 v8, v2 +; GFX8-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[8:9] +; GFX8-NEXT: 
v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_cbranch_execnz .LBB128_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: atomic_min_i64_ret_addr64: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 +; GFX9-NEXT: s_add_u32 s0, s4, s0 +; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: v_mov_b32_e32 v4, s9 +; GFX9-NEXT: v_mov_b32_e32 v5, s8 +; GFX9-NEXT: .LBB128_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v9, v3 +; GFX9-NEXT: v_mov_b32_e32 v8, v2 +; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[8:9], v[8:9] +; GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_cbranch_execnz .LBB128_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX9-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %tmp0 = atomicrmw min ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + store i64 %tmp0, ptr %out2 + ret void +} + +define void @flat_atomic_min_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) { +; GFX7-LABEL: flat_atomic_min_i64_noret_offset__amdgpu_no_remote_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v8, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v7, v[0:1] +; GFX7-NEXT: flat_load_dword v6, v[8:9] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB129_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX7-NEXT: v_mov_b32_e32 v7, v1 +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v6, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB129_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_setpc_b64 s[30:31] +;
+; GFX8-LABEL: flat_atomic_min_i64_noret_offset__amdgpu_no_remote_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v7, v[0:1] +; GFX8-NEXT: flat_load_dword v6, v[8:9] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB129_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX8-NEXT: v_mov_b32_e32 v7, v1 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB129_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_min_i64_noret_offset__amdgpu_no_remote_memory: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB129_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB129_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw min ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 + ret void +} + +define i64 @flat_atomic_min_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) { +; GFX7-LABEL: flat_atomic_min_i64_ret_offset__amdgpu_no_remote_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v1, v[0:1] +; GFX7-NEXT: flat_load_dword v0, v[4:5] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB130_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v9, v1 +; GFX7-NEXT: v_mov_b32_e32 v8, v0 +; GFX7-NEXT: v_cmp_le_i64_e32 vcc, v[8:9], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX7-NEXT: s_or_b64 
s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB130_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_min_i64_ret_offset__amdgpu_no_remote_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v1, v[0:1] +; GFX8-NEXT: flat_load_dword v0, v[4:5] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB130_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v9, v1 +; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_cmp_le_i64_e32 vcc, v[8:9], v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB130_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_min_i64_ret_offset__amdgpu_no_remote_memory: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB130_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB130_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v5 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %result = atomicrmw min ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 + ret i64 %result +} + +; --------------------------------------------------------------------- +; atomicrmw uinc_wrap +; --------------------------------------------------------------------- + +define void @flat_atomic_uinc_wrap_i64_noret(ptr %ptr, i64 %in) { +; GFX7-LABEL: flat_atomic_uinc_wrap_i64_noret: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_uinc_wrap_i64_noret: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX9-LABEL: flat_atomic_uinc_wrap_i64_noret: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %tmp0 = atomicrmw uinc_wrap ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define void @flat_atomic_uinc_wrap_i64_noret_offset(ptr %out, i64 %in) { +; GFX7-LABEL: flat_atomic_uinc_wrap_i64_noret_offset: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_uinc_wrap_i64_noret_offset: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_uinc_wrap_i64_noret_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] offset:32 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw uinc_wrap ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define i64 @flat_atomic_uinc_wrap_i64_ret(ptr %ptr, i64 %in) { +; GFX7-LABEL: flat_atomic_uinc_wrap_i64_ret: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_uinc_wrap_i64_ret: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_uinc_wrap_i64_ret: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %result = atomicrmw uinc_wrap ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret i64 %result +} + +define i64 @flat_atomic_uinc_wrap_i64_ret_offset(ptr %out, i64 %in) { +; GFX7-LABEL: flat_atomic_uinc_wrap_i64_ret_offset: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_uinc_wrap_i64_ret_offset: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX9-LABEL: flat_atomic_uinc_wrap_i64_ret_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %result = atomicrmw uinc_wrap ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret i64 %result +} + +define amdgpu_gfx void @flat_atomic_uinc_wrap_i64_noret_scalar(ptr inreg %ptr, i64 inreg %in) { +; GFX7-LABEL: flat_atomic_uinc_wrap_i64_noret_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: v_mov_b32_e32 v3, s5 +; GFX7-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_uinc_wrap_i64_noret_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_mov_b32_e32 v3, s5 +; GFX8-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_uinc_wrap_i64_noret_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %tmp0 = atomicrmw uinc_wrap ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define amdgpu_gfx void @flat_atomic_uinc_wrap_i64_noret_offset_scalar(ptr inreg %out, i64 inreg %in) { +; GFX7-LABEL: flat_atomic_uinc_wrap_i64_noret_offset_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s34, s4, 32 +; GFX7-NEXT: s_addc_u32 s35, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v2, s34 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v3, s35 +; GFX7-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_uinc_wrap_i64_noret_offset_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_add_u32 s34, s4, 32 +; GFX8-NEXT: s_addc_u32 s35, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s34 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v3, s35 +; GFX8-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_uinc_wrap_i64_noret_offset_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] offset:32 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = 
atomicrmw uinc_wrap ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define amdgpu_gfx i64 @flat_atomic_uinc_wrap_i64_ret_scalar(ptr inreg %ptr, i64 inreg %in) { +; GFX7-LABEL: flat_atomic_uinc_wrap_i64_ret_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: v_mov_b32_e32 v3, s5 +; GFX7-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_uinc_wrap_i64_ret_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_mov_b32_e32 v3, s5 +; GFX8-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_uinc_wrap_i64_ret_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %result = atomicrmw uinc_wrap ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret i64 %result +} + +define amdgpu_gfx i64 @flat_atomic_uinc_wrap_i64_ret_offset_scalar(ptr inreg %out, i64 inreg %in) { +; GFX7-LABEL: flat_atomic_uinc_wrap_i64_ret_offset_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s34, s4, 32 +; GFX7-NEXT: s_addc_u32 s35, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v2, s34 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v3, s35 +; GFX7-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_uinc_wrap_i64_ret_offset_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_add_u32 s34, s4, 32 +; GFX8-NEXT: s_addc_u32 s35, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s34 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v3, s35 +; GFX8-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_uinc_wrap_i64_ret_offset_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %result = atomicrmw uinc_wrap ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret i64 %result +} + +define void @flat_atomic_uinc_wrap_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) { +; GFX7-LABEL: flat_atomic_uinc_wrap_i64_noret_offset__amdgpu_no_remote_memory: +; GFX7: ; %bb.0: +; 
GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_uinc_wrap_i64_noret_offset__amdgpu_no_remote_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_uinc_wrap_i64_noret_offset__amdgpu_no_remote_memory: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] offset:32 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw uinc_wrap ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 + ret void +} + +define i64 @flat_atomic_uinc_wrap_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) { +; GFX7-LABEL: flat_atomic_uinc_wrap_i64_ret_offset__amdgpu_no_remote_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_uinc_wrap_i64_ret_offset__amdgpu_no_remote_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_uinc_wrap_i64_ret_offset__amdgpu_no_remote_memory: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %result = atomicrmw uinc_wrap ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 + ret i64 %result +} + +; --------------------------------------------------------------------- +; atomicrmw udec_wrap +; --------------------------------------------------------------------- + +define void @flat_atomic_udec_wrap_i64_noret(ptr %ptr, i64 %in) { +; GFX7-LABEL: flat_atomic_udec_wrap_i64_noret: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_udec_wrap_i64_noret: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_udec_wrap_i64_noret: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %tmp0 = atomicrmw udec_wrap ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define void @flat_atomic_udec_wrap_i64_noret_offset(ptr %out, i64 %in) { +; GFX7-LABEL: flat_atomic_udec_wrap_i64_noret_offset: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_udec_wrap_i64_noret_offset: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_udec_wrap_i64_noret_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] offset:32 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw udec_wrap ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define i64 @flat_atomic_udec_wrap_i64_ret(ptr %ptr, i64 %in) { +; GFX7-LABEL: flat_atomic_udec_wrap_i64_ret: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_udec_wrap_i64_ret: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_udec_wrap_i64_ret: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %result = atomicrmw udec_wrap ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret i64 %result +} + +define i64 @flat_atomic_udec_wrap_i64_ret_offset(ptr %out, i64 %in) { +; GFX7-LABEL: flat_atomic_udec_wrap_i64_ret_offset: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_udec_wrap_i64_ret_offset: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_udec_wrap_i64_ret_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %result = atomicrmw udec_wrap ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret i64 %result +} + +define amdgpu_gfx void @flat_atomic_udec_wrap_i64_noret_scalar(ptr inreg %ptr, i64 inreg %in) { +; GFX7-LABEL: flat_atomic_udec_wrap_i64_noret_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: v_mov_b32_e32 v3, s5 +; GFX7-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_udec_wrap_i64_noret_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_mov_b32_e32 v3, s5 +; GFX8-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_udec_wrap_i64_noret_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %tmp0 = atomicrmw udec_wrap ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define amdgpu_gfx void @flat_atomic_udec_wrap_i64_noret_offset_scalar(ptr inreg %out, i64 inreg %in) { +; GFX7-LABEL: flat_atomic_udec_wrap_i64_noret_offset_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s34, s4, 32 +; GFX7-NEXT: s_addc_u32 s35, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v2, s34 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v3, s35 +; GFX7-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_udec_wrap_i64_noret_offset_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_add_u32 s34, s4, 32 +; GFX8-NEXT: s_addc_u32 s35, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s34 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v3, s35 +; GFX8-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_udec_wrap_i64_noret_offset_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] offset:32 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw udec_wrap ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + 
+define amdgpu_gfx i64 @flat_atomic_udec_wrap_i64_ret_scalar(ptr inreg %ptr, i64 inreg %in) { +; GFX7-LABEL: flat_atomic_udec_wrap_i64_ret_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: v_mov_b32_e32 v3, s5 +; GFX7-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_udec_wrap_i64_ret_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_mov_b32_e32 v3, s5 +; GFX8-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_udec_wrap_i64_ret_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %result = atomicrmw udec_wrap ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret i64 %result +} + +define amdgpu_gfx i64 @flat_atomic_udec_wrap_i64_ret_offset_scalar(ptr inreg %out, i64 inreg %in) { +; GFX7-LABEL: flat_atomic_udec_wrap_i64_ret_offset_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s34, s4, 32 +; GFX7-NEXT: s_addc_u32 s35, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v2, s34 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v3, s35 +; GFX7-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_udec_wrap_i64_ret_offset_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_add_u32 s34, s4, 32 +; GFX8-NEXT: s_addc_u32 s35, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s34 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v3, s35 +; GFX8-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_udec_wrap_i64_ret_offset_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %result = atomicrmw udec_wrap ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret i64 %result +} + +define void @flat_atomic_udec_wrap_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) { +; GFX7-LABEL: flat_atomic_udec_wrap_i64_noret_offset__amdgpu_no_remote_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0 
+; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_udec_wrap_i64_noret_offset__amdgpu_no_remote_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_udec_wrap_i64_noret_offset__amdgpu_no_remote_memory: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] offset:32 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw udec_wrap ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 + ret void +} + +define i64 @flat_atomic_udec_wrap_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) { +; GFX7-LABEL: flat_atomic_udec_wrap_i64_ret_offset__amdgpu_no_remote_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_udec_wrap_i64_ret_offset__amdgpu_no_remote_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_udec_wrap_i64_ret_offset__amdgpu_no_remote_memory: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %result = atomicrmw udec_wrap ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 + ret i64 %result +} + +!0 = !{} +!1 = !{i32 5, i32 6} diff --git a/llvm/test/CodeGen/AMDGPU/fp16_to_fp32.ll b/llvm/test/CodeGen/AMDGPU/fp16_to_fp32.ll index 81859dce04889d73a06885f3d62c1af6a8bf8740..064e88873a175a899cdec492f47e123cdb1519c6 100644 --- a/llvm/test/CodeGen/AMDGPU/fp16_to_fp32.ll +++ b/llvm/test/CodeGen/AMDGPU/fp16_to_fp32.ll @@ -1,7 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 ; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX6 %s ; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-TRUE16 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 
-mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-FAKE16 %s ; RUN: llc -mtriple=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefixes=CYPRESS %s ; RUN: llc -mtriple=r600 -mcpu=cayman -verify-machineinstrs < %s | FileCheck -check-prefixes=CAYMAN %s @@ -44,25 +45,45 @@ define amdgpu_kernel void @test_convert_fp16_to_fp32(ptr addrspace(1) noalias %o ; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX8-NEXT: s_endpgm ; -; GFX11-LABEL: test_convert_fp16_to_fp32: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: s_mov_b32 s6, -1 -; GFX11-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-NEXT: s_mov_b32 s10, s6 -; GFX11-NEXT: s_mov_b32 s11, s7 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s2 -; GFX11-NEXT: s_mov_b32 s9, s3 -; GFX11-NEXT: s_mov_b32 s4, s0 -; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0 -; GFX11-NEXT: s_mov_b32 s5, s1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: test_convert_fp16_to_fp32: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0 +; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v0.l +; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-TRUE16-NEXT: s_nop 0 +; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: test_convert_fp16_to_fp32: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0 +; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-FAKE16-NEXT: s_nop 0 +; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FAKE16-NEXT: s_endpgm ; ; CYPRESS-LABEL: test_convert_fp16_to_fp32: ; CYPRESS: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/fp16_to_fp64.ll b/llvm/test/CodeGen/AMDGPU/fp16_to_fp64.ll index c17be87834aeb714ceb82ba3ffa6ee433a07fba4..6c9f451167b7b0185dbdb25eb434f93d652efae5 100644 --- a/llvm/test/CodeGen/AMDGPU/fp16_to_fp64.ll +++ b/llvm/test/CodeGen/AMDGPU/fp16_to_fp64.ll @@ -1,7 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 ; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX6 %s ; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 
-mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-TRUE16 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-FAKE16 %s + declare double @llvm.convert.from.fp16.f64(i16) nounwind readnone @@ -44,27 +46,49 @@ define amdgpu_kernel void @test_convert_fp16_to_fp64(ptr addrspace(1) noalias %o ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX8-NEXT: s_endpgm ; -; GFX11-LABEL: test_convert_fp16_to_fp64: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: s_mov_b32 s6, -1 -; GFX11-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-NEXT: s_mov_b32 s10, s6 -; GFX11-NEXT: s_mov_b32 s11, s7 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s2 -; GFX11-NEXT: s_mov_b32 s9, s3 -; GFX11-NEXT: s_mov_b32 s4, s0 -; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0 -; GFX11-NEXT: s_mov_b32 s5, s1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 -; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: test_convert_fp16_to_fp64: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0 +; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX11-TRUE16-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 +; GFX11-TRUE16-NEXT: s_nop 0 +; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: test_convert_fp16_to_fp64: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0 +; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX11-FAKE16-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 +; GFX11-FAKE16-NEXT: s_nop 0 +; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FAKE16-NEXT: s_endpgm %val = load i16, ptr addrspace(1) %in, align 2 %cvt = call double @llvm.convert.from.fp16.f64(i16 %val) nounwind readnone store double %cvt, ptr addrspace(1) %out, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/fp32_to_fp16.ll 
b/llvm/test/CodeGen/AMDGPU/fp32_to_fp16.ll index d8a726f251a01e501dc58a464ba2b34bde56ac7b..5bac710070477f9ef58aaa7607d7c7f43d3e0f70 100644 --- a/llvm/test/CodeGen/AMDGPU/fp32_to_fp16.ll +++ b/llvm/test/CodeGen/AMDGPU/fp32_to_fp16.ll @@ -1,7 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX6 %s ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8 %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-TRUE16 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-FAKE16 %s ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefixes=CYPRESS %s declare i16 @llvm.convert.to.fp16.f32(float) nounwind readnone @@ -43,25 +44,45 @@ define amdgpu_kernel void @test_convert_fp32_to_fp16(ptr addrspace(1) noalias %o ; GFX8-NEXT: buffer_store_short v0, off, s[4:7], 0 ; GFX8-NEXT: s_endpgm ; -; GFX11-LABEL: test_convert_fp32_to_fp16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: s_mov_b32 s6, -1 -; GFX11-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-NEXT: s_mov_b32 s10, s6 -; GFX11-NEXT: s_mov_b32 s11, s7 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s2 -; GFX11-NEXT: s_mov_b32 s9, s3 -; GFX11-NEXT: s_mov_b32 s4, s0 -; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0 -; GFX11-NEXT: s_mov_b32 s5, s1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: test_convert_fp32_to_fp16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0 +; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 +; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 +; GFX11-TRUE16-NEXT: s_nop 0 +; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: test_convert_fp32_to_fp16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0 +; 
GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 +; GFX11-FAKE16-NEXT: s_nop 0 +; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FAKE16-NEXT: s_endpgm ; ; CYPRESS-LABEL: test_convert_fp32_to_fp16: ; CYPRESS: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/fpext.f16.ll b/llvm/test/CodeGen/AMDGPU/fpext.f16.ll index 75f4dff14fcbd4d37d58d22a0f12f8e74a311fdd..a40d678e84d72893be9eb8ff165bd7fc0cce3561 100644 --- a/llvm/test/CodeGen/AMDGPU/fpext.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fpext.f16.ll @@ -2,7 +2,8 @@ ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope --check-prefixes=SI %s ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope --check-prefixes=GFX89,VI %s ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope --check-prefixes=GFX89,GFX9 %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope --check-prefixes=GFX11 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope --check-prefixes=GFX11-TRUE16 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope --check-prefixes=GFX11-FAKE16 %s define amdgpu_kernel void @fpext_f16_to_f32( ; SI-LABEL: fpext_f16_to_f32: @@ -59,25 +60,45 @@ define amdgpu_kernel void @fpext_f16_to_f32( ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; -; GFX11-LABEL: fpext_f16_to_f32: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: s_mov_b32 s6, -1 -; GFX11-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-NEXT: s_mov_b32 s10, s6 -; GFX11-NEXT: s_mov_b32 s11, s7 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s2 -; GFX11-NEXT: s_mov_b32 s9, s3 -; GFX11-NEXT: s_mov_b32 s4, s0 -; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0 -; GFX11-NEXT: s_mov_b32 s5, s1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: fpext_f16_to_f32: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0 +; 
GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v0.l +; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-TRUE16-NEXT: s_nop 0 +; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: fpext_f16_to_f32: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0 +; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-FAKE16-NEXT: s_nop 0 +; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) #0 { entry: @@ -145,27 +166,49 @@ define amdgpu_kernel void @fpext_f16_to_f64( ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; -; GFX11-LABEL: fpext_f16_to_f64: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: s_mov_b32 s6, -1 -; GFX11-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-NEXT: s_mov_b32 s10, s6 -; GFX11-NEXT: s_mov_b32 s11, s7 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s2 -; GFX11-NEXT: s_mov_b32 s9, s3 -; GFX11-NEXT: s_mov_b32 s4, s0 -; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0 -; GFX11-NEXT: s_mov_b32 s5, s1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 -; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: fpext_f16_to_f64: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0 +; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX11-TRUE16-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 +; GFX11-TRUE16-NEXT: s_nop 0 +; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: fpext_f16_to_f64: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3 +; GFX11-FAKE16-NEXT: s_mov_b32 
s4, s0 +; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX11-FAKE16-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 +; GFX11-FAKE16-NEXT: s_nop 0 +; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) #0 { entry: @@ -234,28 +277,51 @@ define amdgpu_kernel void @fpext_v2f16_to_v2f32( ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; -; GFX11-LABEL: fpext_v2f16_to_v2f32: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: s_mov_b32 s6, -1 -; GFX11-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-NEXT: s_mov_b32 s10, s6 -; GFX11-NEXT: s_mov_b32 s11, s7 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s2 -; GFX11-NEXT: s_mov_b32 s9, s3 -; GFX11-NEXT: s_mov_b32 s4, s0 -; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0 -; GFX11-NEXT: s_mov_b32 s5, s1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: fpext_v2f16_to_v2f32: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0 +; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v1, v1.l +; GFX11-TRUE16-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 +; GFX11-TRUE16-NEXT: s_nop 0 +; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: fpext_v2f16_to_v2f32: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0 +; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX11-FAKE16-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 +; GFX11-FAKE16-NEXT: s_nop 0 +; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) #0 { entry: @@ -330,31 +396,57 @@ 
define amdgpu_kernel void @fpext_v2f16_to_v2f64( ; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; -; GFX11-LABEL: fpext_v2f16_to_v2f64: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: s_mov_b32 s6, -1 -; GFX11-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-NEXT: s_mov_b32 s10, s6 -; GFX11-NEXT: s_mov_b32 s11, s7 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s2 -; GFX11-NEXT: s_mov_b32 s9, s3 -; GFX11-NEXT: s_mov_b32 s4, s0 -; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0 -; GFX11-NEXT: s_mov_b32 s5, s1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v1 -; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 -; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[4:7], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: fpext_v2f16_to_v2f64: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0 +; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v2, v1.l +; GFX11-TRUE16-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 +; GFX11-TRUE16-NEXT: buffer_store_b128 v[0:3], off, s[4:7], 0 +; GFX11-TRUE16-NEXT: s_nop 0 +; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: fpext_v2f16_to_v2f64: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0 +; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GFX11-FAKE16-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 +; GFX11-FAKE16-NEXT: buffer_store_b128 v[0:3], off, s[4:7], 0 +; GFX11-FAKE16-NEXT: s_nop 0 +; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { entry: @@ -387,19 
+479,35 @@ define amdgpu_kernel void @s_fneg_fpext_f16_to_f32(ptr addrspace(1) %r, i32 %a) ; GFX89-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX89-NEXT: s_endpgm ; -; GFX11-LABEL: s_fneg_fpext_f16_to_f32: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-NEXT: s_mov_b32 s3, 0x31016000 -; GFX11-NEXT: s_mov_b32 s2, -1 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s4 -; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: s_fneg_fpext_f16_to_f32: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_clause 0x1 +; GFX11-TRUE16-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v0.l +; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-TRUE16-NEXT: s_nop 0 +; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: s_fneg_fpext_f16_to_f32: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_clause 0x1 +; GFX11-FAKE16-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s2, -1 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, s4 +; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-FAKE16-NEXT: s_nop 0 +; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FAKE16-NEXT: s_endpgm entry: %a.trunc = trunc i32 %a to i16 %a.val = bitcast i16 %a.trunc to half @@ -463,25 +571,45 @@ define amdgpu_kernel void @fneg_fpext_f16_to_f32( ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; -; GFX11-LABEL: fneg_fpext_f16_to_f32: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: s_mov_b32 s6, -1 -; GFX11-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-NEXT: s_mov_b32 s10, s6 -; GFX11-NEXT: s_mov_b32 s11, s7 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s2 -; GFX11-NEXT: s_mov_b32 s9, s3 -; GFX11-NEXT: s_mov_b32 s4, s0 -; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0 -; GFX11-NEXT: s_mov_b32 s5, s1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cvt_f32_f16_e64 v0, -v0 -; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: fneg_fpext_f16_to_f32: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0 +; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e64 v0, -v0.l +; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-TRUE16-NEXT: 
s_nop 0 +; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: fneg_fpext_f16_to_f32: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0 +; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e64 v0, -v0 +; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-FAKE16-NEXT: s_nop 0 +; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { entry: @@ -547,25 +675,45 @@ define amdgpu_kernel void @fabs_fpext_f16_to_f32( ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; -; GFX11-LABEL: fabs_fpext_f16_to_f32: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: s_mov_b32 s6, -1 -; GFX11-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-NEXT: s_mov_b32 s10, s6 -; GFX11-NEXT: s_mov_b32 s11, s7 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s2 -; GFX11-NEXT: s_mov_b32 s9, s3 -; GFX11-NEXT: s_mov_b32 s4, s0 -; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0 -; GFX11-NEXT: s_mov_b32 s5, s1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cvt_f32_f16_e64 v0, |v0| -; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: fabs_fpext_f16_to_f32: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0 +; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e64 v0, |v0.l| +; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-TRUE16-NEXT: s_nop 0 +; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: fabs_fpext_f16_to_f32: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0 +; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e64 v0, |v0| +; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-FAKE16-NEXT: s_nop 0 +; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { entry: @@ -631,25 +779,45 @@ 
define amdgpu_kernel void @fneg_fabs_fpext_f16_to_f32( ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; -; GFX11-LABEL: fneg_fabs_fpext_f16_to_f32: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: s_mov_b32 s6, -1 -; GFX11-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-NEXT: s_mov_b32 s10, s6 -; GFX11-NEXT: s_mov_b32 s11, s7 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s2 -; GFX11-NEXT: s_mov_b32 s9, s3 -; GFX11-NEXT: s_mov_b32 s4, s0 -; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0 -; GFX11-NEXT: s_mov_b32 s5, s1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cvt_f32_f16_e64 v0, -|v0| -; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: fneg_fabs_fpext_f16_to_f32: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0 +; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e64 v0, -|v0.l| +; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-TRUE16-NEXT: s_nop 0 +; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: fneg_fabs_fpext_f16_to_f32: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0 +; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e64 v0, -|v0| +; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-FAKE16-NEXT: s_nop 0 +; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { entry: @@ -730,29 +898,55 @@ define amdgpu_kernel void @fneg_multi_use_fpext_f16_to_f32( ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm ; -; GFX11-LABEL: fneg_multi_use_fpext_f16_to_f32: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: s_mov_b32 s6, -1 -; GFX11-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-NEXT: s_mov_b32 s10, s6 -; GFX11-NEXT: s_mov_b32 s11, s7 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s2 -; GFX11-NEXT: s_mov_b32 s9, s3 -; GFX11-NEXT: s_mov_b32 s4, s0 -; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0 -; GFX11-NEXT: s_mov_b32 s5, s1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cvt_f32_f16_e64 v1, -v0 -; GFX11-NEXT: v_xor_b32_e32 v0, 0x8000, v0 -; GFX11-NEXT: buffer_store_b32 v1, off, s[4:7], 0 dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 
-; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: fneg_multi_use_fpext_f16_to_f32: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0 +; GFX11-TRUE16-NEXT: buffer_load_u16 v1, off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l +; GFX11-TRUE16-NEXT: v_xor_b32_e32 v1, 0x8000, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e64 v0, -v0.l +; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 dlc +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: buffer_store_b16 v1, off, s[4:7], 0 dlc +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: s_nop 0 +; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: fneg_multi_use_fpext_f16_to_f32: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0 +; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e64 v1, -v0 +; GFX11-FAKE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0 +; GFX11-FAKE16-NEXT: buffer_store_b32 v1, off, s[4:7], 0 dlc +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 dlc +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: s_nop 0 +; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { entry: @@ -833,29 +1027,55 @@ define amdgpu_kernel void @fneg_multi_foldable_use_fpext_f16_to_f32( ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm ; -; GFX11-LABEL: fneg_multi_foldable_use_fpext_f16_to_f32: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: s_mov_b32 s6, -1 -; GFX11-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-NEXT: s_mov_b32 s10, s6 -; GFX11-NEXT: s_mov_b32 s11, s7 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s2 -; GFX11-NEXT: s_mov_b32 s9, s3 -; GFX11-NEXT: s_mov_b32 s4, s0 -; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0 -; GFX11-NEXT: s_mov_b32 s5, s1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cvt_f32_f16_e64 v1, -v0 -; GFX11-NEXT: v_mul_f16_e64 v0, -v0, v0 -; GFX11-NEXT: buffer_store_b32 v1, off, s[4:7], 0 dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: fneg_multi_foldable_use_fpext_f16_to_f32: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, 
0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0 +; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_mul_f16_e64 v0.h, -v0.l, v0.l +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e64 v1, -v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h +; GFX11-TRUE16-NEXT: buffer_store_b32 v1, off, s[4:7], 0 dlc +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 dlc +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: s_nop 0 +; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: fneg_multi_foldable_use_fpext_f16_to_f32: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0 +; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e64 v1, -v0 +; GFX11-FAKE16-NEXT: v_mul_f16_e64 v0, -v0, v0 +; GFX11-FAKE16-NEXT: buffer_store_b32 v1, off, s[4:7], 0 dlc +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 dlc +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: s_nop 0 +; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { entry: @@ -935,29 +1155,55 @@ define amdgpu_kernel void @fabs_multi_use_fpext_f16_to_f32( ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm ; -; GFX11-LABEL: fabs_multi_use_fpext_f16_to_f32: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: s_mov_b32 s6, -1 -; GFX11-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-NEXT: s_mov_b32 s10, s6 -; GFX11-NEXT: s_mov_b32 s11, s7 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s2 -; GFX11-NEXT: s_mov_b32 s9, s3 -; GFX11-NEXT: s_mov_b32 s4, s0 -; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0 -; GFX11-NEXT: s_mov_b32 s5, s1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cvt_f32_f16_e64 v1, |v0| -; GFX11-NEXT: v_and_b32_e32 v0, 0x7fff, v0 -; GFX11-NEXT: buffer_store_b32 v1, off, s[4:7], 0 dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: fabs_multi_use_fpext_f16_to_f32: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0 +; 
GFX11-TRUE16-NEXT: buffer_load_u16 v1, off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0x7fff, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e64 v0, |v0.l| +; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 dlc +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: buffer_store_b16 v1, off, s[4:7], 0 dlc +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: s_nop 0 +; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: fabs_multi_use_fpext_f16_to_f32: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0 +; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e64 v1, |v0| +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0x7fff, v0 +; GFX11-FAKE16-NEXT: buffer_store_b32 v1, off, s[4:7], 0 dlc +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 dlc +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: s_nop 0 +; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { entry: @@ -1038,29 +1284,55 @@ define amdgpu_kernel void @fabs_multi_foldable_use_fpext_f16_to_f32( ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm ; -; GFX11-LABEL: fabs_multi_foldable_use_fpext_f16_to_f32: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: s_mov_b32 s6, -1 -; GFX11-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-NEXT: s_mov_b32 s10, s6 -; GFX11-NEXT: s_mov_b32 s11, s7 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s2 -; GFX11-NEXT: s_mov_b32 s9, s3 -; GFX11-NEXT: s_mov_b32 s4, s0 -; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0 -; GFX11-NEXT: s_mov_b32 s5, s1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cvt_f32_f16_e64 v1, |v0| -; GFX11-NEXT: v_mul_f16_e64 v0, |v0|, v0 -; GFX11-NEXT: buffer_store_b32 v1, off, s[4:7], 0 dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: fabs_multi_foldable_use_fpext_f16_to_f32: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0 +; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_mul_f16_e64 v0.h, |v0.l|, v0.l +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e64 v1, |v0.l| +; 
GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h +; GFX11-TRUE16-NEXT: buffer_store_b32 v1, off, s[4:7], 0 dlc +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 dlc +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: s_nop 0 +; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: fabs_multi_foldable_use_fpext_f16_to_f32: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0 +; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e64 v1, |v0| +; GFX11-FAKE16-NEXT: v_mul_f16_e64 v0, |v0|, v0 +; GFX11-FAKE16-NEXT: buffer_store_b32 v1, off, s[4:7], 0 dlc +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 dlc +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: s_nop 0 +; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { entry: @@ -1140,29 +1412,55 @@ define amdgpu_kernel void @fabs_fneg_multi_use_fpext_f16_to_f32( ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm ; -; GFX11-LABEL: fabs_fneg_multi_use_fpext_f16_to_f32: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: s_mov_b32 s6, -1 -; GFX11-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-NEXT: s_mov_b32 s10, s6 -; GFX11-NEXT: s_mov_b32 s11, s7 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s2 -; GFX11-NEXT: s_mov_b32 s9, s3 -; GFX11-NEXT: s_mov_b32 s4, s0 -; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0 -; GFX11-NEXT: s_mov_b32 s5, s1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cvt_f32_f16_e64 v1, -|v0| -; GFX11-NEXT: v_or_b32_e32 v0, 0x8000, v0 -; GFX11-NEXT: buffer_store_b32 v1, off, s[4:7], 0 dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: fabs_fneg_multi_use_fpext_f16_to_f32: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0 +; GFX11-TRUE16-NEXT: buffer_load_u16 v1, off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, 0x8000, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e64 v0, -|v0.l| +; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 dlc +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: buffer_store_b16 
v1, off, s[4:7], 0 dlc +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: s_nop 0 +; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: fabs_fneg_multi_use_fpext_f16_to_f32: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0 +; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e64 v1, -|v0| +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, 0x8000, v0 +; GFX11-FAKE16-NEXT: buffer_store_b32 v1, off, s[4:7], 0 dlc +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 dlc +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: s_nop 0 +; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { entry: @@ -1244,29 +1542,55 @@ define amdgpu_kernel void @fabs_fneg_multi_foldable_use_fpext_f16_to_f32( ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm ; -; GFX11-LABEL: fabs_fneg_multi_foldable_use_fpext_f16_to_f32: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: s_mov_b32 s6, -1 -; GFX11-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-NEXT: s_mov_b32 s10, s6 -; GFX11-NEXT: s_mov_b32 s11, s7 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s2 -; GFX11-NEXT: s_mov_b32 s9, s3 -; GFX11-NEXT: s_mov_b32 s4, s0 -; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0 -; GFX11-NEXT: s_mov_b32 s5, s1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cvt_f32_f16_e64 v1, -|v0| -; GFX11-NEXT: v_mul_f16_e64 v0, -|v0|, v0 -; GFX11-NEXT: buffer_store_b32 v1, off, s[4:7], 0 dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: fabs_fneg_multi_foldable_use_fpext_f16_to_f32: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0 +; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_mul_f16_e64 v0.h, -|v0.l|, v0.l +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e64 v1, -|v0.l| +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h +; GFX11-TRUE16-NEXT: buffer_store_b32 v1, off, s[4:7], 0 dlc +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 dlc +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: s_nop 0 +; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-TRUE16-NEXT: s_endpgm +; +; 
GFX11-FAKE16-LABEL: fabs_fneg_multi_foldable_use_fpext_f16_to_f32: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0 +; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e64 v1, -|v0| +; GFX11-FAKE16-NEXT: v_mul_f16_e64 v0, -|v0|, v0 +; GFX11-FAKE16-NEXT: buffer_store_b32 v1, off, s[4:7], 0 dlc +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 dlc +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: s_nop 0 +; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { entry: diff --git a/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll b/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll index 6e8e6c0721789555721fe26b62765ac93482a9fb..786fe03164690ee202567921064132342a3f9806 100644 --- a/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll +++ b/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll @@ -136,12 +136,12 @@ define i128 @fptosi_f64_to_i128(double %x) { ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: v_mov_b32_e32 v5, v1 ; GISEL-NEXT: v_mov_b32_e32 v4, v0 -; GISEL-NEXT: v_lshrrev_b32_e32 v0, 20, v5 -; GISEL-NEXT: v_and_b32_e32 v6, 0x7ff, v0 +; GISEL-NEXT: v_lshrrev_b32_e32 v2, 20, v5 ; GISEL-NEXT: v_mov_b32_e32 v0, 0x3ff ; GISEL-NEXT: s_mov_b64 s[4:5], 0 -; GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-NEXT: v_mov_b32_e32 v7, 0 +; GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GISEL-NEXT: v_and_b32_e32 v6, 0x7ff, v2 ; GISEL-NEXT: v_cmp_ge_u64_e32 vcc, v[6:7], v[0:1] ; GISEL-NEXT: s_mov_b64 s[6:7], s[4:5] ; GISEL-NEXT: v_mov_b32_e32 v0, s4 @@ -508,12 +508,12 @@ define i128 @fptoui_f64_to_i128(double %x) { ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: v_mov_b32_e32 v5, v1 ; GISEL-NEXT: v_mov_b32_e32 v4, v0 -; GISEL-NEXT: v_lshrrev_b32_e32 v0, 20, v5 -; GISEL-NEXT: v_and_b32_e32 v6, 0x7ff, v0 +; GISEL-NEXT: v_lshrrev_b32_e32 v2, 20, v5 ; GISEL-NEXT: v_mov_b32_e32 v0, 0x3ff ; GISEL-NEXT: s_mov_b64 s[4:5], 0 -; GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-NEXT: v_mov_b32_e32 v7, 0 +; GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GISEL-NEXT: v_and_b32_e32 v6, 0x7ff, v2 ; GISEL-NEXT: v_cmp_ge_u64_e32 vcc, v[6:7], v[0:1] ; GISEL-NEXT: s_mov_b64 s[6:7], s[4:5] ; GISEL-NEXT: v_mov_b32_e32 v0, s4 diff --git a/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll b/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll index 0e12cca1900ce62a913a81aa85722e3f30b0c029..327f2653c47462fae141d03fe9779d9af7af66c2 100644 --- a/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll @@ -1,7 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=SI %s ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=VI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global 
-verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX11 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX11-TRUE16 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX11-FAKE16 %s + define amdgpu_kernel void @fptosi_f16_to_i16( ; SI-LABEL: fptosi_f16_to_i16: @@ -41,25 +43,45 @@ define amdgpu_kernel void @fptosi_f16_to_i16( ; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; -; GFX11-LABEL: fptosi_f16_to_i16: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: s_mov_b32 s6, -1 -; GFX11-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-NEXT: s_mov_b32 s10, s6 -; GFX11-NEXT: s_mov_b32 s11, s7 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s2 -; GFX11-NEXT: s_mov_b32 s9, s3 -; GFX11-NEXT: s_mov_b32 s4, s0 -; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0 -; GFX11-NEXT: s_mov_b32 s5, s1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cvt_i16_f16_e32 v0, v0 -; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: fptosi_f16_to_i16: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0 +; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_cvt_i16_f16_e32 v0.l, v0.l +; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 +; GFX11-TRUE16-NEXT: s_nop 0 +; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: fptosi_f16_to_i16: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0 +; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_cvt_i16_f16_e32 v0, v0 +; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 +; GFX11-FAKE16-NEXT: s_nop 0 +; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { entry: @@ -108,27 +130,49 @@ define amdgpu_kernel void @fptosi_f16_to_i32( ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; -; GFX11-LABEL: fptosi_f16_to_i32: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: s_mov_b32 s6, -1 -; GFX11-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-NEXT: s_mov_b32 s10, s6 -; GFX11-NEXT: s_mov_b32 s11, s7 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX11-NEXT: s_mov_b32 s8, s2 -; GFX11-NEXT: s_mov_b32 s9, s3 -; GFX11-NEXT: s_mov_b32 s4, s0 -; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0 -; GFX11-NEXT: s_mov_b32 s5, s1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cvt_i32_f32_e32 v0, v0 -; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: fptosi_f16_to_i32: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0 +; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cvt_i32_f32_e32 v0, v0 +; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-TRUE16-NEXT: s_nop 0 +; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: fptosi_f16_to_i32: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0 +; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cvt_i32_f32_e32 v0, v0 +; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-FAKE16-NEXT: s_nop 0 +; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { entry: @@ -182,28 +226,51 @@ define amdgpu_kernel void @fptosi_f16_to_i64( ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm ; -; GFX11-LABEL: fptosi_f16_to_i64: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: s_mov_b32 s6, -1 -; GFX11-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-NEXT: s_mov_b32 s10, s6 -; GFX11-NEXT: s_mov_b32 s11, s7 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s2 -; GFX11-NEXT: s_mov_b32 s9, s3 -; GFX11-NEXT: s_mov_b32 s4, s0 -; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0 -; GFX11-NEXT: s_mov_b32 s5, s1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cvt_i32_f32_e32 v0, v0 -; GFX11-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: fptosi_f16_to_i64: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: 
s_mov_b32 s6, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0 +; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cvt_i32_f32_e32 v0, v0 +; GFX11-TRUE16-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX11-TRUE16-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 +; GFX11-TRUE16-NEXT: s_nop 0 +; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: fptosi_f16_to_i64: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0 +; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cvt_i32_f32_e32 v0, v0 +; GFX11-FAKE16-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX11-FAKE16-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 +; GFX11-FAKE16-NEXT: s_nop 0 +; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { entry: @@ -259,31 +326,60 @@ define amdgpu_kernel void @fptosi_v2f16_to_v2i16( ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; -; GFX11-LABEL: fptosi_v2f16_to_v2i16: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: s_mov_b32 s6, -1 -; GFX11-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-NEXT: s_mov_b32 s10, s6 -; GFX11-NEXT: s_mov_b32 s11, s7 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s2 -; GFX11-NEXT: s_mov_b32 s9, s3 -; GFX11-NEXT: s_mov_b32 s4, s0 -; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0 -; GFX11-NEXT: s_mov_b32 s5, s1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX11-NEXT: v_cvt_i16_f16_e32 v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cvt_i16_f16_e32 v1, v1 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: fptosi_v2f16_to_v2i16: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0 +; 
GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX11-TRUE16-NEXT: v_cvt_i16_f16_e32 v0.l, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cvt_i16_f16_e32 v0.h, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-TRUE16-NEXT: s_nop 0 +; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: fptosi_v2f16_to_v2i16: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0 +; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX11-FAKE16-NEXT: v_cvt_i16_f16_e32 v0, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cvt_i16_f16_e32 v1, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-FAKE16-NEXT: s_nop 0 +; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { entry: @@ -337,31 +433,57 @@ define amdgpu_kernel void @fptosi_v2f16_to_v2i32( ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm ; -; GFX11-LABEL: fptosi_v2f16_to_v2i32: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: s_mov_b32 s6, -1 -; GFX11-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-NEXT: s_mov_b32 s10, s6 -; GFX11-NEXT: s_mov_b32 s11, s7 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s2 -; GFX11-NEXT: s_mov_b32 s9, s3 -; GFX11-NEXT: s_mov_b32 s4, s0 -; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0 -; GFX11-NEXT: s_mov_b32 s5, s1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX11-NEXT: v_cvt_i32_f32_e32 v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_cvt_i32_f32_e32 v1, v1 -; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: fptosi_v2f16_to_v2i32: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 +; GFX11-TRUE16-NEXT: 
s_mov_b32 s11, s7 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0 +; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v1, v1.l +; GFX11-TRUE16-NEXT: v_cvt_i32_f32_e32 v0, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cvt_i32_f32_e32 v1, v1 +; GFX11-TRUE16-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 +; GFX11-TRUE16-NEXT: s_nop 0 +; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: fptosi_v2f16_to_v2i32: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0 +; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX11-FAKE16-NEXT: v_cvt_i32_f32_e32 v0, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cvt_i32_f32_e32 v1, v1 +; GFX11-FAKE16-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 +; GFX11-FAKE16-NEXT: s_nop 0 +; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { entry: @@ -422,34 +544,63 @@ define amdgpu_kernel void @fptosi_v2f16_to_v2i64( ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; VI-NEXT: s_endpgm ; -; GFX11-LABEL: fptosi_v2f16_to_v2i64: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: s_mov_b32 s6, -1 -; GFX11-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-NEXT: s_mov_b32 s10, s6 -; GFX11-NEXT: s_mov_b32 s11, s7 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s2 -; GFX11-NEXT: s_mov_b32 s9, s3 -; GFX11-NEXT: s_mov_b32 s4, s0 -; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0 -; GFX11-NEXT: s_mov_b32 s5, s1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX11-NEXT: v_cvt_i32_f32_e32 v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cvt_i32_f32_e32 v2, v1 -; GFX11-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[4:7], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: fptosi_v2f16_to_v2i64: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], 
s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0 +; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v1, v1.l +; GFX11-TRUE16-NEXT: v_cvt_i32_f32_e32 v0, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cvt_i32_f32_e32 v2, v1 +; GFX11-TRUE16-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX11-TRUE16-NEXT: buffer_store_b128 v[0:3], off, s[4:7], 0 +; GFX11-TRUE16-NEXT: s_nop 0 +; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: fptosi_v2f16_to_v2i64: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0 +; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX11-FAKE16-NEXT: v_cvt_i32_f32_e32 v0, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cvt_i32_f32_e32 v2, v1 +; GFX11-FAKE16-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX11-FAKE16-NEXT: buffer_store_b128 v[0:3], off, s[4:7], 0 +; GFX11-FAKE16-NEXT: s_nop 0 +; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { entry: @@ -485,21 +636,38 @@ define amdgpu_kernel void @fptosi_f16_to_i1(ptr addrspace(1) %out, half %in) { ; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; -; GFX11-LABEL: fptosi_f16_to_i1: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-NEXT: s_mov_b32 s3, 0x31016000 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_eq_f16_e64 s2, -1.0, s4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2 -; GFX11-NEXT: s_mov_b32 s2, -1 -; GFX11-NEXT: buffer_store_b8 v0, off, s[0:3], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: fptosi_f16_to_i1: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_clause 0x1 +; GFX11-TRUE16-NEXT: s_load_b32 
s4, s[2:3], 0x2c +; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cmp_eq_f16_e32 vcc_lo, -1.0, v0 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11-TRUE16-NEXT: buffer_store_b8 v0, off, s[0:3], 0 +; GFX11-TRUE16-NEXT: s_nop 0 +; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: fptosi_f16_to_i1: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_clause 0x1 +; GFX11-FAKE16-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_eq_f16_e64 s2, -1.0, s4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s2, -1 +; GFX11-FAKE16-NEXT: buffer_store_b8 v0, off, s[0:3], 0 +; GFX11-FAKE16-NEXT: s_nop 0 +; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FAKE16-NEXT: s_endpgm entry: %conv = fptosi half %in to i1 store i1 %conv, ptr addrspace(1) %out diff --git a/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll b/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll index abc5c7af13b0ce5d1c6bcd31db88194060eea27d..ba540f4948b50a13e292070e874a698a32c84dfd 100644 --- a/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll @@ -1,7 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=SI %s ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=VI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX11 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX11-TRUE16 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX11-FAKE16 %s + define amdgpu_kernel void @fptoui_f16_to_i16( ; SI-LABEL: fptoui_f16_to_i16: @@ -41,25 +43,45 @@ define amdgpu_kernel void @fptoui_f16_to_i16( ; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; -; GFX11-LABEL: fptoui_f16_to_i16: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: s_mov_b32 s6, -1 -; GFX11-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-NEXT: s_mov_b32 s10, s6 -; GFX11-NEXT: s_mov_b32 s11, s7 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s2 -; GFX11-NEXT: s_mov_b32 s9, s3 -; GFX11-NEXT: s_mov_b32 s4, s0 -; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0 -; GFX11-NEXT: s_mov_b32 s5, s1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cvt_u16_f16_e32 v0, v0 -; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: 
s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: fptoui_f16_to_i16: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0 +; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_cvt_u16_f16_e32 v0.l, v0.l +; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 +; GFX11-TRUE16-NEXT: s_nop 0 +; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: fptoui_f16_to_i16: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0 +; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_cvt_u16_f16_e32 v0, v0 +; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 +; GFX11-FAKE16-NEXT: s_nop 0 +; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { entry: @@ -108,27 +130,49 @@ define amdgpu_kernel void @fptoui_f16_to_i32( ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; -; GFX11-LABEL: fptoui_f16_to_i32: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: s_mov_b32 s6, -1 -; GFX11-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-NEXT: s_mov_b32 s10, s6 -; GFX11-NEXT: s_mov_b32 s11, s7 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s2 -; GFX11-NEXT: s_mov_b32 s9, s3 -; GFX11-NEXT: s_mov_b32 s4, s0 -; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0 -; GFX11-NEXT: s_mov_b32 s5, s1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: fptoui_f16_to_i32: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0 +; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-TRUE16-NEXT: s_nop 0 +; GFX11-TRUE16-NEXT: s_sendmsg 
sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: fptoui_f16_to_i32: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0 +; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-FAKE16-NEXT: s_nop 0 +; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { entry: @@ -182,28 +226,51 @@ define amdgpu_kernel void @fptoui_f16_to_i64( ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm ; -; GFX11-LABEL: fptoui_f16_to_i64: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: s_mov_b32 s6, -1 -; GFX11-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-NEXT: s_mov_b32 s10, s6 -; GFX11-NEXT: s_mov_b32 s11, s7 -; GFX11-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s2 -; GFX11-NEXT: s_mov_b32 s9, s3 -; GFX11-NEXT: s_mov_b32 s4, s0 -; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0 -; GFX11-NEXT: s_mov_b32 s5, s1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: fptoui_f16_to_i64: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0 +; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX11-TRUE16-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 +; GFX11-TRUE16-NEXT: s_nop 0 +; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: fptoui_f16_to_i64: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0 +; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-FAKE16-NEXT: 
s_mov_b32 s5, s1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX11-FAKE16-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 +; GFX11-FAKE16-NEXT: s_nop 0 +; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { entry: @@ -258,31 +325,60 @@ define amdgpu_kernel void @fptoui_v2f16_to_v2i16( ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; -; GFX11-LABEL: fptoui_v2f16_to_v2i16: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: s_mov_b32 s6, -1 -; GFX11-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-NEXT: s_mov_b32 s10, s6 -; GFX11-NEXT: s_mov_b32 s11, s7 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s2 -; GFX11-NEXT: s_mov_b32 s9, s3 -; GFX11-NEXT: s_mov_b32 s4, s0 -; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0 -; GFX11-NEXT: s_mov_b32 s5, s1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX11-NEXT: v_cvt_u16_f16_e32 v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cvt_u16_f16_e32 v1, v1 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: fptoui_v2f16_to_v2i16: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0 +; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX11-TRUE16-NEXT: v_cvt_u16_f16_e32 v0.l, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cvt_u16_f16_e32 v0.h, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-TRUE16-NEXT: s_nop 0 +; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: fptoui_v2f16_to_v2i16: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0 +; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX11-FAKE16-NEXT: v_cvt_u16_f16_e32 v0, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cvt_u16_f16_e32 v1, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-FAKE16-NEXT: s_nop 0 +; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { entry: @@ -336,31 +432,57 @@ define amdgpu_kernel void @fptoui_v2f16_to_v2i32( ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm ; -; GFX11-LABEL: fptoui_v2f16_to_v2i32: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: s_mov_b32 s6, -1 -; GFX11-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-NEXT: s_mov_b32 s10, s6 -; GFX11-NEXT: s_mov_b32 s11, s7 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s2 -; GFX11-NEXT: s_mov_b32 s9, s3 -; GFX11-NEXT: s_mov_b32 s4, s0 -; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0 -; GFX11-NEXT: s_mov_b32 s5, s1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: fptoui_v2f16_to_v2i32: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0 +; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v1, v1.l +; GFX11-TRUE16-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GFX11-TRUE16-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 +; GFX11-TRUE16-NEXT: s_nop 0 +; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: fptoui_v2f16_to_v2i32: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0 +; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX11-FAKE16-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GFX11-FAKE16-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 +; GFX11-FAKE16-NEXT: s_nop 0 +; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { entry: @@ -421,33 +543,61 @@ define amdgpu_kernel void @fptoui_v2f16_to_v2i64( ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; VI-NEXT: s_endpgm ; -; GFX11-LABEL: fptoui_v2f16_to_v2i64: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: s_mov_b32 s6, -1 -; GFX11-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-NEXT: s_mov_b32 s10, s6 -; GFX11-NEXT: s_mov_b32 s11, s7 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s2 -; GFX11-NEXT: s_mov_b32 s9, s3 -; GFX11-NEXT: s_mov_b32 s4, s0 -; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0 -; GFX11-NEXT: s_mov_b32 s5, s1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v1 -; GFX11-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_cvt_u32_f32_e32 v2, v2 -; GFX11-NEXT: v_mov_b32_e32 v3, v1 -; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[4:7], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: fptoui_v2f16_to_v2i64: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0 +; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v2, v1.l +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-TRUE16-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v1 +; GFX11-TRUE16-NEXT: buffer_store_b128 v[0:3], off, s[4:7], 0 +; GFX11-TRUE16-NEXT: s_nop 0 +; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: fptoui_v2f16_to_v2i64: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s8, 
s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0 +; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-FAKE16-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v1 +; GFX11-FAKE16-NEXT: buffer_store_b128 v[0:3], off, s[4:7], 0 +; GFX11-FAKE16-NEXT: s_nop 0 +; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { entry: @@ -484,21 +634,38 @@ define amdgpu_kernel void @fptoui_f16_to_i1(ptr addrspace(1) %out, half %in) { ; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; -; GFX11-LABEL: fptoui_f16_to_i1: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-NEXT: s_mov_b32 s3, 0x31016000 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_eq_f16_e64 s2, 1.0, s4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2 -; GFX11-NEXT: s_mov_b32 s2, -1 -; GFX11-NEXT: buffer_store_b8 v0, off, s[0:3], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: fptoui_f16_to_i1: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_clause 0x1 +; GFX11-TRUE16-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cmp_eq_f16_e32 vcc_lo, 1.0, v0 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11-TRUE16-NEXT: buffer_store_b8 v0, off, s[0:3], 0 +; GFX11-TRUE16-NEXT: s_nop 0 +; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: fptoui_f16_to_i1: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_clause 0x1 +; GFX11-FAKE16-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_eq_f16_e64 s2, 1.0, s4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s2, -1 +; GFX11-FAKE16-NEXT: buffer_store_b8 v0, off, s[0:3], 0 +; GFX11-FAKE16-NEXT: s_nop 0 +; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FAKE16-NEXT: s_endpgm entry: %conv = fptoui half %in to i1 store i1 %conv, ptr addrspace(1) %out diff --git a/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-preserve-cc.ll b/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-preserve-cc.ll index c62b4e565078e423157e8441db82e9b0290bc4ce..2e2a1094ba99ae5b0e88cd9db5a0fcd52e703328 100644 --- a/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-preserve-cc.ll +++ 
b/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-preserve-cc.ll @@ -996,7 +996,7 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_preserve_cc_i16(i16 inreg ; GISEL-GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr8 ; GISEL-GFX11-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF ; GISEL-GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]] - ; GISEL-GFX11-NEXT: [[V_ADD_NC_U16_e64_:%[0-9]+]]:vgpr_32 = V_ADD_NC_U16_e64 0, [[COPY2]], 0, [[COPY1]], 0, 0, implicit $exec + ; GISEL-GFX11-NEXT: [[V_ADD_NC_U16_e64_:%[0-9]+]]:vgpr_32 = V_ADD_NC_U16_fake16_e64 0, [[COPY2]], 0, [[COPY1]], 0, 0, implicit $exec ; GISEL-GFX11-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[DEF]] ; GISEL-GFX11-NEXT: FLAT_STORE_SHORT [[COPY3]], [[V_ADD_NC_U16_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`) ; GISEL-GFX11-NEXT: S_ENDPGM 0 @@ -1020,7 +1020,7 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_preserve_cc_i16(i16 inreg ; DAGISEL-GFX11-WF32-NEXT: {{ $}} ; DAGISEL-GFX11-WF32-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8 ; DAGISEL-GFX11-WF32-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; DAGISEL-GFX11-WF32-NEXT: [[V_ADD_NC_U16_e64_:%[0-9]+]]:vgpr_32 = V_ADD_NC_U16_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $exec + ; DAGISEL-GFX11-WF32-NEXT: [[V_ADD_NC_U16_e64_:%[0-9]+]]:vgpr_32 = V_ADD_NC_U16_fake16_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $exec ; DAGISEL-GFX11-WF32-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF ; DAGISEL-GFX11-WF32-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[DEF]] ; DAGISEL-GFX11-WF32-NEXT: FLAT_STORE_SHORT killed [[COPY2]], killed [[V_ADD_NC_U16_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`) @@ -1032,7 +1032,7 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_preserve_cc_i16(i16 inreg ; DAGISEL-GFX11-WF64-NEXT: {{ $}} ; DAGISEL-GFX11-WF64-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8 ; DAGISEL-GFX11-WF64-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; DAGISEL-GFX11-WF64-NEXT: [[V_ADD_NC_U16_e64_:%[0-9]+]]:vgpr_32 = V_ADD_NC_U16_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $exec + ; DAGISEL-GFX11-WF64-NEXT: [[V_ADD_NC_U16_e64_:%[0-9]+]]:vgpr_32 = V_ADD_NC_U16_fake16_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $exec ; DAGISEL-GFX11-WF64-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF ; DAGISEL-GFX11-WF64-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[DEF]] ; DAGISEL-GFX11-WF64-NEXT: FLAT_STORE_SHORT killed [[COPY2]], killed [[V_ADD_NC_U16_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`) diff --git a/llvm/test/CodeGen/AMDGPU/itofp.i128.ll b/llvm/test/CodeGen/AMDGPU/itofp.i128.ll index 38d928a006fb206046d93e3293f5048bca57b586..2999ddb8315883eb08900e351051620bad147988 100644 --- a/llvm/test/CodeGen/AMDGPU/itofp.i128.ll +++ b/llvm/test/CodeGen/AMDGPU/itofp.i128.ll @@ -673,38 +673,38 @@ define double @sitofp_i128_to_f64(i128 %x) { ; GISEL-NEXT: v_ashrrev_i32_e32 v6, 31, v3 ; GISEL-NEXT: v_xor_b32_e32 v0, v6, v4 ; GISEL-NEXT: v_xor_b32_e32 v1, v6, v5 -; GISEL-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v6 -; GISEL-NEXT: v_xor_b32_e32 v2, v6, v2 -; GISEL-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v6, vcc -; GISEL-NEXT: v_xor_b32_e32 v3, v6, v3 -; GISEL-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v6, vcc -; GISEL-NEXT: v_ffbh_u32_e32 v5, v0 -; GISEL-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v6, vcc -; GISEL-NEXT: v_ffbh_u32_e32 v4, v1 -; GISEL-NEXT: v_add_u32_e32 v5, 32, v5 -; GISEL-NEXT: v_ffbh_u32_e32 v7, v2 -; GISEL-NEXT: v_min_u32_e32 v4, v4, v5 -; GISEL-NEXT: v_ffbh_u32_e32 v5, v3 +; GISEL-NEXT: v_xor_b32_e32 
v4, v6, v2 +; GISEL-NEXT: v_sub_co_u32_e32 v2, vcc, v0, v6 +; GISEL-NEXT: v_xor_b32_e32 v5, v6, v3 +; GISEL-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v6, vcc +; GISEL-NEXT: v_subb_co_u32_e32 v4, vcc, v4, v6, vcc +; GISEL-NEXT: v_ffbh_u32_e32 v1, v2 +; GISEL-NEXT: v_subb_co_u32_e32 v5, vcc, v5, v6, vcc +; GISEL-NEXT: v_ffbh_u32_e32 v0, v3 +; GISEL-NEXT: v_add_u32_e32 v1, 32, v1 +; GISEL-NEXT: v_ffbh_u32_e32 v7, v4 +; GISEL-NEXT: v_min_u32_e32 v0, v0, v1 +; GISEL-NEXT: v_ffbh_u32_e32 v1, v5 ; GISEL-NEXT: v_add_u32_e32 v7, 32, v7 -; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] -; GISEL-NEXT: v_add_u32_e32 v4, 64, v4 -; GISEL-NEXT: v_min_u32_e32 v5, v5, v7 -; GISEL-NEXT: v_cndmask_b32_e32 v9, v5, v4, vcc +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[4:5] +; GISEL-NEXT: v_add_u32_e32 v0, 64, v0 +; GISEL-NEXT: v_min_u32_e32 v1, v1, v7 +; GISEL-NEXT: v_cndmask_b32_e32 v9, v1, v0, vcc ; GISEL-NEXT: v_sub_u32_e32 v8, 0x80, v9 ; GISEL-NEXT: v_sub_u32_e32 v7, 0x7f, v9 ; GISEL-NEXT: v_cmp_ge_i32_e32 vcc, 53, v8 ; GISEL-NEXT: ; implicit-def: $vgpr10 -; GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GISEL-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GISEL-NEXT: ; %bb.2: ; %itofp-if-else -; GISEL-NEXT: v_add_u32_e32 v2, 0xffffffb5, v9 -; GISEL-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1] -; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v2 -; GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v0, vcc +; GISEL-NEXT: v_add_u32_e32 v4, 0xffffffb5, v9 +; GISEL-NEXT: v_lshlrev_b64 v[0:1], v4, v[2:3] +; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v4 +; GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v10, 0, v1, vcc ; GISEL-NEXT: ; implicit-def: $vgpr8 -; GISEL-NEXT: ; implicit-def: $vgpr0 +; GISEL-NEXT: ; implicit-def: $vgpr2 ; GISEL-NEXT: ; implicit-def: $vgpr9 ; GISEL-NEXT: ; %bb.3: ; %Flow3 ; GISEL-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] @@ -721,89 +721,88 @@ define double @sitofp_i128_to_f64(i128 %x) { ; GISEL-NEXT: ; %bb.6: ; %itofp-sw-default ; GISEL-NEXT: v_sub_u32_e32 v14, 0x49, v9 ; GISEL-NEXT: v_sub_u32_e32 v10, 64, v14 -; GISEL-NEXT: v_lshrrev_b64 v[4:5], v14, v[0:1] -; GISEL-NEXT: v_lshlrev_b64 v[10:11], v10, v[2:3] +; GISEL-NEXT: v_lshrrev_b64 v[0:1], v14, v[2:3] +; GISEL-NEXT: v_lshlrev_b64 v[10:11], v10, v[4:5] ; GISEL-NEXT: v_subrev_u32_e32 v15, 64, v14 -; GISEL-NEXT: v_or_b32_e32 v10, v4, v10 -; GISEL-NEXT: v_or_b32_e32 v11, v5, v11 -; GISEL-NEXT: v_lshrrev_b64 v[4:5], v15, v[2:3] -; GISEL-NEXT: v_lshrrev_b64 v[12:13], v14, v[2:3] +; GISEL-NEXT: v_lshrrev_b64 v[12:13], v14, v[4:5] +; GISEL-NEXT: v_or_b32_e32 v10, v0, v10 +; GISEL-NEXT: v_or_b32_e32 v11, v1, v11 +; GISEL-NEXT: v_lshrrev_b64 v[0:1], v15, v[4:5] ; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v14 +; GISEL-NEXT: v_add_u32_e32 v9, 55, v9 +; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc ; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v14 -; GISEL-NEXT: v_add_u32_e32 v14, 55, v9 -; GISEL-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v5, v5, v11, vcc -; GISEL-NEXT: v_sub_u32_e32 v11, 64, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v13, v4, v0, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e64 v4, v5, v1, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e32 v5, 0, v12, vcc -; GISEL-NEXT: v_lshrrev_b64 v[9:10], v14, -1 -; GISEL-NEXT: v_lshlrev_b64 v[11:12], v11, -1 -; GISEL-NEXT: v_subrev_u32_e32 v15, 64, v14 -; GISEL-NEXT: v_or_b32_e32 v16, v9, v11 -; GISEL-NEXT: v_or_b32_e32 v17, v10, v12 -; GISEL-NEXT: v_lshrrev_b64 v[11:12], v15, -1 
-; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v14 -; GISEL-NEXT: v_cndmask_b32_e32 v11, v11, v16, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v12, v12, v17, vcc -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v14 -; GISEL-NEXT: v_cndmask_b32_e32 v9, 0, v9, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v10, 0, v10, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v11, v11, -1, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e64 v12, v12, -1, s[4:5] -; GISEL-NEXT: v_and_b32_e32 v2, v9, v2 -; GISEL-NEXT: v_and_b32_e32 v3, v10, v3 -; GISEL-NEXT: v_and_or_b32 v0, v11, v0, v2 -; GISEL-NEXT: v_and_or_b32 v1, v12, v1, v3 +; GISEL-NEXT: v_cndmask_b32_e32 v11, 0, v12, vcc +; GISEL-NEXT: v_sub_u32_e32 v12, 64, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v14, v0, v2, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v10, v1, v3, s[4:5] +; GISEL-NEXT: v_lshrrev_b64 v[0:1], v9, -1 +; GISEL-NEXT: v_lshlrev_b64 v[12:13], v12, -1 +; GISEL-NEXT: v_subrev_u32_e32 v15, 64, v9 +; GISEL-NEXT: v_or_b32_e32 v16, v0, v12 +; GISEL-NEXT: v_or_b32_e32 v17, v1, v13 +; GISEL-NEXT: v_lshrrev_b64 v[12:13], v15, -1 +; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v9 +; GISEL-NEXT: v_cndmask_b32_e32 v12, v12, v16, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v13, v13, v17, vcc +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v9 +; GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v9, v12, -1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v12, v13, -1, s[4:5] +; GISEL-NEXT: v_and_b32_e32 v0, v0, v4 +; GISEL-NEXT: v_and_b32_e32 v1, v1, v5 +; GISEL-NEXT: v_and_or_b32 v0, v9, v2, v0 +; GISEL-NEXT: v_and_or_b32 v1, v12, v3, v1 ; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_or_b32_e32 v3, v13, v0 -; GISEL-NEXT: v_mov_b32_e32 v0, v3 -; GISEL-NEXT: v_mov_b32_e32 v1, v4 -; GISEL-NEXT: v_mov_b32_e32 v2, v5 -; GISEL-NEXT: v_mov_b32_e32 v3, v6 +; GISEL-NEXT: v_or_b32_e32 v9, v14, v0 +; GISEL-NEXT: v_mov_b32_e32 v2, v9 +; GISEL-NEXT: v_mov_b32_e32 v3, v10 +; GISEL-NEXT: v_mov_b32_e32 v4, v11 +; GISEL-NEXT: v_mov_b32_e32 v5, v12 ; GISEL-NEXT: .LBB2_7: ; %Flow1 ; GISEL-NEXT: s_or_b64 exec, exec, s[12:13] ; GISEL-NEXT: .LBB2_8: ; %Flow2 ; GISEL-NEXT: s_andn2_saveexec_b64 s[4:5], s[10:11] ; GISEL-NEXT: s_cbranch_execz .LBB2_10 ; GISEL-NEXT: ; %bb.9: ; %itofp-sw-bb -; GISEL-NEXT: v_lshlrev_b64 v[9:10], 1, v[0:1] -; GISEL-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] -; GISEL-NEXT: v_lshrrev_b32_e32 v0, 31, v1 -; GISEL-NEXT: v_or_b32_e32 v11, v2, v0 -; GISEL-NEXT: v_mov_b32_e32 v0, v9 -; GISEL-NEXT: v_mov_b32_e32 v1, v10 -; GISEL-NEXT: v_mov_b32_e32 v2, v11 -; GISEL-NEXT: v_mov_b32_e32 v3, v12 +; GISEL-NEXT: v_lshlrev_b64 v[4:5], 1, v[4:5] +; GISEL-NEXT: v_lshlrev_b64 v[0:1], 1, v[2:3] +; GISEL-NEXT: v_lshrrev_b32_e32 v2, 31, v3 +; GISEL-NEXT: v_or_b32_e32 v2, v4, v2 +; GISEL-NEXT: v_mov_b32_e32 v5, v3 +; GISEL-NEXT: v_mov_b32_e32 v4, v2 +; GISEL-NEXT: v_mov_b32_e32 v3, v1 +; GISEL-NEXT: v_mov_b32_e32 v2, v0 ; GISEL-NEXT: .LBB2_10: ; %itofp-sw-epilog ; GISEL-NEXT: s_or_b64 exec, exec, s[4:5] -; GISEL-NEXT: v_bfe_u32 v3, v0, 2, 1 -; GISEL-NEXT: v_or_b32_e32 v0, v0, v3 -; GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 1, v0 -; GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GISEL-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc -; GISEL-NEXT: v_lshrrev_b64 v[4:5], 2, v[0:1] +; GISEL-NEXT: v_bfe_u32 v0, v2, 2, 1 +; GISEL-NEXT: v_or_b32_e32 v0, v2, v0 +; GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 1, v0 +; GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GISEL-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc +; GISEL-NEXT: 
v_lshrrev_b64 v[0:1], 2, v[2:3] ; GISEL-NEXT: v_mov_b32_e32 v9, 0 -; GISEL-NEXT: v_and_b32_e32 v10, 0x800000, v1 +; GISEL-NEXT: v_and_b32_e32 v10, 0x800000, v3 ; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[9:10] -; GISEL-NEXT: v_lshl_or_b32 v10, v2, 30, v5 +; GISEL-NEXT: v_lshl_or_b32 v10, v4, 30, v1 ; GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GISEL-NEXT: ; %bb.11: ; %itofp-if-then20 -; GISEL-NEXT: v_lshrrev_b64 v[4:5], 3, v[0:1] +; GISEL-NEXT: v_lshrrev_b64 v[0:1], 3, v[2:3] ; GISEL-NEXT: v_mov_b32_e32 v7, v8 -; GISEL-NEXT: v_lshl_or_b32 v10, v2, 29, v5 +; GISEL-NEXT: v_lshl_or_b32 v10, v4, 29, v1 ; GISEL-NEXT: ; %bb.12: ; %Flow ; GISEL-NEXT: s_or_b64 exec, exec, s[4:5] ; GISEL-NEXT: .LBB2_13: ; %Flow4 ; GISEL-NEXT: s_or_b64 exec, exec, s[8:9] -; GISEL-NEXT: v_and_b32_e32 v0, 0x80000000, v6 -; GISEL-NEXT: v_mov_b32_e32 v1, 0x3ff00000 -; GISEL-NEXT: v_mov_b32_e32 v2, 0xfffff -; GISEL-NEXT: v_lshl_add_u32 v1, v7, 20, v1 -; GISEL-NEXT: v_and_or_b32 v2, v10, v2, v0 -; GISEL-NEXT: v_and_or_b32 v0, v4, -1, 0 -; GISEL-NEXT: v_or3_b32 v1, v2, v1, 0 +; GISEL-NEXT: v_and_b32_e32 v1, 0x80000000, v6 +; GISEL-NEXT: v_mov_b32_e32 v2, 0x3ff00000 +; GISEL-NEXT: v_mov_b32_e32 v3, 0xfffff +; GISEL-NEXT: v_lshl_add_u32 v2, v7, 20, v2 +; GISEL-NEXT: v_and_or_b32 v1, v10, v3, v1 +; GISEL-NEXT: v_or3_b32 v1, v1, v2, 0 ; GISEL-NEXT: .LBB2_14: ; %Flow5 ; GISEL-NEXT: s_or_b64 exec, exec, s[6:7] ; GISEL-NEXT: s_setpc_b64 s[30:31] @@ -1083,7 +1082,6 @@ define double @uitofp_i128_to_f64(i128 %x) { ; GISEL-NEXT: v_mov_b32_e32 v0, 0x3ff00000 ; GISEL-NEXT: v_lshl_add_u32 v0, v6, 20, v0 ; GISEL-NEXT: v_and_b32_e32 v1, 0xfffff, v9 -; GISEL-NEXT: v_and_or_b32 v4, v4, -1, 0 ; GISEL-NEXT: v_or3_b32 v5, v1, v0, 0 ; GISEL-NEXT: .LBB3_14: ; %Flow5 ; GISEL-NEXT: s_or_b64 exec, exec, s[6:7] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.ll index d9227724c22a14712319bc4333078bf65fe4581a..855ca390aabdceee56b75d759112058987349cfe 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.ll @@ -1,12 +1,21 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ;RUN: llc < %s -mtriple=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck -check-prefix=VERDE %s ;RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s -;CHECK-LABEL: {{^}}buffer_store: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 -;CHECK: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 glc -;CHECK: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 slc define amdgpu_ps void @buffer_store(ptr addrspace(8) inreg, <4 x float>, <4 x float>, <4 x float>) { +; VERDE-LABEL: buffer_store: +; VERDE: ; %bb.0: ; %main_body +; VERDE-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; VERDE-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 glc +; VERDE-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 slc +; VERDE-NEXT: s_endpgm +; +; CHECK-LABEL: buffer_store: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; CHECK-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 glc +; CHECK-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 slc +; CHECK-NEXT: s_endpgm main_body: call void @llvm.amdgcn.raw.ptr.buffer.store.v4f32(<4 x float> %1, ptr addrspace(8) %0, i32 0, i32 0, i32 0) call void @llvm.amdgcn.raw.ptr.buffer.store.v4f32(<4 x float> %2, ptr addrspace(8) %0, i32 0, i32 0, i32 1) @@ 
-14,34 +23,54 @@ main_body: ret void } -;CHECK-LABEL: {{^}}buffer_store_immoffs: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:42 define amdgpu_ps void @buffer_store_immoffs(ptr addrspace(8) inreg, <4 x float>) { +; VERDE-LABEL: buffer_store_immoffs: +; VERDE: ; %bb.0: ; %main_body +; VERDE-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:42 +; VERDE-NEXT: s_endpgm +; +; CHECK-LABEL: buffer_store_immoffs: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:42 +; CHECK-NEXT: s_endpgm main_body: call void @llvm.amdgcn.raw.ptr.buffer.store.v4f32(<4 x float> %1, ptr addrspace(8) %0, i32 42, i32 0, i32 0) ret void } -;CHECK-LABEL: {{^}}buffer_store_ofs: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 offen define amdgpu_ps void @buffer_store_ofs(ptr addrspace(8) inreg, <4 x float>, i32) { +; VERDE-LABEL: buffer_store_ofs: +; VERDE: ; %bb.0: ; %main_body +; VERDE-NEXT: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 offen +; VERDE-NEXT: s_endpgm +; +; CHECK-LABEL: buffer_store_ofs: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 offen +; CHECK-NEXT: s_endpgm main_body: call void @llvm.amdgcn.raw.ptr.buffer.store.v4f32(<4 x float> %1, ptr addrspace(8) %0, i32 %2, i32 0, i32 0) ret void } ; Ideally, the register allocator would avoid the wait here -; -;CHECK-LABEL: {{^}}buffer_store_wait: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 offen -;VERDE: s_waitcnt expcnt(0) -;CHECK: buffer_load_dwordx4 v[0:3], v5, s[0:3], 0 offen -;CHECK: s_waitcnt vmcnt(0) -;CHECK: buffer_store_dwordx4 v[0:3], v6, s[0:3], 0 offen define amdgpu_ps void @buffer_store_wait(ptr addrspace(8) inreg, <4 x float>, i32, i32, i32) { +; VERDE-LABEL: buffer_store_wait: +; VERDE: ; %bb.0: ; %main_body +; VERDE-NEXT: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 offen +; VERDE-NEXT: s_waitcnt expcnt(0) +; VERDE-NEXT: buffer_load_dwordx4 v[0:3], v5, s[0:3], 0 offen +; VERDE-NEXT: s_waitcnt vmcnt(0) +; VERDE-NEXT: buffer_store_dwordx4 v[0:3], v6, s[0:3], 0 offen +; VERDE-NEXT: s_endpgm +; +; CHECK-LABEL: buffer_store_wait: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dwordx4 v[0:3], v5, s[0:3], 0 offen +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: buffer_store_dwordx4 v[0:3], v6, s[0:3], 0 offen +; CHECK-NEXT: s_endpgm main_body: call void @llvm.amdgcn.raw.ptr.buffer.store.v4f32(<4 x float> %1, ptr addrspace(8) %0, i32 %2, i32 0, i32 0) %data = call <4 x float> @llvm.amdgcn.raw.ptr.buffer.load.v4f32(ptr addrspace(8) %0, i32 %3, i32 0, i32 0) @@ -49,29 +78,48 @@ main_body: ret void } -;CHECK-LABEL: {{^}}buffer_store_x1: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_dword v0, v1, s[0:3], 0 offen define amdgpu_ps void @buffer_store_x1(ptr addrspace(8) inreg %rsrc, float %data, i32 %offset) { +; VERDE-LABEL: buffer_store_x1: +; VERDE: ; %bb.0: ; %main_body +; VERDE-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; VERDE-NEXT: s_endpgm +; +; CHECK-LABEL: buffer_store_x1: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; CHECK-NEXT: s_endpgm main_body: call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float %data, ptr addrspace(8) %rsrc, i32 %offset, i32 0, i32 0) ret void } -;CHECK-LABEL: {{^}}buffer_store_x2: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 offen define amdgpu_ps void @buffer_store_x2(ptr 
addrspace(8) inreg %rsrc, <2 x float> %data, i32 %offset) #0 { +; VERDE-LABEL: buffer_store_x2: +; VERDE: ; %bb.0: ; %main_body +; VERDE-NEXT: buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 offen +; VERDE-NEXT: s_endpgm +; +; CHECK-LABEL: buffer_store_x2: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 offen +; CHECK-NEXT: s_endpgm main_body: call void @llvm.amdgcn.raw.ptr.buffer.store.v2f32(<2 x float> %data, ptr addrspace(8) %rsrc, i32 %offset, i32 0, i32 0) ret void } -;CHECK-LABEL: {{^}}buffer_store_x1_offen_merged_and: -;CHECK-NOT: s_waitcnt -;CHECK-DAG: buffer_store_dwordx4 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4 -;CHECK-DAG: buffer_store_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:28 define amdgpu_ps void @buffer_store_x1_offen_merged_and(ptr addrspace(8) inreg %rsrc, i32 %a, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) { +; VERDE-LABEL: buffer_store_x1_offen_merged_and: +; VERDE: ; %bb.0: +; VERDE-NEXT: buffer_store_dwordx4 v[1:4], v0, s[0:3], 0 offen offset:4 +; VERDE-NEXT: buffer_store_dwordx2 v[5:6], v0, s[0:3], 0 offen offset:28 +; VERDE-NEXT: s_endpgm +; +; CHECK-LABEL: buffer_store_x1_offen_merged_and: +; CHECK: ; %bb.0: +; CHECK-NEXT: buffer_store_dwordx4 v[1:4], v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dwordx2 v[5:6], v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: s_endpgm %a1 = add i32 %a, 4 %a2 = add i32 %a, 8 %a3 = add i32 %a, 12 @@ -87,11 +135,20 @@ define amdgpu_ps void @buffer_store_x1_offen_merged_and(ptr addrspace(8) inreg % ret void } -;CHECK-LABEL: {{^}}buffer_store_x1_offen_merged_or: -;CHECK-NOT: s_waitcnt -;CHECK-DAG: buffer_store_dwordx4 v[{{[0-9]}}:{{[0-9]}}], v{{[0-9]}}, s[0:3], 0 offen offset:4 -;CHECK-DAG: buffer_store_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v{{[0-9]}}, s[0:3], 0 offen offset:28 define amdgpu_ps void @buffer_store_x1_offen_merged_or(ptr addrspace(8) inreg %rsrc, i32 %inp, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) { +; VERDE-LABEL: buffer_store_x1_offen_merged_or: +; VERDE: ; %bb.0: +; VERDE-NEXT: v_lshlrev_b32_e32 v0, 6, v0 +; VERDE-NEXT: buffer_store_dwordx4 v[1:4], v0, s[0:3], 0 offen offset:4 +; VERDE-NEXT: buffer_store_dwordx2 v[5:6], v0, s[0:3], 0 offen offset:28 +; VERDE-NEXT: s_endpgm +; +; CHECK-LABEL: buffer_store_x1_offen_merged_or: +; CHECK: ; %bb.0: +; CHECK-NEXT: v_lshlrev_b32_e32 v0, 6, v0 +; CHECK-NEXT: buffer_store_dwordx4 v[1:4], v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dwordx2 v[5:6], v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: s_endpgm %a = shl i32 %inp, 6 %a1 = add i32 %a, 4 %a2 = add i32 %a, 8 @@ -109,12 +166,20 @@ define amdgpu_ps void @buffer_store_x1_offen_merged_or(ptr addrspace(8) inreg %r } -;CHECK-LABEL: {{^}}buffer_store_x1_offen_merged_glc_slc: -;CHECK-NOT: s_waitcnt -;CHECK-DAG: buffer_store_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4{{$}} -;CHECK-DAG: buffer_store_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:12 glc{{$}} -;CHECK-DAG: buffer_store_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:28 glc slc{{$}} define amdgpu_ps void @buffer_store_x1_offen_merged_glc_slc(ptr addrspace(8) inreg %rsrc, i32 %a, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) { +; VERDE-LABEL: buffer_store_x1_offen_merged_glc_slc: +; VERDE: ; %bb.0: +; VERDE-NEXT: buffer_store_dwordx2 v[1:2], v0, s[0:3], 0 offen offset:4 +; VERDE-NEXT: buffer_store_dwordx2 v[3:4], v0, s[0:3], 0 offen offset:12 glc +; VERDE-NEXT: buffer_store_dwordx2 v[5:6], 
v0, s[0:3], 0 offen offset:28 glc slc +; VERDE-NEXT: s_endpgm +; +; CHECK-LABEL: buffer_store_x1_offen_merged_glc_slc: +; CHECK: ; %bb.0: +; CHECK-NEXT: buffer_store_dwordx2 v[1:2], v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dwordx2 v[3:4], v0, s[0:3], 0 offen offset:12 glc +; CHECK-NEXT: buffer_store_dwordx2 v[5:6], v0, s[0:3], 0 offen offset:28 glc slc +; CHECK-NEXT: s_endpgm %a1 = add i32 %a, 4 %a2 = add i32 %a, 8 %a3 = add i32 %a, 12 @@ -130,10 +195,16 @@ define amdgpu_ps void @buffer_store_x1_offen_merged_glc_slc(ptr addrspace(8) inr ret void } -;CHECK-LABEL: {{^}}buffer_store_x2_offen_merged_and: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_dwordx4 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4 define amdgpu_ps void @buffer_store_x2_offen_merged_and(ptr addrspace(8) inreg %rsrc, i32 %a, <2 x float> %v1, <2 x float> %v2) { +; VERDE-LABEL: buffer_store_x2_offen_merged_and: +; VERDE: ; %bb.0: +; VERDE-NEXT: buffer_store_dwordx4 v[1:4], v0, s[0:3], 0 offen offset:4 +; VERDE-NEXT: s_endpgm +; +; CHECK-LABEL: buffer_store_x2_offen_merged_and: +; CHECK: ; %bb.0: +; CHECK-NEXT: buffer_store_dwordx4 v[1:4], v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: s_endpgm %a1 = add i32 %a, 4 %a2 = add i32 %a, 12 call void @llvm.amdgcn.raw.ptr.buffer.store.v2f32(<2 x float> %v1, ptr addrspace(8) %rsrc, i32 %a1, i32 0, i32 0) @@ -141,10 +212,18 @@ define amdgpu_ps void @buffer_store_x2_offen_merged_and(ptr addrspace(8) inreg % ret void } -;CHECK-LABEL: {{^}}buffer_store_x2_offen_merged_or: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_dwordx4 v[{{[0-9]}}:{{[0-9]}}], v{{[0-9]}}, s[0:3], 0 offen offset:4 define amdgpu_ps void @buffer_store_x2_offen_merged_or(ptr addrspace(8) inreg %rsrc, i32 %inp, <2 x float> %v1, <2 x float> %v2) { +; VERDE-LABEL: buffer_store_x2_offen_merged_or: +; VERDE: ; %bb.0: +; VERDE-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; VERDE-NEXT: buffer_store_dwordx4 v[1:4], v0, s[0:3], 0 offen offset:4 +; VERDE-NEXT: s_endpgm +; +; CHECK-LABEL: buffer_store_x2_offen_merged_or: +; CHECK: ; %bb.0: +; CHECK-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; CHECK-NEXT: buffer_store_dwordx4 v[1:4], v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: s_endpgm %a = shl i32 %inp, 4 %a1 = add i32 %a, 4 %a2 = add i32 %a, 12 @@ -153,11 +232,18 @@ define amdgpu_ps void @buffer_store_x2_offen_merged_or(ptr addrspace(8) inreg %r ret void } -;CHECK-LABEL: {{^}}buffer_store_x1_offset_merged: -;CHECK-NOT: s_waitcnt -;CHECK-DAG: buffer_store_dwordx4 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4 -;CHECK-DAG: buffer_store_dwordx2 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:28 define amdgpu_ps void @buffer_store_x1_offset_merged(ptr addrspace(8) inreg %rsrc, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) { +; VERDE-LABEL: buffer_store_x1_offset_merged: +; VERDE: ; %bb.0: +; VERDE-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:4 +; VERDE-NEXT: buffer_store_dwordx2 v[4:5], off, s[0:3], 0 offset:28 +; VERDE-NEXT: s_endpgm +; +; CHECK-LABEL: buffer_store_x1_offset_merged: +; CHECK: ; %bb.0: +; CHECK-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:4 +; CHECK-NEXT: buffer_store_dwordx2 v[4:5], off, s[0:3], 0 offset:28 +; CHECK-NEXT: s_endpgm call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float %v1, ptr addrspace(8) %rsrc, i32 4, i32 0, i32 0) call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float %v2, ptr addrspace(8) %rsrc, i32 8, i32 0, i32 0) call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float %v3, ptr addrspace(8) %rsrc, i32 12, i32 0, i32 0) @@ -167,21 +253,35 @@ define 
amdgpu_ps void @buffer_store_x1_offset_merged(ptr addrspace(8) inreg %rsr ret void } -;CHECK-LABEL: {{^}}buffer_store_x2_offset_merged: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_dwordx4 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4 define amdgpu_ps void @buffer_store_x2_offset_merged(ptr addrspace(8) inreg %rsrc, <2 x float> %v1,<2 x float> %v2) { +; VERDE-LABEL: buffer_store_x2_offset_merged: +; VERDE: ; %bb.0: +; VERDE-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:4 +; VERDE-NEXT: s_endpgm +; +; CHECK-LABEL: buffer_store_x2_offset_merged: +; CHECK: ; %bb.0: +; CHECK-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:4 +; CHECK-NEXT: s_endpgm call void @llvm.amdgcn.raw.ptr.buffer.store.v2f32(<2 x float> %v1, ptr addrspace(8) %rsrc, i32 4, i32 0, i32 0) call void @llvm.amdgcn.raw.ptr.buffer.store.v2f32(<2 x float> %v2, ptr addrspace(8) %rsrc, i32 12, i32 0, i32 0) ret void } -;CHECK-LABEL: {{^}}buffer_store_int: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 -;CHECK: buffer_store_dwordx2 v[4:5], off, s[0:3], 0 glc -;CHECK: buffer_store_dword v6, off, s[0:3], 0 slc define amdgpu_ps void @buffer_store_int(ptr addrspace(8) inreg, <4 x i32>, <2 x i32>, i32) { +; VERDE-LABEL: buffer_store_int: +; VERDE: ; %bb.0: ; %main_body +; VERDE-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; VERDE-NEXT: buffer_store_dwordx2 v[4:5], off, s[0:3], 0 glc +; VERDE-NEXT: buffer_store_dword v6, off, s[0:3], 0 slc +; VERDE-NEXT: s_endpgm +; +; CHECK-LABEL: buffer_store_int: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; CHECK-NEXT: buffer_store_dwordx2 v[4:5], off, s[0:3], 0 glc +; CHECK-NEXT: buffer_store_dword v6, off, s[0:3], 0 slc +; CHECK-NEXT: s_endpgm main_body: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> %1, ptr addrspace(8) %0, i32 0, i32 0, i32 0) call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %2, ptr addrspace(8) %0, i32 0, i32 0, i32 1) @@ -189,12 +289,18 @@ main_body: ret void } -;CHECK-LABEL: {{^}}raw_ptr_buffer_store_byte: -;CHECK-NEXT: %bb. -;CHECK-NEXT: v_cvt_u32_f32_e32 v{{[0-9]}}, v{{[0-9]}} -;CHECK-NEXT: buffer_store_byte v{{[0-9]}}, off, s[0:3], 0 -;CHECK-NEXT: s_endpgm define amdgpu_ps void @raw_ptr_buffer_store_byte(ptr addrspace(8) inreg %rsrc, float %v1) { +; VERDE-LABEL: raw_ptr_buffer_store_byte: +; VERDE: ; %bb.0: ; %main_body +; VERDE-NEXT: v_cvt_u32_f32_e32 v0, v0 +; VERDE-NEXT: buffer_store_byte v0, off, s[0:3], 0 +; VERDE-NEXT: s_endpgm +; +; CHECK-LABEL: raw_ptr_buffer_store_byte: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: v_cvt_u32_f32_e32 v0, v0 +; CHECK-NEXT: buffer_store_byte v0, off, s[0:3], 0 +; CHECK-NEXT: s_endpgm main_body: %v2 = fptoui float %v1 to i32 %v3 = trunc i32 %v2 to i8 @@ -202,12 +308,18 @@ main_body: ret void } -;CHECK-LABEL: {{^}}raw_ptr_buffer_store_short: -;CHECK-NEXT: %bb. 
-;CHECK-NEXT: v_cvt_u32_f32_e32 v{{[0-9]}}, v{{[0-9]}} -;CHECK-NEXT: buffer_store_short v{{[0-9]}}, off, s[0:3], 0 -;CHECK-NEXT: s_endpgm define amdgpu_ps void @raw_ptr_buffer_store_short(ptr addrspace(8) inreg %rsrc, float %v1) { +; VERDE-LABEL: raw_ptr_buffer_store_short: +; VERDE: ; %bb.0: ; %main_body +; VERDE-NEXT: v_cvt_u32_f32_e32 v0, v0 +; VERDE-NEXT: buffer_store_short v0, off, s[0:3], 0 +; VERDE-NEXT: s_endpgm +; +; CHECK-LABEL: raw_ptr_buffer_store_short: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: v_cvt_u32_f32_e32 v0, v0 +; CHECK-NEXT: buffer_store_short v0, off, s[0:3], 0 +; CHECK-NEXT: s_endpgm main_body: %v2 = fptoui float %v1 to i32 %v3 = trunc i32 %v2 to i16 @@ -215,12 +327,16 @@ main_body: ret void } -;CHECK-LABEL: {{^}}raw_ptr_buffer_store_f16: -;CHECK-NEXT: %bb. -;CHECK-NOT: v0 -;CHECK-NEXT: buffer_store_short v0, off, s[0:3], 0 -;CHECK-NEXT: s_endpgm define amdgpu_ps void @raw_ptr_buffer_store_f16(ptr addrspace(8) inreg %rsrc, i32 %v1) { +; VERDE-LABEL: raw_ptr_buffer_store_f16: +; VERDE: ; %bb.0: ; %main_body +; VERDE-NEXT: buffer_store_short v0, off, s[0:3], 0 +; VERDE-NEXT: s_endpgm +; +; CHECK-LABEL: raw_ptr_buffer_store_f16: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: buffer_store_short v0, off, s[0:3], 0 +; CHECK-NEXT: s_endpgm main_body: %trunc = trunc i32 %v1 to i16 %cast = bitcast i16 %trunc to half @@ -228,74 +344,169 @@ main_body: ret void } -;CHECK-LABEL: {{^}}buffer_store_v2f16: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_dword v0, v1, s[0:3], 0 offen define amdgpu_ps void @buffer_store_v2f16(ptr addrspace(8) inreg %rsrc, <2 x half> %data, i32 %offset) { +; VERDE-LABEL: buffer_store_v2f16: +; VERDE: ; %bb.0: ; %main_body +; VERDE-NEXT: v_cvt_f16_f32_e32 v1, v1 +; VERDE-NEXT: v_cvt_f16_f32_e32 v0, v0 +; VERDE-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; VERDE-NEXT: v_or_b32_e32 v0, v0, v1 +; VERDE-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; VERDE-NEXT: s_endpgm +; +; CHECK-LABEL: buffer_store_v2f16: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; CHECK-NEXT: s_endpgm main_body: call void @llvm.amdgcn.raw.ptr.buffer.store.v2f16(<2 x half> %data, ptr addrspace(8) %rsrc, i32 %offset, i32 0, i32 0) ret void } -;CHECK-LABEL: {{^}}buffer_store_v4f16: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 offen define amdgpu_ps void @buffer_store_v4f16(ptr addrspace(8) inreg %rsrc, <4 x half> %data, i32 %offset) #0 { +; VERDE-LABEL: buffer_store_v4f16: +; VERDE: ; %bb.0: ; %main_body +; VERDE-NEXT: v_cvt_f16_f32_e32 v3, v3 +; VERDE-NEXT: v_cvt_f16_f32_e32 v2, v2 +; VERDE-NEXT: v_cvt_f16_f32_e32 v5, v1 +; VERDE-NEXT: v_cvt_f16_f32_e32 v0, v0 +; VERDE-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; VERDE-NEXT: v_or_b32_e32 v1, v2, v1 +; VERDE-NEXT: v_lshlrev_b32_e32 v2, 16, v5 +; VERDE-NEXT: v_or_b32_e32 v0, v0, v2 +; VERDE-NEXT: buffer_store_dwordx2 v[0:1], v4, s[0:3], 0 offen +; VERDE-NEXT: s_endpgm +; +; CHECK-LABEL: buffer_store_v4f16: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 offen +; CHECK-NEXT: s_endpgm main_body: call void @llvm.amdgcn.raw.ptr.buffer.store.v4f16(<4 x half> %data, ptr addrspace(8) %rsrc, i32 %offset, i32 0, i32 0) ret void } -;CHECK-LABEL: {{^}}buffer_store_v8f16: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 offen define amdgpu_ps void @buffer_store_v8f16(ptr addrspace(8) inreg %rsrc, <8 x half> %data, i32 %offset) #0 { +; VERDE-LABEL: buffer_store_v8f16: +; VERDE: ; %bb.0: ; %main_body +; VERDE-NEXT: 
v_cvt_f16_f32_e32 v7, v7 +; VERDE-NEXT: v_cvt_f16_f32_e32 v6, v6 +; VERDE-NEXT: v_cvt_f16_f32_e32 v9, v5 +; VERDE-NEXT: v_cvt_f16_f32_e32 v3, v3 +; VERDE-NEXT: v_cvt_f16_f32_e32 v1, v1 +; VERDE-NEXT: v_cvt_f16_f32_e32 v4, v4 +; VERDE-NEXT: v_cvt_f16_f32_e32 v2, v2 +; VERDE-NEXT: v_cvt_f16_f32_e32 v0, v0 +; VERDE-NEXT: v_lshlrev_b32_e32 v5, 16, v7 +; VERDE-NEXT: v_or_b32_e32 v5, v6, v5 +; VERDE-NEXT: v_lshlrev_b32_e32 v6, 16, v9 +; VERDE-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; VERDE-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; VERDE-NEXT: v_or_b32_e32 v4, v4, v6 +; VERDE-NEXT: v_or_b32_e32 v3, v2, v3 +; VERDE-NEXT: v_or_b32_e32 v2, v0, v1 +; VERDE-NEXT: buffer_store_dwordx4 v[2:5], v8, s[0:3], 0 offen +; VERDE-NEXT: s_endpgm +; +; CHECK-LABEL: buffer_store_v8f16: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 offen +; CHECK-NEXT: s_endpgm main_body: call void @llvm.amdgcn.raw.ptr.buffer.store.v8f16(<8 x half> %data, ptr addrspace(8) %rsrc, i32 %offset, i32 0, i32 0) ret void } -;CHECK-LABEL: {{^}}buffer_store_v2bf16: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_dword v0, v1, s[0:3], 0 offen define amdgpu_ps void @buffer_store_v2bf16(ptr addrspace(8) inreg %rsrc, <2 x bfloat> %data, i32 %offset) { +; VERDE-LABEL: buffer_store_v2bf16: +; VERDE: ; %bb.0: +; VERDE-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; VERDE-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VERDE-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; VERDE-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; VERDE-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; VERDE-NEXT: s_endpgm +; +; CHECK-LABEL: buffer_store_v2bf16: +; CHECK: ; %bb.0: +; CHECK-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; CHECK-NEXT: s_endpgm call void @llvm.amdgcn.raw.ptr.buffer.store.v2bf16(<2 x bfloat> %data, ptr addrspace(8) %rsrc, i32 %offset, i32 0, i32 0) ret void } -;CHECK-LABEL: {{^}}buffer_store_v4bf16: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 offen define amdgpu_ps void @buffer_store_v4bf16(ptr addrspace(8) inreg %rsrc, <4 x bfloat> %data, i32 %offset) #0 { +; VERDE-LABEL: buffer_store_v4bf16: +; VERDE: ; %bb.0: +; VERDE-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; VERDE-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; VERDE-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VERDE-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; VERDE-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VERDE-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; VERDE-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; VERDE-NEXT: v_alignbit_b32 v1, v1, v0, 16 +; VERDE-NEXT: buffer_store_dwordx2 v[1:2], v4, s[0:3], 0 offen +; VERDE-NEXT: s_endpgm +; +; CHECK-LABEL: buffer_store_v4bf16: +; CHECK: ; %bb.0: +; CHECK-NEXT: buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 offen +; CHECK-NEXT: s_endpgm call void @llvm.amdgcn.raw.ptr.buffer.store.v4bf16(<4 x bfloat> %data, ptr addrspace(8) %rsrc, i32 %offset, i32 0, i32 0) ret void } -;CHECK-LABEL: {{^}}raw_ptr_buffer_store_i16: -;CHECK-NEXT: %bb. 
-;CHECK-NOT: v0 -;CHECK-NEXT: buffer_store_short v0, off, s[0:3], 0 -;CHECK-NEXT: s_endpgm define amdgpu_ps void @raw_ptr_buffer_store_i16(ptr addrspace(8) inreg %rsrc, i32 %v1) { +; VERDE-LABEL: raw_ptr_buffer_store_i16: +; VERDE: ; %bb.0: ; %main_body +; VERDE-NEXT: buffer_store_short v0, off, s[0:3], 0 +; VERDE-NEXT: s_endpgm +; +; CHECK-LABEL: raw_ptr_buffer_store_i16: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: buffer_store_short v0, off, s[0:3], 0 +; CHECK-NEXT: s_endpgm main_body: %trunc = trunc i32 %v1 to i16 call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 %trunc, ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0) ret void } -;CHECK-LABEL: {{^}}buffer_store_v2i16: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_dword v0, v1, s[0:3], 0 offen define amdgpu_ps void @buffer_store_v2i16(ptr addrspace(8) inreg %rsrc, <2 x i16> %data, i32 %offset) { +; VERDE-LABEL: buffer_store_v2i16: +; VERDE: ; %bb.0: ; %main_body +; VERDE-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; VERDE-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; VERDE-NEXT: v_or_b32_e32 v0, v0, v1 +; VERDE-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; VERDE-NEXT: s_endpgm +; +; CHECK-LABEL: buffer_store_v2i16: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; CHECK-NEXT: s_endpgm main_body: call void @llvm.amdgcn.raw.ptr.buffer.store.v2i16(<2 x i16> %data, ptr addrspace(8) %rsrc, i32 %offset, i32 0, i32 0) ret void } -;CHECK-LABEL: {{^}}buffer_store_v4i16: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 offen define amdgpu_ps void @buffer_store_v4i16(ptr addrspace(8) inreg %rsrc, <4 x i16> %data, i32 %offset) #0 { +; VERDE-LABEL: buffer_store_v4i16: +; VERDE: ; %bb.0: ; %main_body +; VERDE-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; VERDE-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; VERDE-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; VERDE-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; VERDE-NEXT: v_or_b32_e32 v2, v2, v3 +; VERDE-NEXT: v_or_b32_e32 v1, v0, v1 +; VERDE-NEXT: buffer_store_dwordx2 v[1:2], v4, s[0:3], 0 offen +; VERDE-NEXT: s_endpgm +; +; CHECK-LABEL: buffer_store_v4i16: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 offen +; CHECK-NEXT: s_endpgm main_body: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i16(<4 x i16> %data, ptr addrspace(8) %rsrc, i32 %offset, i32 0, i32 0) ret void @@ -307,21 +518,45 @@ main_body: ; call void @llvm.amdgcn.raw.ptr.buffer.store.v6i16(<6 x i16> %data, ptr addrspace(8) %rsrc, i32 %offset, i32 0, i32 0) ; ret void ; } - -;CHECK-LABEL: {{^}}buffer_store_v8i16: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 offen define amdgpu_ps void @buffer_store_v8i16(ptr addrspace(8) inreg %rsrc, <8 x i16> %data, i32 %offset) #0 { +; VERDE-LABEL: buffer_store_v8i16: +; VERDE: ; %bb.0: ; %main_body +; VERDE-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; VERDE-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; VERDE-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; VERDE-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; VERDE-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; VERDE-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; VERDE-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; VERDE-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; VERDE-NEXT: v_or_b32_e32 v6, v6, v7 +; VERDE-NEXT: v_or_b32_e32 v5, v4, v5 +; VERDE-NEXT: v_or_b32_e32 v4, v2, v3 +; VERDE-NEXT: v_or_b32_e32 v3, v0, v1 +; VERDE-NEXT: buffer_store_dwordx4 v[3:6], v8, s[0:3], 0 offen +; VERDE-NEXT: s_endpgm +; +; CHECK-LABEL: buffer_store_v8i16: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: buffer_store_dwordx4 v[0:3], v4, s[0:3], 
0 offen +; CHECK-NEXT: s_endpgm main_body: call void @llvm.amdgcn.raw.ptr.buffer.store.v8i16(<8 x i16> %data, ptr addrspace(8) %rsrc, i32 %offset, i32 0, i32 0) ret void } -;CHECK-LABEL: {{^}}raw_ptr_buffer_store_x1_offset_merged: -;CHECK-NOT: s_waitcnt -;CHECK-DAG: buffer_store_dwordx4 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4 -;CHECK-DAG: buffer_store_dwordx2 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:28 define amdgpu_ps void @raw_ptr_buffer_store_x1_offset_merged(ptr addrspace(8) inreg %rsrc, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) { +; VERDE-LABEL: raw_ptr_buffer_store_x1_offset_merged: +; VERDE: ; %bb.0: +; VERDE-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:4 +; VERDE-NEXT: buffer_store_dwordx2 v[4:5], off, s[0:3], 0 offset:28 +; VERDE-NEXT: s_endpgm +; +; CHECK-LABEL: raw_ptr_buffer_store_x1_offset_merged: +; CHECK: ; %bb.0: +; CHECK-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:4 +; CHECK-NEXT: buffer_store_dwordx2 v[4:5], off, s[0:3], 0 offset:28 +; CHECK-NEXT: s_endpgm call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float %v1, ptr addrspace(8) %rsrc, i32 4, i32 0, i32 0) call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float %v2, ptr addrspace(8) %rsrc, i32 8, i32 0, i32 0) call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float %v3, ptr addrspace(8) %rsrc, i32 12, i32 0, i32 0) @@ -331,14 +566,26 @@ define amdgpu_ps void @raw_ptr_buffer_store_x1_offset_merged(ptr addrspace(8) in ret void } -;CHECK-LABEL: {{^}}raw_ptr_buffer_store_x1_offset_swizzled_not_merged: -;CHECK-DAG: buffer_store_dword v{{[0-9]}}, off, s[0:3], 0 offset:4 -;CHECK-DAG: buffer_store_dword v{{[0-9]}}, off, s[0:3], 0 offset:8 -;CHECK-DAG: buffer_store_dword v{{[0-9]}}, off, s[0:3], 0 offset:12 -;CHECK-DAG: buffer_store_dword v{{[0-9]}}, off, s[0:3], 0 offset:16 -;CHECK-DAG: buffer_store_dword v{{[0-9]}}, off, s[0:3], 0 offset:28 -;CHECK-DAG: buffer_store_dword v{{[0-9]}}, off, s[0:3], 0 offset:32 define amdgpu_ps void @raw_ptr_buffer_store_x1_offset_swizzled_not_merged(ptr addrspace(8) inreg %rsrc, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) { +; VERDE-LABEL: raw_ptr_buffer_store_x1_offset_swizzled_not_merged: +; VERDE: ; %bb.0: +; VERDE-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4 +; VERDE-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:8 +; VERDE-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:12 +; VERDE-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:16 +; VERDE-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:28 +; VERDE-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:32 +; VERDE-NEXT: s_endpgm +; +; CHECK-LABEL: raw_ptr_buffer_store_x1_offset_swizzled_not_merged: +; CHECK: ; %bb.0: +; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4 +; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:8 +; CHECK-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:12 +; CHECK-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:16 +; CHECK-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:28 +; CHECK-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:32 +; CHECK-NEXT: s_endpgm call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float %v1, ptr addrspace(8) %rsrc, i32 4, i32 0, i32 8) call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float %v2, ptr addrspace(8) %rsrc, i32 8, i32 0, i32 8) call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float %v3, ptr addrspace(8) %rsrc, i32 12, i32 0, i32 8) diff --git a/llvm/test/CodeGen/AMDGPU/naked-fn-with-frame-pointer.ll b/llvm/test/CodeGen/AMDGPU/naked-fn-with-frame-pointer.ll new 
file mode 100644 index 0000000000000000000000000000000000000000..5ff2d82c1464f2e3f9663e2a34401f8d209944ce --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/naked-fn-with-frame-pointer.ll @@ -0,0 +1,42 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple amdgcn | FileCheck %s -check-prefixes=CHECK + +declare dso_local void @main() + +define dso_local void @naked() naked "frame-pointer"="all" { +; CHECK-LABEL: naked: +; CHECK: naked$local: +; CHECK-NEXT: .type naked$local,@function +; CHECK-NEXT: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_getpc_b64 s[16:17] +; CHECK-NEXT: s_add_u32 s16, s16, main@rel32@lo+4 +; CHECK-NEXT: s_addc_u32 s17, s17, main@rel32@hi+12 +; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] + call void @main() + unreachable +} + +define dso_local void @normal() "frame-pointer"="all" { +; CHECK-LABEL: normal: +; CHECK: normal$local: +; CHECK-NEXT: .type normal$local,@function +; CHECK-NEXT: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_mov_b32 s16, s33 +; CHECK-NEXT: s_mov_b32 s33, s32 +; CHECK-NEXT: s_or_saveexec_b64 s[18:19], -1 +; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; CHECK-NEXT: s_mov_b64 exec, s[18:19] +; CHECK-NEXT: s_waitcnt expcnt(0) +; CHECK-NEXT: v_writelane_b32 v40, s16, 2 +; CHECK-NEXT: s_addk_i32 s32, 0x400 +; CHECK-NEXT: v_writelane_b32 v40, s30, 0 +; CHECK-NEXT: v_writelane_b32 v40, s31, 1 +; CHECK-NEXT: s_getpc_b64 s[16:17] +; CHECK-NEXT: s_add_u32 s16, s16, main@rel32@lo+4 +; CHECK-NEXT: s_addc_u32 s17, s17, main@rel32@hi+12 +; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] + call void @main() + unreachable +} diff --git a/llvm/test/CodeGen/AMDGPU/s_mulk_i32.ll b/llvm/test/CodeGen/AMDGPU/s_mulk_i32.ll index c7987d3d009175fae9d5d6006b4a5cf3dd93eb70..02641f5b6ae8c1ea9370ceb94337d9bf60518b7e 100644 --- a/llvm/test/CodeGen/AMDGPU/s_mulk_i32.ll +++ b/llvm/test/CodeGen/AMDGPU/s_mulk_i32.ll @@ -1,40 +1,118 @@ -; RUN: llc -mtriple=amdgcn--amdpal -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s -; RUN: llc -mtriple=amdgcn--amdpal -mcpu=tonga -mattr=-flat-for-global,-xnack -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s - -; SI-LABEL: {{^}}s_mulk_i32_k0: -; SI: s_load_dword [[VAL:s[0-9]+]] -; SI: s_mulk_i32 [[VAL]], 0x41 -; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[VAL]] -; SI: buffer_store_dword [[VRESULT]] -; SI: s_endpgm +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn--amdpal -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GFX6 %s +; RUN: llc -mtriple=amdgcn--amdpal -mcpu=tonga -mattr=-flat-for-global,-xnack -verify-machineinstrs < %s | FileCheck -check-prefix=GFX8 %s + define amdgpu_kernel void @s_mulk_i32_k0(ptr addrspace(1) %out, i32 %b) { +; GFX6-LABEL: s_mulk_i32_k0: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dword s4, s[2:3], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mulk_i32 s4, 0x41 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm +; +; GFX8-LABEL: s_mulk_i32_k0: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_load_dword s4, s[2:3], 0x8 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX8-NEXT: s_mov_b32 s3, 0xf000 +; GFX8-NEXT: s_mov_b32 s2, -1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) 
+; GFX8-NEXT: s_mulk_i32 s4, 0x41 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX8-NEXT: s_endpgm %mul = mul i32 %b, 65 store i32 %mul, ptr addrspace(1) %out ret void } -; SI-LABEL: {{^}}s_mulk_i32_k1: -; SI: s_mulk_i32 {{s[0-9]+}}, 0x7fff{{$}} -; SI: s_endpgm define amdgpu_kernel void @s_mulk_i32_k1(ptr addrspace(1) %out, i32 %b) { +; GFX6-LABEL: s_mulk_i32_k1: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dword s4, s[2:3], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mulk_i32 s4, 0x7fff +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm +; +; GFX8-LABEL: s_mulk_i32_k1: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_load_dword s4, s[2:3], 0x8 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX8-NEXT: s_mov_b32 s3, 0xf000 +; GFX8-NEXT: s_mov_b32 s2, -1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_mulk_i32 s4, 0x7fff +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX8-NEXT: s_endpgm %mul = mul i32 %b, 32767 ; (1 << 15) - 1 store i32 %mul, ptr addrspace(1) %out ret void } -; SI-LABEL: {{^}}s_mulk_i32_k2: -; SI: s_mulk_i32 {{s[0-9]+}}, 0xffef{{$}} -; SI: s_endpgm define amdgpu_kernel void @s_mulk_i32_k2(ptr addrspace(1) %out, i32 %b) { +; GFX6-LABEL: s_mulk_i32_k2: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dword s4, s[2:3], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mulk_i32 s4, 0xffef +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm +; +; GFX8-LABEL: s_mulk_i32_k2: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_load_dword s4, s[2:3], 0x8 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX8-NEXT: s_mov_b32 s3, 0xf000 +; GFX8-NEXT: s_mov_b32 s2, -1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_mulk_i32 s4, 0xffef +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX8-NEXT: s_endpgm %mul = mul i32 %b, -17 store i32 %mul, ptr addrspace(1) %out ret void } -; SI-LABEL: {{^}}no_s_mulk_i32_k0: -; SI: s_mul_i32 {{s[0-9]+}}, {{s[0-9]+}}, 0x8001{{$}} -; SI: s_endpgm define amdgpu_kernel void @no_s_mulk_i32_k0(ptr addrspace(1) %out, i32 %b) { +; GFX6-LABEL: no_s_mulk_i32_k0: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dword s4, s[2:3], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mul_i32 s4, s4, 0x8001 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm +; +; GFX8-LABEL: no_s_mulk_i32_k0: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_load_dword s4, s[2:3], 0x8 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX8-NEXT: s_mov_b32 s3, 0xf000 +; GFX8-NEXT: s_mov_b32 s2, -1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_mul_i32 s4, s4, 0x8001 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX8-NEXT: s_endpgm %mul = mul i32 %b, 32769 ; 1 << 15 + 1 store i32 %mul, ptr addrspace(1) %out ret void @@ -42,9 +120,28 @@ define amdgpu_kernel void @no_s_mulk_i32_k0(ptr addrspace(1) %out, i32 %b) { @lds = addrspace(3) global [512 x i32] undef, align 4 -; SI-LABEL: {{^}}commute_s_mulk_i32: -; SI: s_mulk_i32 s{{[0-9]+}}, 0x800{{$}} 
define amdgpu_kernel void @commute_s_mulk_i32(ptr addrspace(1) %out, i32 %b) #0 { +; GFX6-LABEL: commute_s_mulk_i32: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dword s0, s[2:3], 0x2 +; GFX6-NEXT: v_mov_b32_e32 v0, 0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mulk_i32 s0, 0x800 +; GFX6-NEXT: ;;#ASMSTART +; GFX6-NEXT: ; foo v0, s0 +; GFX6-NEXT: ;;#ASMEND +; GFX6-NEXT: s_endpgm +; +; GFX8-LABEL: commute_s_mulk_i32: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX8-NEXT: v_mov_b32_e32 v0, 0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_mulk_i32 s0, 0x800 +; GFX8-NEXT: ;;#ASMSTART +; GFX8-NEXT: ; foo v0, s0 +; GFX8-NEXT: ;;#ASMEND +; GFX8-NEXT: s_endpgm %size = call i32 @llvm.amdgcn.groupstaticsize() %add = mul i32 %size, %b call void asm sideeffect "; foo $0, $1", "v,s"(ptr addrspace(3) @lds, i32 %add) diff --git a/llvm/test/CodeGen/AMDGPU/sitofp.f16.ll b/llvm/test/CodeGen/AMDGPU/sitofp.f16.ll index b08a35ab8073246a3e5961b80f9e4f26f7ad1166..9169433cdca56fae8fc48715009a815209145f67 100644 --- a/llvm/test/CodeGen/AMDGPU/sitofp.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/sitofp.f16.ll @@ -1,7 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=SI %s ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=VI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX11 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,+real-true16 -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX11-TRUE16 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,-real-true16 -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX11-FAKE16 %s define amdgpu_kernel void @sitofp_i16_to_f16( ; SI-LABEL: sitofp_i16_to_f16: @@ -41,25 +42,45 @@ define amdgpu_kernel void @sitofp_i16_to_f16( ; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; -; GFX11-LABEL: sitofp_i16_to_f16: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: s_mov_b32 s6, -1 -; GFX11-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-NEXT: s_mov_b32 s10, s6 -; GFX11-NEXT: s_mov_b32 s11, s7 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s2 -; GFX11-NEXT: s_mov_b32 s9, s3 -; GFX11-NEXT: s_mov_b32 s4, s0 -; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0 -; GFX11-NEXT: s_mov_b32 s5, s1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cvt_f16_i16_e32 v0, v0 -; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: sitofp_i16_to_f16: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0 +; 
GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_cvt_f16_i16_e32 v0.l, v0.l +; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 +; GFX11-TRUE16-NEXT: s_nop 0 +; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: sitofp_i16_to_f16: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0 +; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_cvt_f16_i16_e32 v0, v0 +; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 +; GFX11-FAKE16-NEXT: s_nop 0 +; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { entry: @@ -108,27 +129,49 @@ define amdgpu_kernel void @sitofp_i32_to_f16( ; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; -; GFX11-LABEL: sitofp_i32_to_f16: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: s_mov_b32 s6, -1 -; GFX11-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-NEXT: s_mov_b32 s10, s6 -; GFX11-NEXT: s_mov_b32 s11, s7 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s2 -; GFX11-NEXT: s_mov_b32 s9, s3 -; GFX11-NEXT: s_mov_b32 s4, s0 -; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0 -; GFX11-NEXT: s_mov_b32 s5, s1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cvt_f32_i32_e32 v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: sitofp_i32_to_f16: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0 +; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 +; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 +; GFX11-TRUE16-NEXT: s_nop 0 +; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: sitofp_i32_to_f16: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0 +; 
GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 +; GFX11-FAKE16-NEXT: s_nop 0 +; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { entry: @@ -186,29 +229,56 @@ define amdgpu_kernel void @sitofp_v2i16_to_v2f16( ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; -; GFX11-LABEL: sitofp_v2i16_to_v2f16: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: s_mov_b32 s6, -1 -; GFX11-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-NEXT: s_mov_b32 s10, s6 -; GFX11-NEXT: s_mov_b32 s11, s7 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s2 -; GFX11-NEXT: s_mov_b32 s9, s3 -; GFX11-NEXT: s_mov_b32 s4, s0 -; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0 -; GFX11-NEXT: s_mov_b32 s5, s1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX11-NEXT: v_cvt_f16_i16_e32 v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cvt_f16_i16_e32 v1, v1 -; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1 -; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: sitofp_v2i16_to_v2f16: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0 +; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX11-TRUE16-NEXT: v_cvt_f16_i16_e32 v0.l, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cvt_f16_i16_e32 v0.h, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h +; GFX11-TRUE16-NEXT: v_pack_b32_f16 v0, v1, v0 +; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-TRUE16-NEXT: s_nop 0 +; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: sitofp_v2i16_to_v2f16: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0 +; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX11-FAKE16-NEXT: v_cvt_f16_i16_e32 v0, v0 +; GFX11-FAKE16-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cvt_f16_i16_e32 v1, v1 +; GFX11-FAKE16-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-FAKE16-NEXT: s_nop 0 +; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { entry: @@ -264,31 +334,60 @@ define amdgpu_kernel void @sitofp_v2i32_to_v2f16( ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; -; GFX11-LABEL: sitofp_v2i32_to_v2f16: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: s_mov_b32 s6, -1 -; GFX11-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-NEXT: s_mov_b32 s10, s6 -; GFX11-NEXT: s_mov_b32 s11, s7 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s2 -; GFX11-NEXT: s_mov_b32 s9, s3 -; GFX11-NEXT: s_mov_b32 s4, s0 -; GFX11-NEXT: buffer_load_b64 v[0:1], off, s[8:11], 0 -; GFX11-NEXT: s_mov_b32 s5, s1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cvt_f32_i32_e32 v1, v1 -; GFX11-NEXT: v_cvt_f32_i32_e32 v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1 -; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: sitofp_v2i32_to_v2f16: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0 +; GFX11-TRUE16-NEXT: buffer_load_b64 v[0:1], off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GFX11-TRUE16-NEXT: v_cvt_f32_i32_e32 v1, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 +; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_pack_b32_f16 v0, v1, v0 +; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-TRUE16-NEXT: s_nop 0 +; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: sitofp_v2i32_to_v2f16: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0 +; GFX11-FAKE16-NEXT: buffer_load_b64 v[0:1], off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_cvt_f32_i32_e32 v1, v1 +; GFX11-FAKE16-NEXT: 
v_cvt_f32_i32_e32 v0, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-FAKE16-NEXT: s_nop 0 +; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { entry: @@ -353,37 +452,69 @@ define amdgpu_kernel void @s_sint_to_fp_i1_to_f16(ptr addrspace(1) %out, ptr add ; VI-NEXT: buffer_store_short v0, off, s[8:11], 0 ; VI-NEXT: s_endpgm ; -; GFX11-LABEL: s_sint_to_fp_i1_to_f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-NEXT: s_mov_b32 s10, -1 -; GFX11-NEXT: s_mov_b32 s11, 0x31016000 -; GFX11-NEXT: s_mov_b32 s2, s10 -; GFX11-NEXT: s_mov_b32 s3, s11 -; GFX11-NEXT: s_mov_b32 s14, s10 -; GFX11-NEXT: s_mov_b32 s15, s11 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s12, s6 -; GFX11-NEXT: s_mov_b32 s13, s7 -; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0 -; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0 -; GFX11-NEXT: s_mov_b32 s8, s4 -; GFX11-NEXT: s_mov_b32 s9, s5 -; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_cmp_le_f32_e32 vcc_lo, 1.0, v0 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cmp_le_f32_e64 s0, 0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_xor_b32 s0, s0, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: s_sint_to_fp_i1_to_f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_clause 0x1 +; GFX11-TRUE16-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s10 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s11 +; GFX11-TRUE16-NEXT: s_mov_b32 s14, s10 +; GFX11-TRUE16-NEXT: s_mov_b32 s15, s11 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s12, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s13, s7 +; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[0:3], 0 +; GFX11-TRUE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s4 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s5 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-TRUE16-NEXT: v_cmp_le_f32_e32 vcc_lo, 1.0, v0 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_le_f32_e64 s0, 0, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_xor_b32 s0, s0, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 +; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_nop 0 +; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: s_sint_to_fp_i1_to_f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_clause 0x1 +; GFX11-FAKE16-NEXT: s_load_b128 s[4:7], 
s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s2, s10 +; GFX11-FAKE16-NEXT: s_mov_b32 s3, s11 +; GFX11-FAKE16-NEXT: s_mov_b32 s14, s10 +; GFX11-FAKE16-NEXT: s_mov_b32 s15, s11 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s12, s6 +; GFX11-FAKE16-NEXT: s_mov_b32 s13, s7 +; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[0:3], 0 +; GFX11-FAKE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s4 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s5 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-FAKE16-NEXT: v_cmp_le_f32_e32 vcc_lo, 1.0, v0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_le_f32_e64 s0, 0, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_xor_b32 s0, s0, vcc_lo +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_nop 0 +; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FAKE16-NEXT: s_endpgm %a = load float, ptr addrspace(1) %in0 %b = load float, ptr addrspace(1) %in1 %acmp = fcmp oge float %a, 0.000000e+00 diff --git a/llvm/test/CodeGen/AMDGPU/uitofp.f16.ll b/llvm/test/CodeGen/AMDGPU/uitofp.f16.ll index c21ae434f447099b8fce3b64be419b280bae0cbd..c4268c15d9db668de7c120d1fd7448005743cb97 100644 --- a/llvm/test/CodeGen/AMDGPU/uitofp.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/uitofp.f16.ll @@ -1,7 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=SI %s ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=VI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX11 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,+real-true16 -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX11-TRUE16 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,-real-true16 -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX11-FAKE16 %s define amdgpu_kernel void @uitofp_i16_to_f16( ; SI-LABEL: uitofp_i16_to_f16: @@ -41,25 +42,45 @@ define amdgpu_kernel void @uitofp_i16_to_f16( ; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; -; GFX11-LABEL: uitofp_i16_to_f16: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: s_mov_b32 s6, -1 -; GFX11-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-NEXT: s_mov_b32 s10, s6 -; GFX11-NEXT: s_mov_b32 s11, s7 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s2 -; GFX11-NEXT: s_mov_b32 s9, s3 -; GFX11-NEXT: s_mov_b32 s4, s0 -; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0 -; GFX11-NEXT: s_mov_b32 s5, s1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cvt_f16_u16_e32 v0, v0 -; GFX11-NEXT: buffer_store_b16 v0, 
off, s[4:7], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: uitofp_i16_to_f16: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0 +; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_cvt_f16_u16_e32 v0.l, v0.l +; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 +; GFX11-TRUE16-NEXT: s_nop 0 +; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: uitofp_i16_to_f16: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0 +; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_cvt_f16_u16_e32 v0, v0 +; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 +; GFX11-FAKE16-NEXT: s_nop 0 +; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { entry: @@ -108,27 +129,49 @@ define amdgpu_kernel void @uitofp_i32_to_f16( ; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; -; GFX11-LABEL: uitofp_i32_to_f16: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: s_mov_b32 s6, -1 -; GFX11-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-NEXT: s_mov_b32 s10, s6 -; GFX11-NEXT: s_mov_b32 s11, s7 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s2 -; GFX11-NEXT: s_mov_b32 s9, s3 -; GFX11-NEXT: s_mov_b32 s4, s0 -; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0 -; GFX11-NEXT: s_mov_b32 s5, s1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: uitofp_i32_to_f16: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0 +; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 +; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 +; GFX11-TRUE16-NEXT: 
s_nop 0 +; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: uitofp_i32_to_f16: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0 +; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 +; GFX11-FAKE16-NEXT: s_nop 0 +; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { entry: @@ -186,29 +229,56 @@ define amdgpu_kernel void @uitofp_v2i16_to_v2f16( ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; -; GFX11-LABEL: uitofp_v2i16_to_v2f16: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: s_mov_b32 s6, -1 -; GFX11-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-NEXT: s_mov_b32 s10, s6 -; GFX11-NEXT: s_mov_b32 s11, s7 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s2 -; GFX11-NEXT: s_mov_b32 s9, s3 -; GFX11-NEXT: s_mov_b32 s4, s0 -; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0 -; GFX11-NEXT: s_mov_b32 s5, s1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX11-NEXT: v_cvt_f16_u16_e32 v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cvt_f16_u16_e32 v1, v1 -; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1 -; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: uitofp_v2i16_to_v2f16: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0 +; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX11-TRUE16-NEXT: v_cvt_f16_u16_e32 v0.l, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cvt_f16_u16_e32 v0.h, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h +; GFX11-TRUE16-NEXT: v_pack_b32_f16 v0, v1, v0 +; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-TRUE16-NEXT: s_nop 0 +; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: uitofp_v2i16_to_v2f16: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, 
-1 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0 +; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX11-FAKE16-NEXT: v_cvt_f16_u16_e32 v0, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cvt_f16_u16_e32 v1, v1 +; GFX11-FAKE16-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-FAKE16-NEXT: s_nop 0 +; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { entry: @@ -264,31 +334,60 @@ define amdgpu_kernel void @uitofp_v2i32_to_v2f16( ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; -; GFX11-LABEL: uitofp_v2i32_to_v2f16: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: s_mov_b32 s6, -1 -; GFX11-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-NEXT: s_mov_b32 s10, s6 -; GFX11-NEXT: s_mov_b32 s11, s7 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s2 -; GFX11-NEXT: s_mov_b32 s9, s3 -; GFX11-NEXT: s_mov_b32 s4, s0 -; GFX11-NEXT: buffer_load_b64 v[0:1], off, s[8:11], 0 -; GFX11-NEXT: s_mov_b32 s5, s1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cvt_f32_u32_e32 v1, v1 -; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1 -; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: uitofp_v2i32_to_v2f16: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0 +; GFX11-TRUE16-NEXT: buffer_load_b64 v[0:1], off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GFX11-TRUE16-NEXT: v_cvt_f32_u32_e32 v1, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 +; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_pack_b32_f16 v0, v1, v0 +; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-TRUE16-NEXT: s_nop 0 +; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: uitofp_v2i32_to_v2f16: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_load_b128 
s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0 +; GFX11-FAKE16-NEXT: buffer_load_b64 v[0:1], off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_cvt_f32_u32_e32 v1, v1 +; GFX11-FAKE16-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-FAKE16-NEXT: s_nop 0 +; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { entry: @@ -353,37 +452,69 @@ define amdgpu_kernel void @s_uint_to_fp_i1_to_f16(ptr addrspace(1) %out, ptr add ; VI-NEXT: buffer_store_short v0, off, s[8:11], 0 ; VI-NEXT: s_endpgm ; -; GFX11-LABEL: s_uint_to_fp_i1_to_f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-NEXT: s_mov_b32 s10, -1 -; GFX11-NEXT: s_mov_b32 s11, 0x31016000 -; GFX11-NEXT: s_mov_b32 s2, s10 -; GFX11-NEXT: s_mov_b32 s3, s11 -; GFX11-NEXT: s_mov_b32 s14, s10 -; GFX11-NEXT: s_mov_b32 s15, s11 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s12, s6 -; GFX11-NEXT: s_mov_b32 s13, s7 -; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0 -; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0 -; GFX11-NEXT: s_mov_b32 s8, s4 -; GFX11-NEXT: s_mov_b32 s9, s5 -; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_cmp_le_f32_e32 vcc_lo, 1.0, v0 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cmp_le_f32_e64 s0, 0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_xor_b32 s0, s0, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: s_uint_to_fp_i1_to_f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_clause 0x1 +; GFX11-TRUE16-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s10 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s11 +; GFX11-TRUE16-NEXT: s_mov_b32 s14, s10 +; GFX11-TRUE16-NEXT: s_mov_b32 s15, s11 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s12, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s13, s7 +; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[0:3], 0 +; GFX11-TRUE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s4 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s5 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-TRUE16-NEXT: v_cmp_le_f32_e32 vcc_lo, 1.0, v0 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_le_f32_e64 s0, 0, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_xor_b32 s0, s0, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 +; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_nop 0 +; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: s_uint_to_fp_i1_to_f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_clause 0x1 +; GFX11-FAKE16-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s2, s10 +; GFX11-FAKE16-NEXT: s_mov_b32 s3, s11 +; GFX11-FAKE16-NEXT: s_mov_b32 s14, s10 +; GFX11-FAKE16-NEXT: s_mov_b32 s15, s11 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s12, s6 +; GFX11-FAKE16-NEXT: s_mov_b32 s13, s7 +; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[0:3], 0 +; GFX11-FAKE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s4 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s5 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-FAKE16-NEXT: v_cmp_le_f32_e32 vcc_lo, 1.0, v0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_le_f32_e64 s0, 0, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_xor_b32 s0, s0, vcc_lo +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_nop 0 +; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FAKE16-NEXT: s_endpgm %a = load float, ptr addrspace(1) %in0 %b = load float, ptr addrspace(1) %in1 %acmp = fcmp oge float %a, 0.000000e+00 diff --git a/llvm/test/CodeGen/ARM/2011-03-15-LdStMultipleBug.ll b/llvm/test/CodeGen/ARM/2011-03-15-LdStMultipleBug.ll index 233230e416c137f9765696654df29de5a651b464..71a1678669a80728f8ad6859eeee48d2c79002b9 100644 --- a/llvm/test/CodeGen/ARM/2011-03-15-LdStMultipleBug.ll +++ b/llvm/test/CodeGen/ARM/2011-03-15-LdStMultipleBug.ll @@ -9,7 +9,7 @@ @oStruct = external global %struct.Outer, align 4 -define void @main(i8 %val8) nounwind { +define void @main(i8 %val8) nounwind "frame-pointer"="none" { ; CHECK-LABEL: main: ; CHECK: @ %bb.0: @ %for.body.lr.ph ; CHECK-NEXT: movw r0, :lower16:(L_oStruct$non_lazy_ptr-(LPC0_0+4)) diff --git a/llvm/test/CodeGen/ARM/2011-12-19-sjlj-clobber.ll b/llvm/test/CodeGen/ARM/2011-12-19-sjlj-clobber.ll index 6728b9d4584c2677842fda4546826ba025564e1c..af6ccdc8f4e191f7102f87f58d5a923074b79da3 100644 --- a/llvm/test/CodeGen/ARM/2011-12-19-sjlj-clobber.ll +++ b/llvm/test/CodeGen/ARM/2011-12-19-sjlj-clobber.ll @@ -3,7 +3,7 @@ ; Radar 10567930: Make sure that all the caller-saved registers are saved and ; restored in a function with setjmp/longjmp EH. In particular, r6 was not ; being saved here. 
-; CHECK: push {r4, r5, r6, r7, lr} +; CHECK: push.w {r4, r5, r6, r7, r8, r10, r11, lr} %0 = type opaque %struct.NSConstantString = type { ptr, i32, ptr, i32 } diff --git a/llvm/test/CodeGen/ARM/arm-shrink-wrapping.ll b/llvm/test/CodeGen/ARM/arm-shrink-wrapping.ll index aa79e4156dac1155e6bef30f6e848ba90cc7e009..b6adc995091ceaba56b0d7b07bf7fe16e64ae239 100644 --- a/llvm/test/CodeGen/ARM/arm-shrink-wrapping.ll +++ b/llvm/test/CodeGen/ARM/arm-shrink-wrapping.ll @@ -1732,7 +1732,7 @@ if.end: ; Another infinite loop test this time with two nested infinite loop. ; infiniteloop3 ; bx lr -define void @infiniteloop3() "frame-pointer"="all" { +define void @infiniteloop3() "frame-pointer"="none" { ; ARM-LABEL: infiniteloop3: ; ARM: @ %bb.0: @ %entry ; ARM-NEXT: mov r0, #0 diff --git a/llvm/test/CodeGen/ARM/atomic-load-store.ll b/llvm/test/CodeGen/ARM/atomic-load-store.ll index 14e49bf3c9376aeb8a8469843ea05f5bd03bd789..560dfde356c29d9e393ebf86aea6b7184f8a3a2f 100644 --- a/llvm/test/CodeGen/ARM/atomic-load-store.ll +++ b/llvm/test/CodeGen/ARM/atomic-load-store.ll @@ -324,18 +324,17 @@ define void @test_old_store_64bit(ptr %p, i64 %v) { ; ; ARMOPTNONE-LABEL: test_old_store_64bit: ; ARMOPTNONE: @ %bb.0: -; ARMOPTNONE-NEXT: push {r4, r5, r7, lr} -; ARMOPTNONE-NEXT: add r7, sp, #8 -; ARMOPTNONE-NEXT: push {r8, r10, r11} -; ARMOPTNONE-NEXT: sub sp, sp, #24 -; ARMOPTNONE-NEXT: str r0, [sp, #4] @ 4-byte Spill -; ARMOPTNONE-NEXT: str r2, [sp, #8] @ 4-byte Spill -; ARMOPTNONE-NEXT: str r1, [sp, #12] @ 4-byte Spill -; ARMOPTNONE-NEXT: dmb ish -; ARMOPTNONE-NEXT: ldr r1, [r0] -; ARMOPTNONE-NEXT: ldr r0, [r0, #4] -; ARMOPTNONE-NEXT: str r1, [sp, #16] @ 4-byte Spill -; ARMOPTNONE-NEXT: str r0, [sp, #20] @ 4-byte Spill +; ARMOPTNONE-NEXT: push {r4, r5, r7, r8, r10, r11, lr} +; ARMOPTNONE-NEXT: add r7, sp, #20 +; ARMOPTNONE-NEXT: sub sp, sp, #24 +; ARMOPTNONE-NEXT: str r0, [sp, #4] @ 4-byte Spill +; ARMOPTNONE-NEXT: str r2, [sp, #8] @ 4-byte Spill +; ARMOPTNONE-NEXT: str r1, [sp, #12] @ 4-byte Spill +; ARMOPTNONE-NEXT: dmb ish +; ARMOPTNONE-NEXT: ldr r1, [r0] +; ARMOPTNONE-NEXT: ldr r0, [r0, #4] +; ARMOPTNONE-NEXT: str r1, [sp, #16] @ 4-byte Spill +; ARMOPTNONE-NEXT: str r0, [sp, #20] @ 4-byte Spill ; ARMOPTNONE-NEXT: b LBB5_1 ; ARMOPTNONE-NEXT: LBB5_1: @ %atomicrmw.start ; ARMOPTNONE-NEXT: @ =>This Loop Header: Depth=1 @@ -382,8 +381,7 @@ define void @test_old_store_64bit(ptr %p, i64 %v) { ; ARMOPTNONE-NEXT: LBB5_5: @ %atomicrmw.end ; ARMOPTNONE-NEXT: dmb ish ; ARMOPTNONE-NEXT: sub sp, r7, #20 -; ARMOPTNONE-NEXT: pop {r8, r10, r11} -; ARMOPTNONE-NEXT: pop {r4, r5, r7, pc} +; ARMOPTNONE-NEXT: pop {r4, r5, r7, r8, r10, r11, pc} ; ; THUMBTWO-LABEL: test_old_store_64bit: ; THUMBTWO: @ %bb.0: @@ -864,20 +862,19 @@ define void @store_atomic_f64__seq_cst(ptr %ptr, double %val1) { ; ; ARMOPTNONE-LABEL: store_atomic_f64__seq_cst: ; ARMOPTNONE: @ %bb.0: -; ARMOPTNONE-NEXT: push {r4, r5, r7, lr} -; ARMOPTNONE-NEXT: add r7, sp, #8 -; ARMOPTNONE-NEXT: push {r8, r10, r11} -; ARMOPTNONE-NEXT: sub sp, sp, #24 -; ARMOPTNONE-NEXT: str r0, [sp, #4] @ 4-byte Spill -; ARMOPTNONE-NEXT: vmov d16, r1, r2 -; ARMOPTNONE-NEXT: vmov r1, r2, d16 -; ARMOPTNONE-NEXT: str r2, [sp, #8] @ 4-byte Spill -; ARMOPTNONE-NEXT: str r1, [sp, #12] @ 4-byte Spill -; ARMOPTNONE-NEXT: dmb ish -; ARMOPTNONE-NEXT: ldr r1, [r0] -; ARMOPTNONE-NEXT: ldr r0, [r0, #4] -; ARMOPTNONE-NEXT: str r1, [sp, #16] @ 4-byte Spill -; ARMOPTNONE-NEXT: str r0, [sp, #20] @ 4-byte Spill +; ARMOPTNONE-NEXT: push {r4, r5, r7, r8, r10, r11, lr} +; ARMOPTNONE-NEXT: add r7, sp, #20 +; 
ARMOPTNONE-NEXT: sub sp, sp, #24 +; ARMOPTNONE-NEXT: str r0, [sp, #4] @ 4-byte Spill +; ARMOPTNONE-NEXT: vmov d16, r1, r2 +; ARMOPTNONE-NEXT: vmov r1, r2, d16 +; ARMOPTNONE-NEXT: str r2, [sp, #8] @ 4-byte Spill +; ARMOPTNONE-NEXT: str r1, [sp, #12] @ 4-byte Spill +; ARMOPTNONE-NEXT: dmb ish +; ARMOPTNONE-NEXT: ldr r1, [r0] +; ARMOPTNONE-NEXT: ldr r0, [r0, #4] +; ARMOPTNONE-NEXT: str r1, [sp, #16] @ 4-byte Spill +; ARMOPTNONE-NEXT: str r0, [sp, #20] @ 4-byte Spill ; ARMOPTNONE-NEXT: b LBB13_1 ; ARMOPTNONE-NEXT: LBB13_1: @ %atomicrmw.start ; ARMOPTNONE-NEXT: @ =>This Loop Header: Depth=1 @@ -924,8 +921,7 @@ define void @store_atomic_f64__seq_cst(ptr %ptr, double %val1) { ; ARMOPTNONE-NEXT: LBB13_5: @ %atomicrmw.end ; ARMOPTNONE-NEXT: dmb ish ; ARMOPTNONE-NEXT: sub sp, r7, #20 -; ARMOPTNONE-NEXT: pop {r8, r10, r11} -; ARMOPTNONE-NEXT: pop {r4, r5, r7, pc} +; ARMOPTNONE-NEXT: pop {r4, r5, r7, r8, r10, r11, pc} ; ; THUMBTWO-LABEL: store_atomic_f64__seq_cst: ; THUMBTWO: @ %bb.0: diff --git a/llvm/test/CodeGen/ARM/call-tc.ll b/llvm/test/CodeGen/ARM/call-tc.ll index 18d83bdc03e22f8113d67304e04edd7089da2d2f..9c70bac0322fe1b818f9a37b3fab2a6dea8a6b6d 100644 --- a/llvm/test/CodeGen/ARM/call-tc.ll +++ b/llvm/test/CodeGen/ARM/call-tc.ll @@ -17,7 +17,7 @@ define void @t1() "frame-pointer"="all" { ret void } -define void @t2() "frame-pointer"="all" { +define void @t2() "frame-pointer"="none" { ; CHECKV6-LABEL: t2: ; CHECKV6: bx r0 ; CHECKT2D-LABEL: t2: @@ -102,7 +102,7 @@ bb: ; Make sure codegenprep is duplicating ret instructions to enable tail calls. ; rdar://11140249 -define i32 @t8(i32 %x) nounwind ssp "frame-pointer"="all" { +define i32 @t8(i32 %x) nounwind ssp "frame-pointer"="none" { entry: ; CHECKT2D-LABEL: t8: ; CHECKT2D-NOT: push diff --git a/llvm/test/CodeGen/ARM/debug-frame.ll b/llvm/test/CodeGen/ARM/debug-frame.ll index faeafdf45dc39226871b78c998562b4897edbd8d..72e7cfcab487a76bdcaac5e570d188eaa47a96ab 100644 --- a/llvm/test/CodeGen/ARM/debug-frame.ll +++ b/llvm/test/CodeGen/ARM/debug-frame.ll @@ -526,7 +526,7 @@ entry: ; Test 4 ;------------------------------------------------------------------------------- -define void @test4() nounwind { +define void @test4() nounwind "frame-pointer"="none" { entry: ret void } diff --git a/llvm/test/CodeGen/ARM/ehabi.ll b/llvm/test/CodeGen/ARM/ehabi.ll index fea497076030f1456a48fc3d9e5e67800aae5a07..d1a4e9a6bccad046368d21f12b5dbb6047f5c177 100644 --- a/llvm/test/CodeGen/ARM/ehabi.ll +++ b/llvm/test/CodeGen/ARM/ehabi.ll @@ -575,7 +575,7 @@ entry: ; Test 4 ;------------------------------------------------------------------------------- -define void @test4() nounwind { +define void @test4() nounwind "frame-pointer"="none" { entry: ret void } diff --git a/llvm/test/CodeGen/ARM/fast-isel-frameaddr.ll b/llvm/test/CodeGen/ARM/fast-isel-frameaddr.ll index e29ddd52f3d024c88e6e50893886e3f430f887e9..8a7bfbe4290026f27ce7d1f0fc97ea31abf9de44 100644 --- a/llvm/test/CodeGen/ARM/fast-isel-frameaddr.ll +++ b/llvm/test/CodeGen/ARM/fast-isel-frameaddr.ll @@ -16,7 +16,7 @@ entry: ; DARWIN-THUMB2: mov r0, r7 ; LINUX-ARM-LABEL: frameaddr_index0: -; LINUX-ARM: push {r11, lr} +; LINUX-ARM: push {r11} ; LINUX-ARM: mov r11, sp ; LINUX-ARM: mov r0, r11 @@ -42,7 +42,7 @@ entry: ; DARWIN-THUMB2: ldr r0, [r7] ; LINUX-ARM-LABEL: frameaddr_index1: -; LINUX-ARM: push {r11, lr} +; LINUX-ARM: push {r11} ; LINUX-ARM: mov r11, sp ; LINUX-ARM: ldr r0, [r11] @@ -73,7 +73,7 @@ entry: ; DARWIN-THUMB2: ldr r0, [r0] ; LINUX-ARM-LABEL: frameaddr_index3: -; LINUX-ARM: push {r11, lr} +; LINUX-ARM: push 
{r11} ; LINUX-ARM: mov r11, sp ; LINUX-ARM: ldr r0, [r11] ; LINUX-ARM: ldr r0, [r0] diff --git a/llvm/test/CodeGen/ARM/frame-chain.ll b/llvm/test/CodeGen/ARM/frame-chain.ll index e37213e4aaf8b8ad9d5797b73c2d29df9454dd70..7b722cd5fcef2465cfa2e16292be540c6d000553 100644 --- a/llvm/test/CodeGen/ARM/frame-chain.ll +++ b/llvm/test/CodeGen/ARM/frame-chain.ll @@ -10,11 +10,14 @@ define dso_local noundef i32 @leaf(i32 noundef %0) { ; LEAF-FP-LABEL: leaf: ; LEAF-FP: @ %bb.0: -; LEAF-FP-NEXT: .pad #4 -; LEAF-FP-NEXT: sub sp, sp, #4 -; LEAF-FP-NEXT: str r0, [sp] +; LEAF-FP-NEXT: .save {r11, lr} +; LEAF-FP-NEXT: push {r11, lr} +; LEAF-FP-NEXT: .setfp r11, sp +; LEAF-FP-NEXT: mov r11, sp +; LEAF-FP-NEXT: push {r0} ; LEAF-FP-NEXT: add r0, r0, #4 -; LEAF-FP-NEXT: add sp, sp, #4 +; LEAF-FP-NEXT: mov sp, r11 +; LEAF-FP-NEXT: pop {r11, lr} ; LEAF-FP-NEXT: mov pc, lr ; ; LEAF-FP-AAPCS-LABEL: leaf: diff --git a/llvm/test/CodeGen/ARM/ifcvt5.ll b/llvm/test/CodeGen/ARM/ifcvt5.ll index dc9a3400b691acc88016099d4acf7117a01fe4a3..30a92eb34989a690baece62b0c2b8066ebedbc5e 100644 --- a/llvm/test/CodeGen/ARM/ifcvt5.ll +++ b/llvm/test/CodeGen/ARM/ifcvt5.ll @@ -5,7 +5,7 @@ @x = external global ptr ; [#uses=1] -define void @foo(i32 %a) "frame-pointer"="all" { +define void @foo(i32 %a) "frame-pointer"="none" { ; A8-LABEL: foo: ; A8: @ %bb.0: @ %entry ; A8-NEXT: movw r1, :lower16:(L_x$non_lazy_ptr-(LPC0_0+8)) diff --git a/llvm/test/CodeGen/ARM/ldrd.ll b/llvm/test/CodeGen/ARM/ldrd.ll index cf5c2dfe5ef60baccb4217c5ba1b0ea81df01e0c..3cf10f0e64b4d11510a927b797d2c7d9606c6a21 100644 --- a/llvm/test/CodeGen/ARM/ldrd.ll +++ b/llvm/test/CodeGen/ARM/ldrd.ll @@ -168,7 +168,7 @@ define void @ldrd_postupdate_inc(ptr %p0) "frame-pointer"="all" { ; NORMAL: strd r1, r2, [r0], #-8 ; CONSERVATIVE-NOT: strd ; CHECK: bx lr -define ptr @strd_postupdate_dec(ptr %p0, i32 %v0, i32 %v1) "frame-pointer"="all" { +define ptr @strd_postupdate_dec(ptr %p0, i32 %v0, i32 %v1) "frame-pointer"="none" { %p0.1 = getelementptr i32, ptr %p0, i32 1 store i32 %v0, ptr %p0 store i32 %v1, ptr %p0.1 @@ -180,7 +180,7 @@ define ptr @strd_postupdate_dec(ptr %p0, i32 %v0, i32 %v1) "frame-pointer"="all" ; NORMAL: strd r1, r2, [r0], #8 ; CONSERVATIVE-NOT: strd ; CHECK: bx lr -define ptr @strd_postupdate_inc(ptr %p0, i32 %v0, i32 %v1) "frame-pointer"="all" { +define ptr @strd_postupdate_inc(ptr %p0, i32 %v0, i32 %v1) "frame-pointer"="none" { %p0.1 = getelementptr i32, ptr %p0, i32 1 store i32 %v0, ptr %p0 store i32 %v1, ptr %p0.1 diff --git a/llvm/test/CodeGen/ARM/naked-fn-with-frame-pointer.ll b/llvm/test/CodeGen/ARM/naked-fn-with-frame-pointer.ll new file mode 100644 index 0000000000000000000000000000000000000000..2bdc7d3e29b981af615f47c87d8c71c5c2cf94a8 --- /dev/null +++ b/llvm/test/CodeGen/ARM/naked-fn-with-frame-pointer.ll @@ -0,0 +1,55 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple arm | FileCheck %s -check-prefixes=CHECK-ALE +; RUN: llc < %s -mtriple armeb | FileCheck %s -check-prefixes=CHECK-ABE +; RUN: llc < %s -mtriple thumb | FileCheck %s -check-prefixes=CHECK-TLE +; RUN: llc < %s -mtriple thumbeb | FileCheck %s -check-prefixes=CHECK-TBE + +declare dso_local void @main() + +define dso_local void @naked() naked "frame-pointer"="all" { +; CHECK-ALE-LABEL: naked: +; CHECK-ALE: @ %bb.0: +; CHECK-ALE-NEXT: bl main +; +; CHECK-ABE-LABEL: naked: +; CHECK-ABE: @ %bb.0: +; CHECK-ABE-NEXT: bl main +; +; CHECK-TLE-LABEL: naked: +; CHECK-TLE: @ %bb.0: +; CHECK-TLE-NEXT: bl main +; +; 
CHECK-TBE-LABEL: naked: +; CHECK-TBE: @ %bb.0: +; CHECK-TBE-NEXT: bl main + call void @main() + unreachable +} + +define dso_local void @normal() "frame-pointer"="all" { +; CHECK-ALE-LABEL: normal: +; CHECK-ALE: @ %bb.0: +; CHECK-ALE-NEXT: push {r11, lr} +; CHECK-ALE-NEXT: mov r11, sp +; CHECK-ALE-NEXT: bl main +; +; CHECK-ABE-LABEL: normal: +; CHECK-ABE: @ %bb.0: +; CHECK-ABE-NEXT: push {r11, lr} +; CHECK-ABE-NEXT: mov r11, sp +; CHECK-ABE-NEXT: bl main +; +; CHECK-TLE-LABEL: normal: +; CHECK-TLE: @ %bb.0: +; CHECK-TLE-NEXT: push {r7, lr} +; CHECK-TLE-NEXT: add r7, sp, #0 +; CHECK-TLE-NEXT: bl main +; +; CHECK-TBE-LABEL: normal: +; CHECK-TBE: @ %bb.0: +; CHECK-TBE-NEXT: push {r7, lr} +; CHECK-TBE-NEXT: add r7, sp, #0 +; CHECK-TBE-NEXT: bl main + call void @main() + unreachable +} diff --git a/llvm/test/CodeGen/ARM/stack-frame-layout-remarks.ll b/llvm/test/CodeGen/ARM/stack-frame-layout-remarks.ll index c76dc24bae7e897c4656fcfd958f73b84f30a027..ea059e49c9f9eee2ebee0bf6279bb7081424d816 100644 --- a/llvm/test/CodeGen/ARM/stack-frame-layout-remarks.ll +++ b/llvm/test/CodeGen/ARM/stack-frame-layout-remarks.ll @@ -51,7 +51,7 @@ declare void @llvm.dbg.declare(metadata, metadata, metadata) #0 ; BOTH: [SP-8]{{.+}}8{{.+}}4 ; DEBUG: a @ dot.c:13 ; STRIPPED-NOT: a @ dot.c:13 -define void @cleanup_array(ptr %0) #1 { +define void @cleanup_array(ptr %0) #3 { %2 = alloca ptr, align 8 store ptr %0, ptr %2, align 8 call void @llvm.dbg.declare(metadata ptr %2, metadata !41, metadata !DIExpression()), !dbg !46 @@ -62,7 +62,7 @@ define void @cleanup_array(ptr %0) #1 { ; BOTH: [SP-8]{{.+}}8{{.+}}4 ; DEBUG: res @ dot.c:21 ; STRIPPED-NOT: res @ dot.c:21 -define void @cleanup_result(ptr %0) #1 { +define void @cleanup_result(ptr %0) #3 { %2 = alloca ptr, align 8 store ptr %0, ptr %2, align 8 call void @llvm.dbg.declare(metadata ptr %2, metadata !47, metadata !DIExpression()), !dbg !51 @@ -92,7 +92,7 @@ define void @cleanup_result(ptr %0) #1 { ; BOTH: [SP-40]{{.+}}4{{.+}}4 ; DEBUG: i @ dot.c:55 ; STRIPPED-NOT: i @ dot.c:55 -define i32 @do_work(ptr %0, ptr %1, ptr %2) #1 { +define i32 @do_work(ptr %0, ptr %1, ptr %2) #3 { %4 = alloca i32, align 4 %5 = alloca ptr, align 8 %6 = alloca ptr, align 8 @@ -144,7 +144,7 @@ define i32 @do_work(ptr %0, ptr %1, ptr %2) #1 { ; BOTH: [SP-20]{{.+}}4{{.*}}4 ; DEBUG: i @ dot.c:69 ; STRIPPED-NOT: i @ dot.c:69 -define ptr @gen_array(i32 %0) #1 { +define ptr @gen_array(i32 %0) #3 { %2 = alloca ptr, align 8 %3 = alloca i32, align 4 %4 = alloca ptr, align 8 @@ -227,6 +227,7 @@ uselistorder ptr @llvm.dbg.declare, { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, attributes #0 = { nocallback nofree nosync nounwind readnone speculatable willreturn } attributes #1 = { "frame-pointer"="all" } attributes #2 = { ssp "stack-protector-buffer-size"="5" "frame-pointer"="all" } +attributes #3 = { "frame-pointer"="none" } !llvm.dbg.cu = !{!0, !2} !llvm.module.flags = !{!18, !19, !20, !21, !22, !23, !24} diff --git a/llvm/test/CodeGen/ARM/stack-size-section.ll b/llvm/test/CodeGen/ARM/stack-size-section.ll index fb23e358d856ee4cd1b13d580c42cf25147186c8..8272389719a69121cd81dcfcccd453e9e2377162 100644 --- a/llvm/test/CodeGen/ARM/stack-size-section.ll +++ b/llvm/test/CodeGen/ARM/stack-size-section.ll @@ -29,4 +29,4 @@ define void @dynalloc(i32 %N) #0 { ret void } -attributes #0 = { "frame-pointer"="all" } +attributes #0 = { "frame-pointer"="none" } diff --git a/llvm/test/CodeGen/ARM/swifterror.ll b/llvm/test/CodeGen/ARM/swifterror.ll index 
4f950ba68760802996e8ef10ca42fedb77f0034f..f002c54fc60c0f68294a85e14d7d2344149b42b6 100644 --- a/llvm/test/CodeGen/ARM/swifterror.ll +++ b/llvm/test/CodeGen/ARM/swifterror.ll @@ -79,18 +79,17 @@ define float @caller(ptr %error_ref) { ; ; CHECK-O0-LABEL: caller: ; CHECK-O0: @ %bb.0: @ %entry -; CHECK-O0-NEXT: push {r7, lr} -; CHECK-O0-NEXT: mov r7, sp -; CHECK-O0-NEXT: push {r8} -; CHECK-O0-NEXT: sub sp, sp, #12 +; CHECK-O0-NEXT: push {r7, r8, lr} +; CHECK-O0-NEXT: add r7, sp, #4 +; CHECK-O0-NEXT: sub sp, sp, #12 ; CHECK-O0-NEXT: @ implicit-def: $r1 -; CHECK-O0-NEXT: str r0, [sp] @ 4-byte Spill -; CHECK-O0-NEXT: mov r8, #0 -; CHECK-O0-NEXT: bl _foo -; CHECK-O0-NEXT: str r8, [sp, #4] @ 4-byte Spill -; CHECK-O0-NEXT: movw r0, #0 -; CHECK-O0-NEXT: cmp r8, r0 -; CHECK-O0-NEXT: bne LBB1_2 +; CHECK-O0-NEXT: str r0, [sp] @ 4-byte Spill +; CHECK-O0-NEXT: mov r8, #0 +; CHECK-O0-NEXT: bl _foo +; CHECK-O0-NEXT: str r8, [sp, #4] @ 4-byte Spill +; CHECK-O0-NEXT: movw r0, #0 +; CHECK-O0-NEXT: cmp r8, r0 +; CHECK-O0-NEXT: bne LBB1_2 ; CHECK-O0-NEXT: @ %bb.1: @ %cont ; CHECK-O0-NEXT: ldr r1, [sp] @ 4-byte Reload ; CHECK-O0-NEXT: ldr r0, [sp, #4] @ 4-byte Reload @@ -101,8 +100,7 @@ define float @caller(ptr %error_ref) { ; CHECK-O0-NEXT: bl _free ; CHECK-O0-NEXT: mov r0, #1065353216 ; CHECK-O0-NEXT: sub sp, r7, #4 -; CHECK-O0-NEXT: pop {r8} -; CHECK-O0-NEXT: pop {r7, pc} +; CHECK-O0-NEXT: pop {r7, r8, pc} ; ; CHECK-ANDROID-LABEL: caller: ; CHECK-ANDROID: @ %bb.0: @ %entry @@ -176,12 +174,11 @@ define float @caller2(ptr %error_ref) { ; ; CHECK-O0-LABEL: caller2: ; CHECK-O0: @ %bb.0: @ %entry -; CHECK-O0-NEXT: push {r7, lr} -; CHECK-O0-NEXT: mov r7, sp -; CHECK-O0-NEXT: push {r8} -; CHECK-O0-NEXT: sub sp, sp, #16 +; CHECK-O0-NEXT: push {r7, r8, lr} +; CHECK-O0-NEXT: add r7, sp, #4 +; CHECK-O0-NEXT: sub sp, sp, #16 ; CHECK-O0-NEXT: @ implicit-def: $r1 -; CHECK-O0-NEXT: str r0, [sp, #8] @ 4-byte Spill +; CHECK-O0-NEXT: str r0, [sp, #8] @ 4-byte Spill ; CHECK-O0-NEXT: LBB2_1: @ %bb_loop ; CHECK-O0-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-O0-NEXT: mov r8, #0 @@ -209,8 +206,7 @@ define float @caller2(ptr %error_ref) { ; CHECK-O0-NEXT: bl _free ; CHECK-O0-NEXT: mov r0, #1065353216 ; CHECK-O0-NEXT: sub sp, r7, #4 -; CHECK-O0-NEXT: pop {r8} -; CHECK-O0-NEXT: pop {r7, pc} +; CHECK-O0-NEXT: pop {r7, r8, pc} ; ; CHECK-ANDROID-LABEL: caller2: ; CHECK-ANDROID: @ %bb.0: @ %entry @@ -585,21 +581,20 @@ define float @caller3(ptr %error_ref) { ; ; CHECK-O0-LABEL: caller3: ; CHECK-O0: @ %bb.0: @ %entry -; CHECK-O0-NEXT: push {r7, lr} -; CHECK-O0-NEXT: mov r7, sp -; CHECK-O0-NEXT: push {r8} -; CHECK-O0-NEXT: sub sp, sp, #44 -; CHECK-O0-NEXT: bfc sp, #0, #3 +; CHECK-O0-NEXT: push {r7, r8, lr} +; CHECK-O0-NEXT: add r7, sp, #4 +; CHECK-O0-NEXT: sub sp, sp, #44 +; CHECK-O0-NEXT: bfc sp, #0, #3 ; CHECK-O0-NEXT: @ implicit-def: $r1 -; CHECK-O0-NEXT: str r0, [sp, #4] @ 4-byte Spill -; CHECK-O0-NEXT: mov r8, #0 -; CHECK-O0-NEXT: add r0, sp, #16 -; CHECK-O0-NEXT: mov r1, #1 -; CHECK-O0-NEXT: bl _foo_sret -; CHECK-O0-NEXT: str r8, [sp, #8] @ 4-byte Spill -; CHECK-O0-NEXT: movw r0, #0 -; CHECK-O0-NEXT: cmp r8, r0 -; CHECK-O0-NEXT: bne LBB6_2 +; CHECK-O0-NEXT: str r0, [sp, #4] @ 4-byte Spill +; CHECK-O0-NEXT: mov r8, #0 +; CHECK-O0-NEXT: add r0, sp, #16 +; CHECK-O0-NEXT: mov r1, #1 +; CHECK-O0-NEXT: bl _foo_sret +; CHECK-O0-NEXT: str r8, [sp, #8] @ 4-byte Spill +; CHECK-O0-NEXT: movw r0, #0 +; CHECK-O0-NEXT: cmp r8, r0 +; CHECK-O0-NEXT: bne LBB6_2 ; CHECK-O0-NEXT: @ %bb.1: @ %cont ; CHECK-O0-NEXT: ldr r1, [sp, #4] @ 4-byte Reload ; 
CHECK-O0-NEXT: ldr r0, [sp, #8] @ 4-byte Reload @@ -610,8 +605,7 @@ define float @caller3(ptr %error_ref) { ; CHECK-O0-NEXT: bl _free ; CHECK-O0-NEXT: mov r0, #1065353216 ; CHECK-O0-NEXT: sub sp, r7, #4 -; CHECK-O0-NEXT: pop {r8} -; CHECK-O0-NEXT: pop {r7, pc} +; CHECK-O0-NEXT: pop {r7, r8, pc} ; ; CHECK-ANDROID-LABEL: caller3: ; CHECK-ANDROID: @ %bb.0: @ %entry @@ -809,27 +803,26 @@ define float @caller4(ptr %error_ref) { ; ; CHECK-O0-LABEL: caller4: ; CHECK-O0: @ %bb.0: @ %entry -; CHECK-O0-NEXT: push {r7, lr} -; CHECK-O0-NEXT: mov r7, sp -; CHECK-O0-NEXT: push {r8} -; CHECK-O0-NEXT: sub sp, sp, #24 +; CHECK-O0-NEXT: push {r7, r8, lr} +; CHECK-O0-NEXT: add r7, sp, #4 +; CHECK-O0-NEXT: sub sp, sp, #24 ; CHECK-O0-NEXT: @ implicit-def: $r1 -; CHECK-O0-NEXT: str r0, [sp] @ 4-byte Spill -; CHECK-O0-NEXT: mov r8, #0 -; CHECK-O0-NEXT: mov r0, #10 -; CHECK-O0-NEXT: str r0, [r7, #-12] -; CHECK-O0-NEXT: mov r0, #11 -; CHECK-O0-NEXT: str r0, [sp, #12] -; CHECK-O0-NEXT: mov r0, #12 -; CHECK-O0-NEXT: str r0, [sp, #8] -; CHECK-O0-NEXT: ldr r0, [r7, #-12] -; CHECK-O0-NEXT: ldr r1, [sp, #12] -; CHECK-O0-NEXT: ldr r2, [sp, #8] -; CHECK-O0-NEXT: bl _foo_vararg -; CHECK-O0-NEXT: str r8, [sp, #4] @ 4-byte Spill -; CHECK-O0-NEXT: movw r0, #0 -; CHECK-O0-NEXT: cmp r8, r0 -; CHECK-O0-NEXT: bne LBB8_2 +; CHECK-O0-NEXT: str r0, [sp] @ 4-byte Spill +; CHECK-O0-NEXT: mov r8, #0 +; CHECK-O0-NEXT: mov r0, #10 +; CHECK-O0-NEXT: str r0, [r7, #-12] +; CHECK-O0-NEXT: mov r0, #11 +; CHECK-O0-NEXT: str r0, [sp, #12] +; CHECK-O0-NEXT: mov r0, #12 +; CHECK-O0-NEXT: str r0, [sp, #8] +; CHECK-O0-NEXT: ldr r0, [r7, #-12] +; CHECK-O0-NEXT: ldr r1, [sp, #12] +; CHECK-O0-NEXT: ldr r2, [sp, #8] +; CHECK-O0-NEXT: bl _foo_vararg +; CHECK-O0-NEXT: str r8, [sp, #4] @ 4-byte Spill +; CHECK-O0-NEXT: movw r0, #0 +; CHECK-O0-NEXT: cmp r8, r0 +; CHECK-O0-NEXT: bne LBB8_2 ; CHECK-O0-NEXT: @ %bb.1: @ %cont ; CHECK-O0-NEXT: ldr r1, [sp] @ 4-byte Reload ; CHECK-O0-NEXT: ldr r0, [sp, #4] @ 4-byte Reload @@ -840,8 +833,7 @@ define float @caller4(ptr %error_ref) { ; CHECK-O0-NEXT: bl _free ; CHECK-O0-NEXT: mov r0, #1065353216 ; CHECK-O0-NEXT: sub sp, r7, #4 -; CHECK-O0-NEXT: pop {r8} -; CHECK-O0-NEXT: pop {r7, pc} +; CHECK-O0-NEXT: pop {r7, r8, pc} ; ; CHECK-ANDROID-LABEL: caller4: ; CHECK-ANDROID: @ %bb.0: @ %entry @@ -995,14 +987,12 @@ define swiftcc void @swifterror_reg_clobber(ptr nocapture %err) { ; ; CHECK-O0-LABEL: swifterror_reg_clobber: ; CHECK-O0: @ %bb.0: -; CHECK-O0-NEXT: push {r7, lr} -; CHECK-O0-NEXT: mov r7, sp -; CHECK-O0-NEXT: push {r8} +; CHECK-O0-NEXT: push {r7, r8, lr} +; CHECK-O0-NEXT: add r7, sp, #4 ; CHECK-O0-NEXT: @ InlineAsm Start ; CHECK-O0-NEXT: nop ; CHECK-O0-NEXT: @ InlineAsm End -; CHECK-O0-NEXT: pop {r8} -; CHECK-O0-NEXT: pop {r7, pc} +; CHECK-O0-NEXT: pop {r7, r8, pc} ; ; CHECK-ANDROID-LABEL: swifterror_reg_clobber: ; CHECK-ANDROID: @ %bb.0: @@ -1048,36 +1038,34 @@ define swiftcc void @params_in_reg(i32, i32, i32, i32, ptr swiftself, ptr nocapt ; ; CHECK-O0-LABEL: params_in_reg: ; CHECK-O0: @ %bb.0: -; CHECK-O0-NEXT: push {r7, lr} -; CHECK-O0-NEXT: mov r7, sp -; CHECK-O0-NEXT: push {r10} -; CHECK-O0-NEXT: sub sp, sp, #28 -; CHECK-O0-NEXT: bfc sp, #0, #3 -; CHECK-O0-NEXT: str r8, [sp, #20] @ 4-byte Spill -; CHECK-O0-NEXT: str r10, [sp] @ 4-byte Spill -; CHECK-O0-NEXT: str r3, [sp, #16] @ 4-byte Spill -; CHECK-O0-NEXT: str r2, [sp, #12] @ 4-byte Spill -; CHECK-O0-NEXT: str r1, [sp, #8] @ 4-byte Spill -; CHECK-O0-NEXT: str r0, [sp, #4] @ 4-byte Spill +; CHECK-O0-NEXT: push {r7, r10, lr} +; CHECK-O0-NEXT: add r7, sp, 
#4 +; CHECK-O0-NEXT: sub sp, sp, #28 +; CHECK-O0-NEXT: bfc sp, #0, #3 +; CHECK-O0-NEXT: str r8, [sp, #20] @ 4-byte Spill +; CHECK-O0-NEXT: str r10, [sp] @ 4-byte Spill +; CHECK-O0-NEXT: str r3, [sp, #16] @ 4-byte Spill +; CHECK-O0-NEXT: str r2, [sp, #12] @ 4-byte Spill +; CHECK-O0-NEXT: str r1, [sp, #8] @ 4-byte Spill +; CHECK-O0-NEXT: str r0, [sp, #4] @ 4-byte Spill ; CHECK-O0-NEXT: @ implicit-def: $r0 -; CHECK-O0-NEXT: mov r8, #0 -; CHECK-O0-NEXT: mov r0, #1 -; CHECK-O0-NEXT: mov r1, #2 -; CHECK-O0-NEXT: mov r2, #3 -; CHECK-O0-NEXT: mov r3, #4 -; CHECK-O0-NEXT: mov r10, r8 -; CHECK-O0-NEXT: bl _params_in_reg2 -; CHECK-O0-NEXT: ldr r10, [sp] @ 4-byte Reload -; CHECK-O0-NEXT: ldr r0, [sp, #4] @ 4-byte Reload -; CHECK-O0-NEXT: ldr r1, [sp, #8] @ 4-byte Reload -; CHECK-O0-NEXT: ldr r2, [sp, #12] @ 4-byte Reload -; CHECK-O0-NEXT: ldr r3, [sp, #16] @ 4-byte Reload -; CHECK-O0-NEXT: mov r9, r8 -; CHECK-O0-NEXT: ldr r8, [sp, #20] @ 4-byte Reload -; CHECK-O0-NEXT: bl _params_in_reg2 -; CHECK-O0-NEXT: sub sp, r7, #4 -; CHECK-O0-NEXT: pop {r10} -; CHECK-O0-NEXT: pop {r7, pc} +; CHECK-O0-NEXT: mov r8, #0 +; CHECK-O0-NEXT: mov r0, #1 +; CHECK-O0-NEXT: mov r1, #2 +; CHECK-O0-NEXT: mov r2, #3 +; CHECK-O0-NEXT: mov r3, #4 +; CHECK-O0-NEXT: mov r10, r8 +; CHECK-O0-NEXT: bl _params_in_reg2 +; CHECK-O0-NEXT: ldr r10, [sp] @ 4-byte Reload +; CHECK-O0-NEXT: ldr r0, [sp, #4] @ 4-byte Reload +; CHECK-O0-NEXT: ldr r1, [sp, #8] @ 4-byte Reload +; CHECK-O0-NEXT: ldr r2, [sp, #12] @ 4-byte Reload +; CHECK-O0-NEXT: ldr r3, [sp, #16] @ 4-byte Reload +; CHECK-O0-NEXT: mov r9, r8 +; CHECK-O0-NEXT: ldr r8, [sp, #20] @ 4-byte Reload +; CHECK-O0-NEXT: bl _params_in_reg2 +; CHECK-O0-NEXT: sub sp, r7, #4 +; CHECK-O0-NEXT: pop {r7, r10, pc} ; ; CHECK-ANDROID-LABEL: params_in_reg: ; CHECK-ANDROID: @ %bb.0: @@ -1165,65 +1153,63 @@ define swiftcc { i32, i32, i32, i32} @params_and_return_in_reg(i32, i32, i32, i3 ; ; CHECK-O0-LABEL: params_and_return_in_reg: ; CHECK-O0: @ %bb.0: -; CHECK-O0-NEXT: push {r7, lr} -; CHECK-O0-NEXT: mov r7, sp -; CHECK-O0-NEXT: push {r10} -; CHECK-O0-NEXT: sub sp, sp, #76 -; CHECK-O0-NEXT: bfc sp, #0, #3 -; CHECK-O0-NEXT: str r8, [sp, #24] @ 4-byte Spill -; CHECK-O0-NEXT: str r10, [sp, #4] @ 4-byte Spill -; CHECK-O0-NEXT: str r3, [sp, #20] @ 4-byte Spill -; CHECK-O0-NEXT: str r2, [sp, #16] @ 4-byte Spill -; CHECK-O0-NEXT: str r1, [sp, #12] @ 4-byte Spill -; CHECK-O0-NEXT: str r0, [sp, #8] @ 4-byte Spill +; CHECK-O0-NEXT: push {r7, r10, lr} +; CHECK-O0-NEXT: add r7, sp, #4 +; CHECK-O0-NEXT: sub sp, sp, #76 +; CHECK-O0-NEXT: bfc sp, #0, #3 +; CHECK-O0-NEXT: str r8, [sp, #24] @ 4-byte Spill +; CHECK-O0-NEXT: str r10, [sp, #4] @ 4-byte Spill +; CHECK-O0-NEXT: str r3, [sp, #20] @ 4-byte Spill +; CHECK-O0-NEXT: str r2, [sp, #16] @ 4-byte Spill +; CHECK-O0-NEXT: str r1, [sp, #12] @ 4-byte Spill +; CHECK-O0-NEXT: str r0, [sp, #8] @ 4-byte Spill ; CHECK-O0-NEXT: @ implicit-def: $r0 -; CHECK-O0-NEXT: mov r8, #0 -; CHECK-O0-NEXT: str r8, [sp, #28] @ 4-byte Spill -; CHECK-O0-NEXT: mov r0, #1 -; CHECK-O0-NEXT: str r0, [sp, #32] @ 4-byte Spill -; CHECK-O0-NEXT: mov r1, #2 -; CHECK-O0-NEXT: str r1, [sp, #36] @ 4-byte Spill -; CHECK-O0-NEXT: mov r2, #3 -; CHECK-O0-NEXT: str r2, [sp, #40] @ 4-byte Spill -; CHECK-O0-NEXT: mov r3, #4 -; CHECK-O0-NEXT: str r3, [sp, #44] @ 4-byte Spill -; CHECK-O0-NEXT: mov r10, r8 -; CHECK-O0-NEXT: bl _params_in_reg2 -; CHECK-O0-NEXT: ldr r10, [sp, #4] @ 4-byte Reload -; CHECK-O0-NEXT: ldr r0, [sp, #8] @ 4-byte Reload -; CHECK-O0-NEXT: ldr r1, [sp, #12] @ 4-byte Reload -; CHECK-O0-NEXT: 
ldr r2, [sp, #16] @ 4-byte Reload -; CHECK-O0-NEXT: ldr r3, [sp, #20] @ 4-byte Reload -; CHECK-O0-NEXT: mov r9, r8 -; CHECK-O0-NEXT: ldr r8, [sp, #24] @ 4-byte Reload -; CHECK-O0-NEXT: str r9, [sp, #48] @ 4-byte Spill -; CHECK-O0-NEXT: bl _params_and_return_in_reg2 -; CHECK-O0-NEXT: ldr r10, [sp, #28] @ 4-byte Reload -; CHECK-O0-NEXT: mov r9, r0 -; CHECK-O0-NEXT: ldr r0, [sp, #32] @ 4-byte Reload -; CHECK-O0-NEXT: str r9, [sp, #52] @ 4-byte Spill -; CHECK-O0-NEXT: mov r9, r1 -; CHECK-O0-NEXT: ldr r1, [sp, #36] @ 4-byte Reload -; CHECK-O0-NEXT: str r9, [sp, #56] @ 4-byte Spill -; CHECK-O0-NEXT: mov r9, r2 -; CHECK-O0-NEXT: ldr r2, [sp, #40] @ 4-byte Reload -; CHECK-O0-NEXT: str r9, [sp, #60] @ 4-byte Spill -; CHECK-O0-NEXT: mov r9, r3 -; CHECK-O0-NEXT: ldr r3, [sp, #44] @ 4-byte Reload -; CHECK-O0-NEXT: str r9, [sp, #64] @ 4-byte Spill -; CHECK-O0-NEXT: mov r9, r8 -; CHECK-O0-NEXT: ldr r8, [sp, #48] @ 4-byte Reload -; CHECK-O0-NEXT: str r9, [sp, #68] @ 4-byte Spill -; CHECK-O0-NEXT: bl _params_in_reg2 -; CHECK-O0-NEXT: ldr r0, [sp, #52] @ 4-byte Reload -; CHECK-O0-NEXT: ldr r1, [sp, #56] @ 4-byte Reload -; CHECK-O0-NEXT: ldr r2, [sp, #60] @ 4-byte Reload -; CHECK-O0-NEXT: ldr r3, [sp, #64] @ 4-byte Reload -; CHECK-O0-NEXT: mov r9, r8 -; CHECK-O0-NEXT: ldr r8, [sp, #68] @ 4-byte Reload -; CHECK-O0-NEXT: sub sp, r7, #4 -; CHECK-O0-NEXT: pop {r10} -; CHECK-O0-NEXT: pop {r7, pc} +; CHECK-O0-NEXT: mov r8, #0 +; CHECK-O0-NEXT: str r8, [sp, #28] @ 4-byte Spill +; CHECK-O0-NEXT: mov r0, #1 +; CHECK-O0-NEXT: str r0, [sp, #32] @ 4-byte Spill +; CHECK-O0-NEXT: mov r1, #2 +; CHECK-O0-NEXT: str r1, [sp, #36] @ 4-byte Spill +; CHECK-O0-NEXT: mov r2, #3 +; CHECK-O0-NEXT: str r2, [sp, #40] @ 4-byte Spill +; CHECK-O0-NEXT: mov r3, #4 +; CHECK-O0-NEXT: str r3, [sp, #44] @ 4-byte Spill +; CHECK-O0-NEXT: mov r10, r8 +; CHECK-O0-NEXT: bl _params_in_reg2 +; CHECK-O0-NEXT: ldr r10, [sp, #4] @ 4-byte Reload +; CHECK-O0-NEXT: ldr r0, [sp, #8] @ 4-byte Reload +; CHECK-O0-NEXT: ldr r1, [sp, #12] @ 4-byte Reload +; CHECK-O0-NEXT: ldr r2, [sp, #16] @ 4-byte Reload +; CHECK-O0-NEXT: ldr r3, [sp, #20] @ 4-byte Reload +; CHECK-O0-NEXT: mov r9, r8 +; CHECK-O0-NEXT: ldr r8, [sp, #24] @ 4-byte Reload +; CHECK-O0-NEXT: str r9, [sp, #48] @ 4-byte Spill +; CHECK-O0-NEXT: bl _params_and_return_in_reg2 +; CHECK-O0-NEXT: ldr r10, [sp, #28] @ 4-byte Reload +; CHECK-O0-NEXT: mov r9, r0 +; CHECK-O0-NEXT: ldr r0, [sp, #32] @ 4-byte Reload +; CHECK-O0-NEXT: str r9, [sp, #52] @ 4-byte Spill +; CHECK-O0-NEXT: mov r9, r1 +; CHECK-O0-NEXT: ldr r1, [sp, #36] @ 4-byte Reload +; CHECK-O0-NEXT: str r9, [sp, #56] @ 4-byte Spill +; CHECK-O0-NEXT: mov r9, r2 +; CHECK-O0-NEXT: ldr r2, [sp, #40] @ 4-byte Reload +; CHECK-O0-NEXT: str r9, [sp, #60] @ 4-byte Spill +; CHECK-O0-NEXT: mov r9, r3 +; CHECK-O0-NEXT: ldr r3, [sp, #44] @ 4-byte Reload +; CHECK-O0-NEXT: str r9, [sp, #64] @ 4-byte Spill +; CHECK-O0-NEXT: mov r9, r8 +; CHECK-O0-NEXT: ldr r8, [sp, #48] @ 4-byte Reload +; CHECK-O0-NEXT: str r9, [sp, #68] @ 4-byte Spill +; CHECK-O0-NEXT: bl _params_in_reg2 +; CHECK-O0-NEXT: ldr r0, [sp, #52] @ 4-byte Reload +; CHECK-O0-NEXT: ldr r1, [sp, #56] @ 4-byte Reload +; CHECK-O0-NEXT: ldr r2, [sp, #60] @ 4-byte Reload +; CHECK-O0-NEXT: ldr r3, [sp, #64] @ 4-byte Reload +; CHECK-O0-NEXT: mov r9, r8 +; CHECK-O0-NEXT: ldr r8, [sp, #68] @ 4-byte Reload +; CHECK-O0-NEXT: sub sp, r7, #4 +; CHECK-O0-NEXT: pop {r7, r10, pc} ; ; CHECK-ANDROID-LABEL: params_and_return_in_reg: ; CHECK-ANDROID: @ %bb.0: @@ -1339,19 +1325,17 @@ define swiftcc ptr @testAssign(ptr 
%error_ref) { ; ; CHECK-O0-LABEL: testAssign: ; CHECK-O0: @ %bb.0: @ %entry -; CHECK-O0-NEXT: push {r7, lr} -; CHECK-O0-NEXT: mov r7, sp -; CHECK-O0-NEXT: push {r8} -; CHECK-O0-NEXT: sub sp, sp, #8 +; CHECK-O0-NEXT: push {r7, r8, lr} +; CHECK-O0-NEXT: add r7, sp, #4 +; CHECK-O0-NEXT: sub sp, sp, #8 ; CHECK-O0-NEXT: @ implicit-def: $r1 -; CHECK-O0-NEXT: mov r8, #0 -; CHECK-O0-NEXT: bl _foo2 -; CHECK-O0-NEXT: str r8, [sp] @ 4-byte Spill +; CHECK-O0-NEXT: mov r8, #0 +; CHECK-O0-NEXT: bl _foo2 +; CHECK-O0-NEXT: str r8, [sp] @ 4-byte Spill ; CHECK-O0-NEXT: @ %bb.1: @ %a ; CHECK-O0-NEXT: ldr r0, [sp] @ 4-byte Reload ; CHECK-O0-NEXT: sub sp, r7, #4 -; CHECK-O0-NEXT: pop {r8} -; CHECK-O0-NEXT: pop {r7, pc} +; CHECK-O0-NEXT: pop {r7, r8, pc} ; ; CHECK-ANDROID-LABEL: testAssign: ; CHECK-ANDROID: @ %bb.0: @ %entry diff --git a/llvm/test/CodeGen/ARM/v7k-abi-align.ll b/llvm/test/CodeGen/ARM/v7k-abi-align.ll index 20c7aea5dcbe6bbaad9de201b30bc326fb275c45..b27c4354f432a1fe1bc86e784a08702f973dd5fc 100644 --- a/llvm/test/CodeGen/ARM/v7k-abi-align.ll +++ b/llvm/test/CodeGen/ARM/v7k-abi-align.ll @@ -117,7 +117,7 @@ define void @test_dpr_unwind_align_no_dprs() "frame-pointer"="all" { ; 128-bit vectors should use 128-bit (i.e. correctly aligned) slots on ; the stack. -define <4 x float> @test_v128_stack_pass([8 x double], float, <4 x float> %in) "frame-pointer"="all" { +define <4 x float> @test_v128_stack_pass([8 x double], float, <4 x float> %in) "frame-pointer"="none" { ; CHECK-LABEL: test_v128_stack_pass: ; CHECK: add r[[ADDR:[0-9]+]], sp, #16 ; CHECK: vld1.64 {d0, d1}, [r[[ADDR]]:128] @@ -140,7 +140,7 @@ define void @test_v128_stack_pass_varargs(<4 x float> %in) "frame-pointer"="all" ; To be compatible with AAPCS's va_start model (store r0-r3 at incoming SP, give ; a single pointer), 64-bit quantities must be pass -define i64 @test_64bit_gpr_align(i32, i64 %r2_r3, i32 %sp) "frame-pointer"="all" { +define i64 @test_64bit_gpr_align(i32, i64 %r2_r3, i32 %sp) "frame-pointer"="none" { ; CHECK-LABEL: test_64bit_gpr_align: ; CHECK: ldr [[RHS:r[0-9]+]], [sp] ; CHECK: adds r0, [[RHS]], r2 diff --git a/llvm/test/CodeGen/AVR/naked-fn-with-frame-pointer.ll b/llvm/test/CodeGen/AVR/naked-fn-with-frame-pointer.ll new file mode 100644 index 0000000000000000000000000000000000000000..18ea60906bd0cd93b950ae803dd2ad2103b4d139 --- /dev/null +++ b/llvm/test/CodeGen/AVR/naked-fn-with-frame-pointer.ll @@ -0,0 +1,20 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple avr | FileCheck %s -check-prefixes=CHECK + +declare dso_local void @main() + +define dso_local void @naked() naked "frame-pointer"="all" { +; CHECK-LABEL: naked: +; CHECK: ; %bb.0: +; CHECK-NEXT: rcall main + call void @main() + unreachable +} + +define dso_local void @normal() "frame-pointer"="all" { +; CHECK-LABEL: normal: +; CHECK: ; %bb.0: +; CHECK-NEXT: rcall main + call void @main() + unreachable +} diff --git a/llvm/test/CodeGen/BPF/naked-fn-with-frame-pointer.ll b/llvm/test/CodeGen/BPF/naked-fn-with-frame-pointer.ll new file mode 100644 index 0000000000000000000000000000000000000000..4e4436296f3b56e427a2f0768440ef44ed50e430 --- /dev/null +++ b/llvm/test/CodeGen/BPF/naked-fn-with-frame-pointer.ll @@ -0,0 +1,41 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple bpfel | FileCheck %s -check-prefixes=CHECK-LE +; RUN: llc < %s -mtriple bpfeb | FileCheck %s -check-prefixes=CHECK-BE + +declare dso_local void 
@main() + +define dso_local void @naked() naked "frame-pointer"="all" { +; CHECK-LE-LABEL: naked: +; CHECK-LE: .Lnaked$local: +; CHECK-LE-NEXT: .type .Lnaked$local,@function +; CHECK-LE-NEXT: .cfi_startproc +; CHECK-LE-NEXT: # %bb.0: +; CHECK-LE-NEXT: call main +; +; CHECK-BE-LABEL: naked: +; CHECK-BE: .Lnaked$local: +; CHECK-BE-NEXT: .type .Lnaked$local,@function +; CHECK-BE-NEXT: .cfi_startproc +; CHECK-BE-NEXT: # %bb.0: +; CHECK-BE-NEXT: call main + call void @main() + unreachable +} + +define dso_local void @normal() "frame-pointer"="all" { +; CHECK-LE-LABEL: normal: +; CHECK-LE: .Lnormal$local: +; CHECK-LE-NEXT: .type .Lnormal$local,@function +; CHECK-LE-NEXT: .cfi_startproc +; CHECK-LE-NEXT: # %bb.0: +; CHECK-LE-NEXT: call main +; +; CHECK-BE-LABEL: normal: +; CHECK-BE: .Lnormal$local: +; CHECK-BE-NEXT: .type .Lnormal$local,@function +; CHECK-BE-NEXT: .cfi_startproc +; CHECK-BE-NEXT: # %bb.0: +; CHECK-BE-NEXT: call main + call void @main() + unreachable +} diff --git a/llvm/test/CodeGen/CSKY/naked-fn-with-frame-pointer.ll b/llvm/test/CodeGen/CSKY/naked-fn-with-frame-pointer.ll new file mode 100644 index 0000000000000000000000000000000000000000..e897127eb31cddc164a2ba6c0f3ef6e18ee2a1a4 --- /dev/null +++ b/llvm/test/CodeGen/CSKY/naked-fn-with-frame-pointer.ll @@ -0,0 +1,41 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple csky | FileCheck %s -check-prefixes=CHECK + +declare dso_local void @main() + +define dso_local void @naked() naked "frame-pointer"="all" { +; CHECK-LABEL: naked: +; CHECK: # %bb.0: +; CHECK-NEXT: lrw a0, [.LCPI0_0] +; CHECK-NEXT: jsr16 a0 +; CHECK-NEXT: .p2align 1 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: .p2align 2, 0x0 +; CHECK-NEXT: .LCPI0_0: +; CHECK-NEXT: .long main + call void @main() + unreachable +} + +define dso_local void @normal() "frame-pointer"="all" { +; CHECK-LABEL: normal: +; CHECK: # %bb.0: +; CHECK-NEXT: subi16 sp, sp, 8 +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: st32.w lr, (sp, 4) # 4-byte Folded Spill +; CHECK-NEXT: st32.w l4, (sp, 0) # 4-byte Folded Spill +; CHECK-NEXT: .cfi_offset lr, -4 +; CHECK-NEXT: .cfi_offset l4, -8 +; CHECK-NEXT: mov16 l4, sp +; CHECK-NEXT: .cfi_def_cfa_register l4 +; CHECK-NEXT: subi16 sp, sp, 4 +; CHECK-NEXT: lrw a0, [.LCPI1_0] +; CHECK-NEXT: jsr16 a0 +; CHECK-NEXT: .p2align 1 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: .p2align 2, 0x0 +; CHECK-NEXT: .LCPI1_0: +; CHECK-NEXT: .long main + call void @main() + unreachable +} diff --git a/llvm/test/CodeGen/DirectX/ShaderFlags/double-extensions.ll b/llvm/test/CodeGen/DirectX/ShaderFlags/double-extensions.ll index d027216e4213d546a124d92d0d8fe5a7d2b5e341..a8d5f9c78f0b4335593245614a0058cece89e03d 100644 --- a/llvm/test/CodeGen/DirectX/ShaderFlags/double-extensions.ll +++ b/llvm/test/CodeGen/DirectX/ShaderFlags/double-extensions.ll @@ -9,11 +9,12 @@ target triple = "dxil-pc-shadermodel6.7-library" ; CHECK-NEXT: ; Double-precision extensions for 11.1 ; CHECK-NEXT: ; Note: extra DXIL module flags: ; CHECK-NEXT: {{^;$}} -define double @div(double %a, double %b) { +define double @div(double %a, double %b) #0 { %res = fdiv double %a, %b ret double %res } +attributes #0 = { convergent norecurse nounwind "hlsl.export"} ; DXC: - Name: SFI0 ; DXC-NEXT: Size: 8 diff --git a/llvm/test/CodeGen/DirectX/ShaderFlags/doubles.ll b/llvm/test/CodeGen/DirectX/ShaderFlags/doubles.ll index c1a4c219a169515a4deb8cf4b99dff83cd195f64..e9b44240e10b9b5c062ef5b1aa4eb1bf2f5d6e5b 100644 --- 
a/llvm/test/CodeGen/DirectX/ShaderFlags/doubles.ll +++ b/llvm/test/CodeGen/DirectX/ShaderFlags/doubles.ll @@ -9,11 +9,13 @@ target triple = "dxil-pc-shadermodel6.7-library" ; CHECK-NEXT: ; Note: extra DXIL module flags: ; CHECK-NEXT: {{^;$}} -define double @add(double %a, double %b) { +define double @add(double %a, double %b) #0 { %sum = fadd double %a, %b ret double %sum } +attributes #0 = { convergent norecurse nounwind "hlsl.export"} + ; DXC: - Name: SFI0 ; DXC-NEXT: Size: 8 ; DXC-NEXT: Flags: diff --git a/llvm/test/CodeGen/DirectX/WaveReadLaneAt-vec.ll b/llvm/test/CodeGen/DirectX/WaveReadLaneAt-vec.ll new file mode 100644 index 0000000000000000000000000000000000000000..8c2a11a3557af3cfdaa5d60d6e067d82abbed940 --- /dev/null +++ b/llvm/test/CodeGen/DirectX/WaveReadLaneAt-vec.ll @@ -0,0 +1,35 @@ +; RUN: opt -S -scalarizer -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-compute %s | FileCheck %s + +; Test that for vector values, WaveReadLaneAt scalarizes and maps down to the +; DirectX op + +define noundef <2 x half> @wave_read_lane_v2half(<2 x half> noundef %expr, i32 %idx) { +entry: +; CHECK: call half @dx.op.waveReadLaneAt.f16(i32 117, half %expr.i0, i32 %idx) +; CHECK: call half @dx.op.waveReadLaneAt.f16(i32 117, half %expr.i1, i32 %idx) + %ret = call <2 x half> @llvm.dx.wave.readlane.f16(<2 x half> %expr, i32 %idx) + ret <2 x half> %ret +} + +define noundef <3 x i32> @wave_read_lane_v3i32(<3 x i32> noundef %expr, i32 %idx) { +entry: +; CHECK: call i32 @dx.op.waveReadLaneAt.i32(i32 117, i32 %expr.i0, i32 %idx) +; CHECK: call i32 @dx.op.waveReadLaneAt.i32(i32 117, i32 %expr.i1, i32 %idx) +; CHECK: call i32 @dx.op.waveReadLaneAt.i32(i32 117, i32 %expr.i2, i32 %idx) + %ret = call <3 x i32> @llvm.dx.wave.readlane(<3 x i32> %expr, i32 %idx) + ret <3 x i32> %ret +} + +define noundef <4 x double> @wave_read_lane_v4f64(<4 x double> noundef %expr, i32 %idx) { +entry: +; CHECK: call double @dx.op.waveReadLaneAt.f64(i32 117, double %expr.i0, i32 %idx) +; CHECK: call double @dx.op.waveReadLaneAt.f64(i32 117, double %expr.i1, i32 %idx) +; CHECK: call double @dx.op.waveReadLaneAt.f64(i32 117, double %expr.i2, i32 %idx) +; CHECK: call double @dx.op.waveReadLaneAt.f64(i32 117, double %expr.i3, i32 %idx) + %ret = call <4 x double> @llvm.dx.wave.readlane(<4 x double> %expr, i32 %idx) + ret <4 x double> %ret +} + +declare <2 x half> @llvm.dx.wave.readlane.v2f16(<2 x half>, i32) +declare <3 x i32> @llvm.dx.wave.readlane.v3i32(<3 x i32>, i32) +declare <4 x double> @llvm.dx.wave.readlane.v4f64(<4 x double>, i32) diff --git a/llvm/test/CodeGen/DirectX/WaveReadLaneAt.ll b/llvm/test/CodeGen/DirectX/WaveReadLaneAt.ll new file mode 100644 index 0000000000000000000000000000000000000000..0024ba66c0cad82274d8e4db57b38ebfcaf59862 --- /dev/null +++ b/llvm/test/CodeGen/DirectX/WaveReadLaneAt.ll @@ -0,0 +1,61 @@ +; RUN: opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-compute %s | FileCheck %s + +; Test that for scalar values, WaveReadLaneAt maps down to the DirectX op + +define noundef half @wave_rla_half(half noundef %expr, i32 noundef %idx) { +entry: +; CHECK: call half @dx.op.waveReadLaneAt.f16(i32 117, half %expr, i32 %idx) + %ret = call half @llvm.dx.wave.readlane.f16(half %expr, i32 %idx) + ret half %ret +} + +define noundef float @wave_rla_float(float noundef %expr, i32 noundef %idx) { +entry: +; CHECK: call float @dx.op.waveReadLaneAt.f32(i32 117, float %expr, i32 %idx) + %ret = call float @llvm.dx.wave.readlane(float %expr, i32 %idx) + ret float %ret +} + +define noundef double @wave_rla_double(double 
noundef %expr, i32 noundef %idx) { +entry: +; CHECK: call double @dx.op.waveReadLaneAt.f64(i32 117, double %expr, i32 %idx) + %ret = call double @llvm.dx.wave.readlane(double %expr, i32 %idx) + ret double %ret +} + +define noundef i1 @wave_rla_i1(i1 noundef %expr, i32 noundef %idx) { +entry: +; CHECK: call i1 @dx.op.waveReadLaneAt.i1(i32 117, i1 %expr, i32 %idx) + %ret = call i1 @llvm.dx.wave.readlane.i1(i1 %expr, i32 %idx) + ret i1 %ret +} + +define noundef i16 @wave_rla_i16(i16 noundef %expr, i32 noundef %idx) { +entry: +; CHECK: call i16 @dx.op.waveReadLaneAt.i16(i32 117, i16 %expr, i32 %idx) + %ret = call i16 @llvm.dx.wave.readlane.i16(i16 %expr, i32 %idx) + ret i16 %ret +} + +define noundef i32 @wave_rla_i32(i32 noundef %expr, i32 noundef %idx) { +entry: +; CHECK: call i32 @dx.op.waveReadLaneAt.i32(i32 117, i32 %expr, i32 %idx) + %ret = call i32 @llvm.dx.wave.readlane.i32(i32 %expr, i32 %idx) + ret i32 %ret +} + +define noundef i64 @wave_rla_i64(i64 noundef %expr, i32 noundef %idx) { +entry: +; CHECK: call i64 @dx.op.waveReadLaneAt.i64(i32 117, i64 %expr, i32 %idx) + %ret = call i64 @llvm.dx.wave.readlane.i64(i64 %expr, i32 %idx) + ret i64 %ret +} + +declare half @llvm.dx.wave.readlane.f16(half, i32) +declare float @llvm.dx.wave.readlane.f32(float, i32) +declare double @llvm.dx.wave.readlane.f64(double, i32) + +declare i1 @llvm.dx.wave.readlane.i1(i1, i32) +declare i16 @llvm.dx.wave.readlane.i16(i16, i32) +declare i32 @llvm.dx.wave.readlane.i32(i32, i32) +declare i64 @llvm.dx.wave.readlane.i64(i64, i32) diff --git a/llvm/test/CodeGen/DirectX/atan2.ll b/llvm/test/CodeGen/DirectX/atan2.ll index 9d86f87f3ed50eb5d383755adfea87707f1f291b..b2c650d1162655ceac2b0d22cf1bcab6e2bf5b46 100644 --- a/llvm/test/CodeGen/DirectX/atan2.ll +++ b/llvm/test/CodeGen/DirectX/atan2.ll @@ -1,87 +1,87 @@ -; RUN: opt -S -dxil-intrinsic-expansion -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s --check-prefixes=CHECK,EXPCHECK -; RUN: opt -S -dxil-intrinsic-expansion -scalarizer -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s --check-prefixes=CHECK,DOPCHECK - -; Make sure correct dxil expansions for atan2 are generated for float and half. 
- -define noundef float @atan2_float(float noundef %y, float noundef %x) { -entry: -; CHECK: [[DIV:%.+]] = fdiv float %y, %x -; EXPCHECK: [[ATAN:%.+]] = call float @llvm.atan.f32(float [[DIV]]) -; DOPCHECK: [[ATAN:%.+]] = call float @dx.op.unary.f32(i32 17, float [[DIV]]) -; CHECK-DAG: [[ADD_PI:%.+]] = fadd float [[ATAN]], 0x400921FB60000000 -; CHECK-DAG: [[SUB_PI:%.+]] = fsub float [[ATAN]], 0x400921FB60000000 -; CHECK-DAG: [[X_LT_0:%.+]] = fcmp olt float %x, 0.000000e+00 -; CHECK-DAG: [[X_EQ_0:%.+]] = fcmp oeq float %x, 0.000000e+00 -; CHECK-DAG: [[Y_GE_0:%.+]] = fcmp oge float %y, 0.000000e+00 -; CHECK-DAG: [[Y_LT_0:%.+]] = fcmp olt float %y, 0.000000e+00 -; CHECK: [[XLT0_AND_YGE0:%.+]] = and i1 [[X_LT_0]], [[Y_GE_0]] -; CHECK: [[SELECT_ADD_PI:%.+]] = select i1 [[XLT0_AND_YGE0]], float [[ADD_PI]], float [[ATAN]] -; CHECK: [[XLT0_AND_YLT0:%.+]] = and i1 [[X_LT_0]], [[Y_LT_0]] -; CHECK: [[SELECT_SUB_PI:%.+]] = select i1 [[XLT0_AND_YLT0]], float [[SUB_PI]], float [[SELECT_ADD_PI]] -; CHECK: [[XEQ0_AND_YLT0:%.+]] = and i1 [[X_EQ_0]], [[Y_LT_0]] -; CHECK: [[SELECT_NEGHPI:%.+]] = select i1 [[XEQ0_AND_YLT0]], float 0xBFF921FB60000000, float [[SELECT_SUB_PI]] -; CHECK: [[XEQ0_AND_YGE0:%.+]] = and i1 [[X_EQ_0]], [[Y_GE_0]] -; CHECK: [[SELECT_HPI:%.+]] = select i1 [[XEQ0_AND_YGE0]], float 0x3FF921FB60000000, float [[SELECT_NEGHPI]] -; CHECK: ret float [[SELECT_HPI]] - %elt.atan2 = call float @llvm.atan2.f32(float %y, float %x) - ret float %elt.atan2 -} - -define noundef half @atan2_half(half noundef %y, half noundef %x) { -entry: -; CHECK: [[DIV:%.+]] = fdiv half %y, %x -; EXPCHECK: [[ATAN:%.+]] = call half @llvm.atan.f16(half [[DIV]]) -; DOPCHECK: [[ATAN:%.+]] = call half @dx.op.unary.f16(i32 17, half [[DIV]]) -; CHECK-DAG: [[ADD_PI:%.+]] = fadd half [[ATAN]], 0xH4248 -; CHECK-DAG: [[SUB_PI:%.+]] = fsub half [[ATAN]], 0xH4248 -; CHECK-DAG: [[X_LT_0:%.+]] = fcmp olt half %x, 0xH0000 -; CHECK-DAG: [[X_EQ_0:%.+]] = fcmp oeq half %x, 0xH0000 -; CHECK-DAG: [[Y_GE_0:%.+]] = fcmp oge half %y, 0xH0000 -; CHECK-DAG: [[Y_LT_0:%.+]] = fcmp olt half %y, 0xH0000 -; CHECK: [[XLT0_AND_YGE0:%.+]] = and i1 [[X_LT_0]], [[Y_GE_0]] -; CHECK: [[SELECT_ADD_PI:%.+]] = select i1 [[XLT0_AND_YGE0]], half [[ADD_PI]], half [[ATAN]] -; CHECK: [[XLT0_AND_YLT0:%.+]] = and i1 [[X_LT_0]], [[Y_LT_0]] -; CHECK: [[SELECT_SUB_PI:%.+]] = select i1 [[XLT0_AND_YLT0]], half [[SUB_PI]], half [[SELECT_ADD_PI]] -; CHECK: [[XEQ0_AND_YLT0:%.+]] = and i1 [[X_EQ_0]], [[Y_LT_0]] -; CHECK: [[SELECT_NEGHPI:%.+]] = select i1 [[XEQ0_AND_YLT0]], half 0xHBE48, half [[SELECT_SUB_PI]] -; CHECK: [[XEQ0_AND_YGE0:%.+]] = and i1 [[X_EQ_0]], [[Y_GE_0]] -; CHECK: [[SELECT_HPI:%.+]] = select i1 [[XEQ0_AND_YGE0]], half 0xH3E48, half [[SELECT_NEGHPI]] -; CHECK: ret half [[SELECT_HPI]] - %elt.atan2 = call half @llvm.atan2.f16(half %y, half %x) - ret half %elt.atan2 -} - -define noundef <4 x float> @atan2_float4(<4 x float> noundef %y, <4 x float> noundef %x) { -entry: -; Just Expansion, no scalarization or lowering: -; EXPCHECK: [[DIV:%.+]] = fdiv <4 x float> %y, %x -; EXPCHECK: [[ATAN:%.+]] = call <4 x float> @llvm.atan.v4f32(<4 x float> [[DIV]]) -; EXPCHECK-DAG: [[ADD_PI:%.+]] = fadd <4 x float> [[ATAN]], <float 0x400921FB60000000, float 0x400921FB60000000, float 0x400921FB60000000, float 0x400921FB60000000> -; EXPCHECK-DAG: [[SUB_PI:%.+]] = fsub <4 x float> [[ATAN]], <float 0x400921FB60000000, float 0x400921FB60000000, float 0x400921FB60000000, float 0x400921FB60000000> -; EXPCHECK-DAG: [[X_LT_0:%.+]] = fcmp olt <4 x float> %x, zeroinitializer -; EXPCHECK-DAG: [[X_EQ_0:%.+]] = fcmp oeq <4 x float> %x, zeroinitializer -; EXPCHECK-DAG: [[Y_GE_0:%.+]] = fcmp oge <4 x float> %y, zeroinitializer -; EXPCHECK-DAG: [[Y_LT_0:%.+]] = fcmp olt <4 x float> %y, zeroinitializer 
-; EXPCHECK: [[XLT0_AND_YGE0:%.+]] = and <4 x i1> [[X_LT_0]], [[Y_GE_0]] -; EXPCHECK: [[SELECT_ADD_PI:%.+]] = select <4 x i1> [[XLT0_AND_YGE0]], <4 x float> [[ADD_PI]], <4 x float> [[ATAN]] -; EXPCHECK: [[XLT0_AND_YLT0:%.+]] = and <4 x i1> [[X_LT_0]], [[Y_LT_0]] -; EXPCHECK: [[SELECT_SUB_PI:%.+]] = select <4 x i1> [[XLT0_AND_YLT0]], <4 x float> [[SUB_PI]], <4 x float> [[SELECT_ADD_PI]] -; EXPCHECK: [[XEQ0_AND_YLT0:%.+]] = and <4 x i1> [[X_EQ_0]], [[Y_LT_0]] -; EXPCHECK: [[SELECT_NEGHPI:%.+]] = select <4 x i1> [[XEQ0_AND_YLT0]], <4 x float> <float 0xBFF921FB60000000, float 0xBFF921FB60000000, float 0xBFF921FB60000000, float 0xBFF921FB60000000>, <4 x float> [[SELECT_SUB_PI]] -; EXPCHECK: [[XEQ0_AND_YGE0:%.+]] = and <4 x i1> [[X_EQ_0]], [[Y_GE_0]] -; EXPCHECK: [[SELECT_HPI:%.+]] = select <4 x i1> [[XEQ0_AND_YGE0]], <4 x float> <float 0x3FF921FB60000000, float 0x3FF921FB60000000, float 0x3FF921FB60000000, float 0x3FF921FB60000000>, <4 x float> [[SELECT_NEGHPI]] -; EXPCHECK: ret <4 x float> [[SELECT_HPI]] - -; Scalarization occurs after expansion, so atan scalarization is tested separately. -; Expansion, scalarization and lowering: -; Just make sure this expands to exactly 4 scalar DXIL atan (OpCode=17) calls. -; DOPCHECK-COUNT-4: call float @dx.op.unary.f32(i32 17, float %{{.*}}) -; DOPCHECK-NOT: call float @dx.op.unary.f32(i32 17, - - %elt.atan2 = call <4 x float> @llvm.atan2.v4f32(<4 x float> %y, <4 x float> %x) - ret <4 x float> %elt.atan2 -} - -declare half @llvm.atan2.f16(half, half) -declare float @llvm.atan2.f32(float, float) -declare <4 x float> @llvm.atan2.v4f32(<4 x float>, <4 x float>) +; RUN: opt -S -dxil-intrinsic-expansion -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s --check-prefixes=CHECK,EXPCHECK +; RUN: opt -S -dxil-intrinsic-expansion -scalarizer -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s --check-prefixes=CHECK,DOPCHECK + +; Make sure correct dxil expansions for atan2 are generated for float and half. 
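; The checks that follow verify a standard quadrant fix-up: compute atan(y/x),
; then patch the result where x < 0 or x == 0. A minimal standalone sketch of
; that pattern for float (the function name is hypothetical; the real code is
; emitted by the -dxil-intrinsic-expansion pass and matched by the CHECK lines):
define float @atan2_expansion_sketch(float %y, float %x) {
entry:
  %div = fdiv float %y, %x
  %atan = call float @llvm.atan.f32(float %div)                    ; atan(y/x)
  %add.pi = fadd float %atan, 0x400921FB60000000                   ; atan + pi
  %sub.pi = fsub float %atan, 0x400921FB60000000                   ; atan - pi
  %x.lt.0 = fcmp olt float %x, 0.000000e+00
  %x.eq.0 = fcmp oeq float %x, 0.000000e+00
  %y.ge.0 = fcmp oge float %y, 0.000000e+00
  %y.lt.0 = fcmp olt float %y, 0.000000e+00
  %c.q2 = and i1 %x.lt.0, %y.ge.0
  %r.q2 = select i1 %c.q2, float %add.pi, float %atan              ; x < 0, y >= 0: atan + pi
  %c.q3 = and i1 %x.lt.0, %y.lt.0
  %r.q3 = select i1 %c.q3, float %sub.pi, float %r.q2              ; x < 0, y < 0: atan - pi
  %c.neg = and i1 %x.eq.0, %y.lt.0
  %r.neg = select i1 %c.neg, float 0xBFF921FB60000000, float %r.q3 ; x == 0, y < 0: -pi/2
  %c.pos = and i1 %x.eq.0, %y.ge.0
  %res = select i1 %c.pos, float 0x3FF921FB60000000, float %r.neg  ; x == 0, y >= 0: +pi/2
  ret float %res
}
declare float @llvm.atan.f32(float)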
+ +define noundef float @atan2_float(float noundef %y, float noundef %x) { +entry: +; CHECK: [[DIV:%.+]] = fdiv float %y, %x +; EXPCHECK: [[ATAN:%.+]] = call float @llvm.atan.f32(float [[DIV]]) +; DOPCHECK: [[ATAN:%.+]] = call float @dx.op.unary.f32(i32 17, float [[DIV]]) +; CHECK-DAG: [[ADD_PI:%.+]] = fadd float [[ATAN]], 0x400921FB60000000 +; CHECK-DAG: [[SUB_PI:%.+]] = fsub float [[ATAN]], 0x400921FB60000000 +; CHECK-DAG: [[X_LT_0:%.+]] = fcmp olt float %x, 0.000000e+00 +; CHECK-DAG: [[X_EQ_0:%.+]] = fcmp oeq float %x, 0.000000e+00 +; CHECK-DAG: [[Y_GE_0:%.+]] = fcmp oge float %y, 0.000000e+00 +; CHECK-DAG: [[Y_LT_0:%.+]] = fcmp olt float %y, 0.000000e+00 +; CHECK: [[XLT0_AND_YGE0:%.+]] = and i1 [[X_LT_0]], [[Y_GE_0]] +; CHECK: [[SELECT_ADD_PI:%.+]] = select i1 [[XLT0_AND_YGE0]], float [[ADD_PI]], float [[ATAN]] +; CHECK: [[XLT0_AND_YLT0:%.+]] = and i1 [[X_LT_0]], [[Y_LT_0]] +; CHECK: [[SELECT_SUB_PI:%.+]] = select i1 [[XLT0_AND_YLT0]], float [[SUB_PI]], float [[SELECT_ADD_PI]] +; CHECK: [[XEQ0_AND_YLT0:%.+]] = and i1 [[X_EQ_0]], [[Y_LT_0]] +; CHECK: [[SELECT_NEGHPI:%.+]] = select i1 [[XEQ0_AND_YLT0]], float 0xBFF921FB60000000, float [[SELECT_SUB_PI]] +; CHECK: [[XEQ0_AND_YGE0:%.+]] = and i1 [[X_EQ_0]], [[Y_GE_0]] +; CHECK: [[SELECT_HPI:%.+]] = select i1 [[XEQ0_AND_YGE0]], float 0x3FF921FB60000000, float [[SELECT_NEGHPI]] +; CHECK: ret float [[SELECT_HPI]] + %elt.atan2 = call float @llvm.atan2.f32(float %y, float %x) + ret float %elt.atan2 +} + +define noundef half @atan2_half(half noundef %y, half noundef %x) { +entry: +; CHECK: [[DIV:%.+]] = fdiv half %y, %x +; EXPCHECK: [[ATAN:%.+]] = call half @llvm.atan.f16(half [[DIV]]) +; DOPCHECK: [[ATAN:%.+]] = call half @dx.op.unary.f16(i32 17, half [[DIV]]) +; CHECK-DAG: [[ADD_PI:%.+]] = fadd half [[ATAN]], 0xH4248 +; CHECK-DAG: [[SUB_PI:%.+]] = fsub half [[ATAN]], 0xH4248 +; CHECK-DAG: [[X_LT_0:%.+]] = fcmp olt half %x, 0xH0000 +; CHECK-DAG: [[X_EQ_0:%.+]] = fcmp oeq half %x, 0xH0000 +; CHECK-DAG: [[Y_GE_0:%.+]] = fcmp oge half %y, 0xH0000 +; CHECK-DAG: [[Y_LT_0:%.+]] = fcmp olt half %y, 0xH0000 +; CHECK: [[XLT0_AND_YGE0:%.+]] = and i1 [[X_LT_0]], [[Y_GE_0]] +; CHECK: [[SELECT_ADD_PI:%.+]] = select i1 [[XLT0_AND_YGE0]], half [[ADD_PI]], half [[ATAN]] +; CHECK: [[XLT0_AND_YLT0:%.+]] = and i1 [[X_LT_0]], [[Y_LT_0]] +; CHECK: [[SELECT_SUB_PI:%.+]] = select i1 [[XLT0_AND_YLT0]], half [[SUB_PI]], half [[SELECT_ADD_PI]] +; CHECK: [[XEQ0_AND_YLT0:%.+]] = and i1 [[X_EQ_0]], [[Y_LT_0]] +; CHECK: [[SELECT_NEGHPI:%.+]] = select i1 [[XEQ0_AND_YLT0]], half 0xHBE48, half [[SELECT_SUB_PI]] +; CHECK: [[XEQ0_AND_YGE0:%.+]] = and i1 [[X_EQ_0]], [[Y_GE_0]] +; CHECK: [[SELECT_HPI:%.+]] = select i1 [[XEQ0_AND_YGE0]], half 0xH3E48, half [[SELECT_NEGHPI]] +; CHECK: ret half [[SELECT_HPI]] + %elt.atan2 = call half @llvm.atan2.f16(half %y, half %x) + ret half %elt.atan2 +} + +define noundef <4 x float> @atan2_float4(<4 x float> noundef %y, <4 x float> noundef %x) { +entry: +; Just Expansion, no scalarization or lowering: +; EXPCHECK: [[DIV:%.+]] = fdiv <4 x float> %y, %x +; EXPCHECK: [[ATAN:%.+]] = call <4 x float> @llvm.atan.v4f32(<4 x float> [[DIV]]) +; EXPCHECK-DAG: [[ADD_PI:%.+]] = fadd <4 x float> [[ATAN]], <float 0x400921FB60000000, float 0x400921FB60000000, float 0x400921FB60000000, float 0x400921FB60000000> +; EXPCHECK-DAG: [[SUB_PI:%.+]] = fsub <4 x float> [[ATAN]], <float 0x400921FB60000000, float 0x400921FB60000000, float 0x400921FB60000000, float 0x400921FB60000000> +; EXPCHECK-DAG: [[X_LT_0:%.+]] = fcmp olt <4 x float> %x, zeroinitializer +; EXPCHECK-DAG: [[X_EQ_0:%.+]] = fcmp oeq <4 x float> %x, zeroinitializer +; EXPCHECK-DAG: [[Y_GE_0:%.+]] = fcmp oge <4 x float> %y, zeroinitializer +; EXPCHECK-DAG: [[Y_LT_0:%.+]] = fcmp olt <4 x float> %y, zeroinitializer 
+; EXPCHECK: [[XLT0_AND_YGE0:%.+]] = and <4 x i1> [[X_LT_0]], [[Y_GE_0]] +; EXPCHECK: [[SELECT_ADD_PI:%.+]] = select <4 x i1> [[XLT0_AND_YGE0]], <4 x float> [[ADD_PI]], <4 x float> [[ATAN]] +; EXPCHECK: [[XLT0_AND_YLT0:%.+]] = and <4 x i1> [[X_LT_0]], [[Y_LT_0]] +; EXPCHECK: [[SELECT_SUB_PI:%.+]] = select <4 x i1> [[XLT0_AND_YLT0]], <4 x float> [[SUB_PI]], <4 x float> [[SELECT_ADD_PI]] +; EXPCHECK: [[XEQ0_AND_YLT0:%.+]] = and <4 x i1> [[X_EQ_0]], [[Y_LT_0]] +; EXPCHECK: [[SELECT_NEGHPI:%.+]] = select <4 x i1> [[XEQ0_AND_YLT0]], <4 x float> <float 0xBFF921FB60000000, float 0xBFF921FB60000000, float 0xBFF921FB60000000, float 0xBFF921FB60000000>, <4 x float> [[SELECT_SUB_PI]] +; EXPCHECK: [[XEQ0_AND_YGE0:%.+]] = and <4 x i1> [[X_EQ_0]], [[Y_GE_0]] +; EXPCHECK: [[SELECT_HPI:%.+]] = select <4 x i1> [[XEQ0_AND_YGE0]], <4 x float> <float 0x3FF921FB60000000, float 0x3FF921FB60000000, float 0x3FF921FB60000000, float 0x3FF921FB60000000>, <4 x float> [[SELECT_NEGHPI]] +; EXPCHECK: ret <4 x float> [[SELECT_HPI]] + +; Scalarization occurs after expansion, so atan scalarization is tested separately. +; Expansion, scalarization and lowering: +; Just make sure this expands to exactly 4 scalar DXIL atan (OpCode=17) calls. +; DOPCHECK-COUNT-4: call float @dx.op.unary.f32(i32 17, float %{{.*}}) +; DOPCHECK-NOT: call float @dx.op.unary.f32(i32 17, + + %elt.atan2 = call <4 x float> @llvm.atan2.v4f32(<4 x float> %y, <4 x float> %x) + ret <4 x float> %elt.atan2 +} + +declare half @llvm.atan2.f16(half, half) +declare float @llvm.atan2.f32(float, float) +declare <4 x float> @llvm.atan2.v4f32(<4 x float>, <4 x float>) diff --git a/llvm/test/CodeGen/DirectX/atan2_error.ll b/llvm/test/CodeGen/DirectX/atan2_error.ll index 372934098b7cab600d87ca5e2357067a55e741dc..9b66f9f1dd45a70b883b6f77bea0547ac3b4a4cf 100644 --- a/llvm/test/CodeGen/DirectX/atan2_error.ll +++ b/llvm/test/CodeGen/DirectX/atan2_error.ll @@ -1,11 +1,11 @@ -; RUN: not opt -S -dxil-intrinsic-expansion -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s 2>&1 | FileCheck %s - -; DXIL operation atan does not support double overload type -; CHECK: in function atan2_double -; CHECK-SAME: Cannot create ATan operation: Invalid overload type - -define noundef double @atan2_double(double noundef %a, double noundef %b) #0 { -entry: - %1 = call double @llvm.atan2.f64(double %a, double %b) - ret double %1 -} +; RUN: not opt -S -dxil-intrinsic-expansion -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s 2>&1 | FileCheck %s + +; DXIL operation atan does not support double overload type +; CHECK: in function atan2_double +; CHECK-SAME: Cannot create ATan operation: Invalid overload type + +define noundef double @atan2_double(double noundef %a, double noundef %b) #0 { +entry: + %1 = call double @llvm.atan2.f64(double %a, double %b) + ret double %1 +} diff --git a/llvm/test/CodeGen/DirectX/conflicting-bitcast-insert.ll b/llvm/test/CodeGen/DirectX/conflicting-bitcast-insert.ll index 8f5d3ae8641717f6fe102a43ebfe0ab29b245d54..39e21daceea81ac2f898d63bb81d5d83c6f36681 100644 --- a/llvm/test/CodeGen/DirectX/conflicting-bitcast-insert.ll +++ b/llvm/test/CodeGen/DirectX/conflicting-bitcast-insert.ll @@ -1,25 +1,27 @@ ; RUN: llc --filetype=asm %s -o - | FileCheck %s target triple = "dxil-unknown-shadermodel6.7-library" -define i64 @test(ptr %p) { +define i64 @test(ptr %p) #0 { store i32 0, ptr %p %v = load i64, ptr %p ret i64 %v } -; CHECK: define internal i64 @test(ptr %p) { +; CHECK: define i64 @test(ptr %p) #0 { ; CHECK-NEXT: %1 = bitcast ptr %p to ptr ; CHECK-NEXT: store i32 0, ptr %1, align 4 ; CHECK-NEXT: %2 = bitcast ptr %p to ptr ; CHECK-NEXT: %3 = load i64, ptr %2, align 8 -define i64 @testGEP(ptr %p) { +define i64 @testGEP(ptr %p) #0 { %ptr = getelementptr i32, 
ptr %p, i32 4 %val = load i64, ptr %p ret i64 %val } -; CHECK: define internal i64 @testGEP(ptr %p) { +attributes #0 = { convergent norecurse nounwind "hlsl.export"} + +; CHECK: define i64 @testGEP(ptr %p) #0 { ; CHECK-NEXT: %1 = bitcast ptr %p to ptr ; CHECK-NEXT: %ptr = getelementptr i32, ptr %1, i32 4 ; CHECK-NEXT: %2 = bitcast ptr %p to ptr diff --git a/llvm/test/CodeGen/DirectX/cross.ll b/llvm/test/CodeGen/DirectX/cross.ll index 6ec3ec4d3594af99a2f58f4b60c1034cfbf9db39..6153cf7cddc9d539c46e00de461e819168657018 100644 --- a/llvm/test/CodeGen/DirectX/cross.ll +++ b/llvm/test/CodeGen/DirectX/cross.ll @@ -1,56 +1,56 @@ -; RUN: opt -S -dxil-intrinsic-expansion < %s | FileCheck %s - -; Make sure dxil operation function calls for cross are generated for half/float. - -declare <3 x half> @llvm.dx.cross.v3f16(<3 x half>, <3 x half>) -declare <3 x float> @llvm.dx.cross.v3f32(<3 x float>, <3 x float>) - -define noundef <3 x half> @test_cross_half3(<3 x half> noundef %p0, <3 x half> noundef %p1) { -entry: - ; CHECK: %x0 = extractelement <3 x half> %p0, i64 0 - ; CHECK: %x1 = extractelement <3 x half> %p0, i64 1 - ; CHECK: %x2 = extractelement <3 x half> %p0, i64 2 - ; CHECK: %y0 = extractelement <3 x half> %p1, i64 0 - ; CHECK: %y1 = extractelement <3 x half> %p1, i64 1 - ; CHECK: %y2 = extractelement <3 x half> %p1, i64 2 - ; CHECK: %0 = fmul half %x1, %y2 - ; CHECK: %1 = fmul half %x2, %y1 - ; CHECK: %hlsl.cross1 = fsub half %0, %1 - ; CHECK: %2 = fmul half %x2, %y0 - ; CHECK: %3 = fmul half %x0, %y2 - ; CHECK: %hlsl.cross2 = fsub half %2, %3 - ; CHECK: %4 = fmul half %x0, %y1 - ; CHECK: %5 = fmul half %x1, %y0 - ; CHECK: %hlsl.cross3 = fsub half %4, %5 - ; CHECK: %6 = insertelement <3 x half> undef, half %hlsl.cross1, i64 0 - ; CHECK: %7 = insertelement <3 x half> %6, half %hlsl.cross2, i64 1 - ; CHECK: %8 = insertelement <3 x half> %7, half %hlsl.cross3, i64 2 - ; CHECK: ret <3 x half> %8 - %hlsl.cross = call <3 x half> @llvm.dx.cross.v3f16(<3 x half> %p0, <3 x half> %p1) - ret <3 x half> %hlsl.cross -} - -define noundef <3 x float> @test_cross_float3(<3 x float> noundef %p0, <3 x float> noundef %p1) { -entry: - ; CHECK: %x0 = extractelement <3 x float> %p0, i64 0 - ; CHECK: %x1 = extractelement <3 x float> %p0, i64 1 - ; CHECK: %x2 = extractelement <3 x float> %p0, i64 2 - ; CHECK: %y0 = extractelement <3 x float> %p1, i64 0 - ; CHECK: %y1 = extractelement <3 x float> %p1, i64 1 - ; CHECK: %y2 = extractelement <3 x float> %p1, i64 2 - ; CHECK: %0 = fmul float %x1, %y2 - ; CHECK: %1 = fmul float %x2, %y1 - ; CHECK: %hlsl.cross1 = fsub float %0, %1 - ; CHECK: %2 = fmul float %x2, %y0 - ; CHECK: %3 = fmul float %x0, %y2 - ; CHECK: %hlsl.cross2 = fsub float %2, %3 - ; CHECK: %4 = fmul float %x0, %y1 - ; CHECK: %5 = fmul float %x1, %y0 - ; CHECK: %hlsl.cross3 = fsub float %4, %5 - ; CHECK: %6 = insertelement <3 x float> undef, float %hlsl.cross1, i64 0 - ; CHECK: %7 = insertelement <3 x float> %6, float %hlsl.cross2, i64 1 - ; CHECK: %8 = insertelement <3 x float> %7, float %hlsl.cross3, i64 2 - ; CHECK: ret <3 x float> %8 - %hlsl.cross = call <3 x float> @llvm.dx.cross.v3f32(<3 x float> %p0, <3 x float> %p1) - ret <3 x float> %hlsl.cross -} +; RUN: opt -S -dxil-intrinsic-expansion < %s | FileCheck %s + +; Make sure dxil operation function calls for cross are generated for half/float. 
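; The expansion checked below implements the textbook formula
;   cross(p, q) = (p1*q2 - p2*q1, p2*q0 - p0*q2, p0*q1 - p1*q0)
; with six extractelements, six fmuls, three fsubs, and three insertelements.
; For the first result lane the float checks below reduce to:
;   %0 = fmul float %x1, %y2
;   %1 = fmul float %x2, %y1
;   %hlsl.cross1 = fsub float %0, %1
; and the half and float CHECK lines spell this out lane by lane.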
+ +declare <3 x half> @llvm.dx.cross.v3f16(<3 x half>, <3 x half>) +declare <3 x float> @llvm.dx.cross.v3f32(<3 x float>, <3 x float>) + +define noundef <3 x half> @test_cross_half3(<3 x half> noundef %p0, <3 x half> noundef %p1) { +entry: + ; CHECK: %x0 = extractelement <3 x half> %p0, i64 0 + ; CHECK: %x1 = extractelement <3 x half> %p0, i64 1 + ; CHECK: %x2 = extractelement <3 x half> %p0, i64 2 + ; CHECK: %y0 = extractelement <3 x half> %p1, i64 0 + ; CHECK: %y1 = extractelement <3 x half> %p1, i64 1 + ; CHECK: %y2 = extractelement <3 x half> %p1, i64 2 + ; CHECK: %0 = fmul half %x1, %y2 + ; CHECK: %1 = fmul half %x2, %y1 + ; CHECK: %hlsl.cross1 = fsub half %0, %1 + ; CHECK: %2 = fmul half %x2, %y0 + ; CHECK: %3 = fmul half %x0, %y2 + ; CHECK: %hlsl.cross2 = fsub half %2, %3 + ; CHECK: %4 = fmul half %x0, %y1 + ; CHECK: %5 = fmul half %x1, %y0 + ; CHECK: %hlsl.cross3 = fsub half %4, %5 + ; CHECK: %6 = insertelement <3 x half> undef, half %hlsl.cross1, i64 0 + ; CHECK: %7 = insertelement <3 x half> %6, half %hlsl.cross2, i64 1 + ; CHECK: %8 = insertelement <3 x half> %7, half %hlsl.cross3, i64 2 + ; CHECK: ret <3 x half> %8 + %hlsl.cross = call <3 x half> @llvm.dx.cross.v3f16(<3 x half> %p0, <3 x half> %p1) + ret <3 x half> %hlsl.cross +} + +define noundef <3 x float> @test_cross_float3(<3 x float> noundef %p0, <3 x float> noundef %p1) { +entry: + ; CHECK: %x0 = extractelement <3 x float> %p0, i64 0 + ; CHECK: %x1 = extractelement <3 x float> %p0, i64 1 + ; CHECK: %x2 = extractelement <3 x float> %p0, i64 2 + ; CHECK: %y0 = extractelement <3 x float> %p1, i64 0 + ; CHECK: %y1 = extractelement <3 x float> %p1, i64 1 + ; CHECK: %y2 = extractelement <3 x float> %p1, i64 2 + ; CHECK: %0 = fmul float %x1, %y2 + ; CHECK: %1 = fmul float %x2, %y1 + ; CHECK: %hlsl.cross1 = fsub float %0, %1 + ; CHECK: %2 = fmul float %x2, %y0 + ; CHECK: %3 = fmul float %x0, %y2 + ; CHECK: %hlsl.cross2 = fsub float %2, %3 + ; CHECK: %4 = fmul float %x0, %y1 + ; CHECK: %5 = fmul float %x1, %y0 + ; CHECK: %hlsl.cross3 = fsub float %4, %5 + ; CHECK: %6 = insertelement <3 x float> undef, float %hlsl.cross1, i64 0 + ; CHECK: %7 = insertelement <3 x float> %6, float %hlsl.cross2, i64 1 + ; CHECK: %8 = insertelement <3 x float> %7, float %hlsl.cross3, i64 2 + ; CHECK: ret <3 x float> %8 + %hlsl.cross = call <3 x float> @llvm.dx.cross.v3f32(<3 x float> %p0, <3 x float> %p1) + ret <3 x float> %hlsl.cross +} diff --git a/llvm/test/CodeGen/DirectX/finalize-linkage-remove-dead-lib.ll b/llvm/test/CodeGen/DirectX/finalize-linkage-remove-dead-lib.ll new file mode 100644 index 0000000000000000000000000000000000000000..202609c8156a72326768b6c550d18d2fd83ccba0 --- /dev/null +++ b/llvm/test/CodeGen/DirectX/finalize-linkage-remove-dead-lib.ll @@ -0,0 +1,222 @@ +; RUN: opt -S -dxil-finalize-linkage -mtriple=dxil-unknown-shadermodel6.5-library %s | FileCheck %s +; RUN: llc %s --filetype=asm -o - | FileCheck %s + +target triple = "dxilv1.5-pc-shadermodel6.5-compute" + +; Confirm that DXILFinalizeLinkage will remove functions that have compatible +; linkage and are not called from anywhere. This should be any function that +; is not explicitly marked export and is not an entry point. + +; Has no specified inlining/linking behavior and is uncalled, this should be removed. +; CHECK-NOT: define {{.*}}doNothingUncalled +define void @"?doNothingUncalled@@YAXXZ"() #2 { +entry: + ret void +} + +; Alwaysinline and uncalled, this should be removed. 
+; CHECK-NOT: define {{.*}}doAlwaysInlineUncalled +define void @"?doAlwaysInlineUncalled@@YAXXZ"() #0 { +entry: + ret void +} + +; Noinline and uncalled, this should be removed. +; CHECK-NOT: define {{.*}}doNoinlineUncalled +define void @"?doNoinlineUncalled@@YAXXZ"() #4 { +entry: + ret void +} + +; No inlining attribute, internal, and uncalled; this should be removed. +; CHECK-NOT: define {{.*}}doInternalUncalled +define internal void @"?doInternalUncalled@@YAXXZ"() #2 { +entry: + ret void +} + +; Alwaysinline, internal, and uncalled; this should be removed. +; CHECK-NOT: define {{.*}}doAlwaysInlineInternalUncalled +define internal void @"?doAlwaysInlineInternalUncalled@@YAXXZ"() #0 { +entry: + ret void +} + +; Noinline, internal, and uncalled; this should be removed. +; CHECK-NOT: define {{.*}}doNoinlineInternalUncalled +define internal void @"?doNoinlineInternalUncalled@@YAXXZ"() #4 { +entry: + ret void +} + +; Marked external and uncalled, this should become internal and be removed. +; CHECK-NOT: define {{.*}}doExternalUncalled +define external void @"?doExternalUncalled@@YAXXZ"() #2 { +entry: + ret void +} + +; Alwaysinline, external and uncalled, this should become internal and be removed. +; CHECK-NOT: define {{.*}}doAlwaysInlineExternalUncalled +define external void @"?doAlwaysInlineExternalUncalled@@YAXXZ"() #0 { +entry: + ret void +} + +; Noinline, external and uncalled, this should become internal and be removed. +; CHECK-NOT: define {{.*}}doNoinlineExternalUncalled +define external void @"?doNoinlineExternalUncalled@@YAXXZ"() #4 { +entry: + ret void +} + +; No inlining attribute and called, this should stay. +; CHECK: define {{.*}}doNothingCalled +define void @"?doNothingCalled@@YAXXZ"() #2 { +entry: + ret void +} + +; Alwaysinline and called, this should stay. +; CHECK: define {{.*}}doAlwaysInlineCalled +define void @"?doAlwaysInlineCalled@@YAXXZ"() #0 { +entry: + ret void +} + +; Noinline and called, this should stay. +; CHECK: define {{.*}}doNoinlineCalled +define void @"?doNoinlineCalled@@YAXXZ"() #4 { +entry: + ret void +} + +; No inlining attribute, internal, and called; this should stay. +; CHECK: define {{.*}}doInternalCalled +define internal void @"?doInternalCalled@@YAXXZ"() #2 { +entry: + ret void +} + +; Alwaysinline, internal, and called; this should stay. +; CHECK: define {{.*}}doAlwaysInlineInternalCalled +define internal void @"?doAlwaysInlineInternalCalled@@YAXXZ"() #0 { +entry: + ret void +} + +; Noinline, internal, and called; this should stay. +; CHECK: define {{.*}}doNoinlineInternalCalled +define internal void @"?doNoinlineInternalCalled@@YAXXZ"() #4 { +entry: + ret void +} + +; Marked external and called, this should become internal and stay. +; CHECK: define {{.*}}doExternalCalled +define external void @"?doExternalCalled@@YAXXZ"() #2 { +entry: + ret void +} + +; Always inlined, external and called, this should become internal and stay. +; CHECK: define {{.*}}doAlwaysInlineExternalCalled +define external void @"?doAlwaysInlineExternalCalled@@YAXXZ"() #0 { +entry: + ret void +} + +; Noinline, external and called, this should become internal and stay. +; CHECK: define {{.*}}doNoinlineExternalCalled +define external void @"?doNoinlineExternalCalled@@YAXXZ"() #4 { +entry: + ret void +} + +; No inlining attribute and exported, this should stay. +; CHECK: define {{.*}}doNothingExported +define void @"?doNothingExported@@YAXXZ"() #3 { +entry: + ret void +} + +; Alwaysinline and exported, this should stay. 
+; CHECK: define {{.*}}doAlwaysInlineExported +define void @"?doAlwaysInlineExported@@YAXXZ"() #1 { +entry: + ret void +} + +; Noinline attribute and exported, this should stay. +; CHECK: define {{.*}}doNoinlineExported +define void @"?doNoinlineExported@@YAXXZ"() #5 { +entry: + ret void +} + +; No inlining attribute, internal, and exported; this should stay. +; CHECK: define {{.*}}doInternalExported +define internal void @"?doInternalExported@@YAXXZ"() #3 { +entry: + ret void +} + +; Alwaysinline, internal, and exported; this should stay. +; CHECK: define {{.*}}doAlwaysInlineInternalExported +define internal void @"?doAlwaysInlineInternalExported@@YAXXZ"() #1 { +entry: + ret void +} + +; Noinline, internal, and exported; this should stay. +; CHECK: define {{.*}}doNoinlineInternalExported +define internal void @"?doNoinlineInternalExported@@YAXXZ"() #5 { +entry: + ret void +} + +; Marked external and exported, this should stay. +; CHECK: define {{.*}}doExternalExported +define external void @"?doExternalExported@@YAXXZ"() #3 { +entry: + ret void +} + +; Alwaysinline, external and exported, this should stay. +; CHECK: define {{.*}}doAlwaysInlineExternalExported +define external void @"?doAlwaysInlineExternalExported@@YAXXZ"() #1 { +entry: + ret void +} + +; Noinline, external and exported, this should stay. +; CHECK: define {{.*}}doNoinlineExternalExported +define external void @"?doNoinlineExternalExported@@YAXXZ"() #5 { +entry: + ret void +} + +; Entry point function, this should stay. +; CHECK: define void @main() +define void @main() #6 { +entry: + call void @"?doNothingCalled@@YAXXZ"() #7 + call void @"?doAlwaysInlineCalled@@YAXXZ"() #7 + call void @"?doNoinlineCalled@@YAXXZ"() #7 + call void @"?doInternalCalled@@YAXXZ"() #7 + call void @"?doAlwaysInlineInternalCalled@@YAXXZ"() #7 + call void @"?doNoinlineInternalCalled@@YAXXZ"() #7 + call void @"?doExternalCalled@@YAXXZ"() #7 + call void @"?doAlwaysInlineExternalCalled@@YAXXZ"() #7 + call void @"?doNoinlineExternalCalled@@YAXXZ"() #7 + ret void +} + +attributes #0 = { alwaysinline convergent norecurse nounwind } +attributes #1 = { alwaysinline convergent norecurse nounwind "hlsl.export"} +attributes #2 = { convergent norecurse nounwind } +attributes #3 = { convergent norecurse nounwind "hlsl.export"} +attributes #4 = { convergent noinline norecurse nounwind } +attributes #5 = { convergent noinline norecurse nounwind "hlsl.export"} +attributes #6 = { convergent noinline norecurse "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" } +attributes #7 = { convergent } diff --git a/llvm/test/CodeGen/DirectX/finalize-linkage-remove-dead.ll b/llvm/test/CodeGen/DirectX/finalize-linkage-remove-dead.ll new file mode 100644 index 0000000000000000000000000000000000000000..49c3bda621d74e448f62d3d9afd1bbc31d4f8e7b --- /dev/null +++ b/llvm/test/CodeGen/DirectX/finalize-linkage-remove-dead.ll @@ -0,0 +1,156 @@ +; RUN: opt -S -dxil-finalize-linkage -mtriple=dxil-unknown-shadermodel6.5-compute %s | FileCheck %s +; RUN: llc %s --filetype=asm -o - | FileCheck %s + +target triple = "dxilv1.5-pc-shadermodel6.5-compute" + +; Confirm that DXILFinalizeLinkage will remove functions that have compatible +; linkage and are not called from anywhere. This should be any function that +; is not an entry point. + +; Has no specified inlining/linking behavior and is uncalled, this should be removed. +; CHECK-NOT: define {{.*}}doNothingUncalled +define void @"?doNothingUncalled@@YAXXZ"() #1 { +entry: + ret void +} + +; Alwaysinline and uncalled, this should be removed. 
+; CHECK-NOT: define {{.*}}doAlwaysInlineUncalled +define void @"?doAlwaysInlineUncalled@@YAXXZ"() #0 { +entry: + ret void +} + +; Noinline and uncalled, this should be removed. +; CHECK-NOT: define {{.*}}doNoinlineUncalled +define void @"?doNoinlineUncalled@@YAXXZ"() #3 { +entry: + ret void +} + +; No inlining attribute, internal, and uncalled; this should be removed. +; CHECK-NOT: define {{.*}}doInternalUncalled +define internal void @"?doInternalUncalled@@YAXXZ"() #1 { +entry: + ret void +} + +; Alwaysinline, internal, and uncalled; this should be removed. +; CHECK-NOT: define {{.*}}doAlwaysInlineInternalUncalled +define internal void @"?doAlwaysInlineInternalUncalled@@YAXXZ"() #0 { +entry: + ret void +} + +; Noinline, internal, and uncalled; this should be removed. +; CHECK-NOT: define {{.*}}doNoinlineInternalUncalled +define internal void @"?doNoinlineInternalUncalled@@YAXXZ"() #3 { +entry: + ret void +} + +; Marked external and uncalled, this should become internal and be removed. +; CHECK-NOT: define {{.*}}doExternalUncalled +define external void @"?doExternalUncalled@@YAXXZ"() #1 { +entry: + ret void +} + +; Alwaysinline, external and uncalled, this should become internal and be removed. +; CHECK-NOT: define {{.*}}doAlwaysInlineExternalUncalled +define external void @"?doAlwaysInlineExternalUncalled@@YAXXZ"() #0 { +entry: + ret void +} + +; Noinline, external and uncalled, this should become internal and be removed. +; CHECK-NOT: define {{.*}}doNoinlineExternalUncalled +define external void @"?doNoinlineExternalUncalled@@YAXXZ"() #3 { +entry: + ret void +} + +; No inlining attribute and called, this should stay. +; CHECK: define {{.*}}doNothingCalled +define void @"?doNothingCalled@@YAXXZ"() #1 { +entry: + ret void +} + +; Alwaysinline and called, this should stay. +; CHECK: define {{.*}}doAlwaysInlineCalled +define void @"?doAlwaysInlineCalled@@YAXXZ"() #0 { +entry: + ret void +} + +; Noinline and called, this should stay. +; CHECK: define {{.*}}doNoinlineCalled +define void @"?doNoinlineCalled@@YAXXZ"() #3 { +entry: + ret void +} + +; No inlining attribute, internal, and called; this should stay. +; CHECK: define {{.*}}doInternalCalled +define internal void @"?doInternalCalled@@YAXXZ"() #1 { +entry: + ret void +} + +; Alwaysinline, internal, and called; this should stay. +; CHECK: define {{.*}}doAlwaysInlineInternalCalled +define internal void @"?doAlwaysInlineInternalCalled@@YAXXZ"() #0 { +entry: + ret void +} + +; Noinline, internal, and called; this should stay. +; CHECK: define {{.*}}doNoinlineInternalCalled +define internal void @"?doNoinlineInternalCalled@@YAXXZ"() #3 { +entry: + ret void +} + +; Marked external and called, this should become internal and stay. +; CHECK: define {{.*}}doExternalCalled +define external void @"?doExternalCalled@@YAXXZ"() #1 { +entry: + ret void +} + +; Always inlined, external and called, this should become internal and stay. +; CHECK: define {{.*}}doAlwaysInlineExternalCalled +define external void @"?doAlwaysInlineExternalCalled@@YAXXZ"() #0 { +entry: + ret void +} + +; Noinline, external and called, this should become internal and stay. +; CHECK: define {{.*}}doNoinlineExternalCalled +define external void @"?doNoinlineExternalCalled@@YAXXZ"() #3 { +entry: + ret void +} + +; Entry point function, this should stay. 
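+; (Entry points are recognized by the "hlsl.shader" attribute; see
+; attributes #4 at the bottom of this file.)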
+; CHECK: define void @main() +define void @main() #4 { +entry: + call void @"?doNothingCalled@@YAXXZ"() #5 + call void @"?doAlwaysInlineCalled@@YAXXZ"() #5 + call void @"?doNoinlineCalled@@YAXXZ"() #5 + call void @"?doInternalCalled@@YAXXZ"() #5 + call void @"?doAlwaysInlineInternalCalled@@YAXXZ"() #5 + call void @"?doNoinlineInternalCalled@@YAXXZ"() #5 + call void @"?doExternalCalled@@YAXXZ"() #5 + call void @"?doAlwaysInlineExternalCalled@@YAXXZ"() #5 + call void @"?doNoinlineExternalCalled@@YAXXZ"() #5 + ret void +} + +attributes #0 = { alwaysinline convergent norecurse nounwind } +attributes #1 = { convergent norecurse nounwind } +attributes #3 = { convergent noinline norecurse nounwind } +attributes #4 = { convergent noinline norecurse "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" } +attributes #5 = { convergent } diff --git a/llvm/test/CodeGen/DirectX/finalize_linkage.ll b/llvm/test/CodeGen/DirectX/finalize_linkage.ll index 0ee8a5f44593ba1eec510ed84f56deaf074b58cd..c761a79a5c28a5ee9889fb5f69dff9e79edc6d4f 100644 --- a/llvm/test/CodeGen/DirectX/finalize_linkage.ll +++ b/llvm/test/CodeGen/DirectX/finalize_linkage.ll @@ -1,64 +1,64 @@ -; RUN: opt -S -dxil-finalize-linkage -mtriple=dxil-unknown-shadermodel6.5-compute %s | FileCheck %s -; RUN: llc %s --filetype=asm -o - | FileCheck %s --check-prefixes=CHECK-LLC - -target triple = "dxilv1.5-pc-shadermodel6.5-compute" - -; DXILFinalizeLinkage changes linkage of all functions that are not -; entry points or exported function to internal. - -; CHECK: define internal void @"?f1@@YAXXZ"() -define void @"?f1@@YAXXZ"() #0 { -entry: - ret void -} - -; CHECK: define internal void @"?f2@@YAXXZ"() -define void @"?f2@@YAXXZ"() #0 { -entry: - ret void -} - -; CHECK: define internal void @"?f3@@YAXXZ"() -define void @"?f3@@YAXXZ"() #0 { -entry: - ret void -} - -; CHECK: define internal void @"?foo@@YAXXZ"() -define void @"?foo@@YAXXZ"() #0 { -entry: - call void @"?f2@@YAXXZ"() #3 - ret void -} - -; Exported function - do not change linkage -; CHECK: define void @"?bar@@YAXXZ"() -define void @"?bar@@YAXXZ"() #1 { -entry: - call void @"?f3@@YAXXZ"() #3 - ret void -} - -; CHECK: define internal void @"?main@@YAXXZ"() #0 -define internal void @"?main@@YAXXZ"() #0 { -entry: - call void @"?foo@@YAXXZ"() #3 - call void @"?bar@@YAXXZ"() #3 - ret void -} - -; Entry point function - do not change linkage -; CHECK: define void @main() #2 -define void @main() #2 { -entry: - call void @"?main@@YAXXZ"() - ret void -} - -attributes #0 = { convergent noinline nounwind optnone} -attributes #1 = { convergent noinline nounwind optnone "hlsl.export"} -attributes #2 = { convergent "hlsl.numthreads"="4,1,1" "hlsl.shader"="compute"} -attributes #3 = { convergent } - -; Make sure "hlsl.export" attribute is stripped by llc -; CHECK-LLC-NOT: "hlsl.export" +; RUN: opt -S -dxil-finalize-linkage -mtriple=dxil-unknown-shadermodel6.5-compute %s | FileCheck %s +; RUN: llc %s --filetype=asm -o - | FileCheck %s --check-prefixes=CHECK-LLC + +target triple = "dxilv1.5-pc-shadermodel6.5-compute" + +; DXILFinalizeLinkage changes linkage of all functions that are not +; entry points or exported function to internal. 
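+;
+; A minimal sketch of the rewrite (illustrative only, reusing @"?f2@@YAXXZ"
+; from this file): the pass turns
+;
+;   define void @"?f2@@YAXXZ"() #0 { ... }
+;
+; into
+;
+;   define internal void @"?f2@@YAXXZ"() #0 { ... }
+;
+; "hlsl.export" functions (@"?bar@@YAXXZ") and the @main entry point keep
+; their linkage, and internalized functions that end up with no uses, such
+; as @"?f1@@YAXXZ", may be deleted outright.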
+ +; CHECK-NOT: define internal void @"?f1@@YAXXZ"() +define void @"?f1@@YAXXZ"() #0 { +entry: + ret void +} + +; CHECK: define internal void @"?f2@@YAXXZ"() +define void @"?f2@@YAXXZ"() #0 { +entry: + ret void +} + +; CHECK: define internal void @"?f3@@YAXXZ"() +define void @"?f3@@YAXXZ"() #0 { +entry: + ret void +} + +; CHECK: define internal void @"?foo@@YAXXZ"() +define void @"?foo@@YAXXZ"() #0 { +entry: + call void @"?f2@@YAXXZ"() #3 + ret void +} + +; Exported function - do not change linkage +; CHECK: define void @"?bar@@YAXXZ"() +define void @"?bar@@YAXXZ"() #1 { +entry: + call void @"?f3@@YAXXZ"() #3 + ret void +} + +; CHECK: define internal void @"?main@@YAXXZ"() #0 +define internal void @"?main@@YAXXZ"() #0 { +entry: + call void @"?foo@@YAXXZ"() #3 + call void @"?bar@@YAXXZ"() #3 + ret void +} + +; Entry point function - do not change linkage +; CHECK: define void @main() #2 +define void @main() #2 { +entry: + call void @"?main@@YAXXZ"() + ret void +} + +attributes #0 = { convergent noinline nounwind optnone} +attributes #1 = { convergent noinline nounwind optnone "hlsl.export"} +attributes #2 = { convergent "hlsl.numthreads"="4,1,1" "hlsl.shader"="compute"} +attributes #3 = { convergent } + +; Make sure "hlsl.export" attribute is stripped by llc +; CHECK-LLC-NOT: "hlsl.export" diff --git a/llvm/test/CodeGen/DirectX/fneg-conversion.ll b/llvm/test/CodeGen/DirectX/fneg-conversion.ll index a397c18398c5f77ab2495de515392ba511e207e7..3acf4790de4b10750ed7b4ab12a7d1b81b515083 100644 --- a/llvm/test/CodeGen/DirectX/fneg-conversion.ll +++ b/llvm/test/CodeGen/DirectX/fneg-conversion.ll @@ -1,14 +1,16 @@ ; RUN: llc %s --filetype=asm -o - | FileCheck %s target triple = "dxil-unknown-shadermodel6.7-library" -define float @negateF(float %0) { +define float @negateF(float %0) #0 { ; CHECK: %2 = fsub float -0.000000e+00, %0 %2 = fneg float %0 ret float %2 } -define double @negateD(double %0) { +define double @negateD(double %0) #0 { ; CHECK: %2 = fsub double -0.000000e+00, %0 %2 = fneg double %0 ret double %2 } + +attributes #0 = { convergent norecurse nounwind "hlsl.export"} \ No newline at end of file diff --git a/llvm/test/CodeGen/DirectX/normalize.ll b/llvm/test/CodeGen/DirectX/normalize.ll index 2aba9d5f74d78ef65f2dcca3ae109d5406062dcf..de106be12437125054e60c4304d3e7fc8122da81 100644 --- a/llvm/test/CodeGen/DirectX/normalize.ll +++ b/llvm/test/CodeGen/DirectX/normalize.ll @@ -1,112 +1,112 @@ -; RUN: opt -S -dxil-intrinsic-expansion < %s | FileCheck %s --check-prefixes=CHECK,EXPCHECK -; RUN: opt -S -dxil-intrinsic-expansion -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library < %s | FileCheck %s --check-prefixes=CHECK,DOPCHECK - -; Make sure dxil operation function calls for normalize are generated for half/float. 
- -declare half @llvm.dx.normalize.f16(half) -declare <2 x half> @llvm.dx.normalize.v2f16(<2 x half>) -declare <3 x half> @llvm.dx.normalize.v3f16(<3 x half>) -declare <4 x half> @llvm.dx.normalize.v4f16(<4 x half>) - -declare float @llvm.dx.normalize.f32(float) -declare <2 x float> @llvm.dx.normalize.v2f32(<2 x float>) -declare <3 x float> @llvm.dx.normalize.v3f32(<3 x float>) -declare <4 x float> @llvm.dx.normalize.v4f32(<4 x float>) - -define noundef half @test_normalize_half(half noundef %p0) { -entry: - ; CHECK: fdiv half %p0, %p0 - %hlsl.normalize = call half @llvm.dx.normalize.f16(half %p0) - ret half %hlsl.normalize -} - -define noundef <2 x half> @test_normalize_half2(<2 x half> noundef %p0) { -entry: - ; EXPCHECK: [[doth2:%.*]] = call half @llvm.dx.dot2.v2f16(<2 x half> %{{.*}}, <2 x half> %{{.*}}) - ; DOPCHECK: [[doth2:%.*]] = call half @dx.op.dot2.f16(i32 54, half %{{.*}}, half %{{.*}}, half %{{.*}}, half %{{.*}}) - ; EXPCHECK: [[rsqrt:%.*]] = call half @llvm.dx.rsqrt.f16(half [[doth2]]) - ; DOPCHECK: [[rsqrt:%.*]] = call half @dx.op.unary.f16(i32 25, half [[doth2]]) - ; CHECK: [[splatinserth2:%.*]] = insertelement <2 x half> poison, half [[rsqrt]], i64 0 - ; CHECK: [[splat:%.*]] = shufflevector <2 x half> [[splatinserth2]], <2 x half> poison, <2 x i32> zeroinitializer - ; CHECK: fmul <2 x half> %p0, [[splat]] - - %hlsl.normalize = call <2 x half> @llvm.dx.normalize.v2f16(<2 x half> %p0) - ret <2 x half> %hlsl.normalize -} - -define noundef <3 x half> @test_normalize_half3(<3 x half> noundef %p0) { -entry: - ; EXPCHECK: [[doth3:%.*]] = call half @llvm.dx.dot3.v3f16(<3 x half> %{{.*}}, <3 x half> %{{.*}}) - ; DOPCHECK: [[doth3:%.*]] = call half @dx.op.dot3.f16(i32 55, half %{{.*}}, half %{{.*}}, half %{{.*}}, half %{{.*}}) - ; EXPCHECK: [[rsqrt:%.*]] = call half @llvm.dx.rsqrt.f16(half [[doth3]]) - ; DOPCHECK: [[rsqrt:%.*]] = call half @dx.op.unary.f16(i32 25, half [[doth3]]) - ; CHECK: [[splatinserth3:%.*]] = insertelement <3 x half> poison, half [[rsqrt]], i64 0 - ; CHECK: [[splat:%.*]] shufflevector <3 x half> [[splatinserth3]], <3 x half> poison, <3 x i32> zeroinitializer - ; CHECK: fmul <3 x half> %p0, %.splat - - %hlsl.normalize = call <3 x half> @llvm.dx.normalize.v3f16(<3 x half> %p0) - ret <3 x half> %hlsl.normalize -} - -define noundef <4 x half> @test_normalize_half4(<4 x half> noundef %p0) { -entry: - ; EXPCHECK: [[doth4:%.*]] = call half @llvm.dx.dot4.v4f16(<4 x half> %{{.*}}, <4 x half> %{{.*}}) - ; DOPCHECK: [[doth4:%.*]] = call half @dx.op.dot4.f16(i32 56, half %{{.*}}, half %{{.*}}, half %{{.*}}, half %{{.*}}) - ; EXPCHECK: [[rsqrt:%.*]] = call half @llvm.dx.rsqrt.f16(half [[doth4]]) - ; DOPCHECK: [[rsqrt:%.*]] = call half @dx.op.unary.f16(i32 25, half [[doth4]]) - ; CHECK: [[splatinserth4:%.*]] = insertelement <4 x half> poison, half [[rsqrt]], i64 0 - ; CHECK: [[splat:%.*]] shufflevector <4 x half> [[splatinserth4]], <4 x half> poison, <4 x i32> zeroinitializer - ; CHECK: fmul <4 x half> %p0, %.splat - - %hlsl.normalize = call <4 x half> @llvm.dx.normalize.v4f16(<4 x half> %p0) - ret <4 x half> %hlsl.normalize -} - -define noundef float @test_normalize_float(float noundef %p0) { -entry: - ; CHECK: fdiv float %p0, %p0 - %hlsl.normalize = call float @llvm.dx.normalize.f32(float %p0) - ret float %hlsl.normalize -} - -define noundef <2 x float> @test_normalize_float2(<2 x float> noundef %p0) { -entry: - ; EXPCHECK: [[dotf2:%.*]] = call float @llvm.dx.dot2.v2f32(<2 x float> %{{.*}}, <2 x float> %{{.*}}) - ; DOPCHECK: [[dotf2:%.*]] = call float @dx.op.dot2.f32(i32 
54, float %{{.*}}, float %{{.*}}, float %{{.*}}, float %{{.*}}) - ; EXPCHECK: [[rsqrt:%.*]] = call float @llvm.dx.rsqrt.f32(float [[dotf2]]) - ; DOPCHECK: [[rsqrt:%.*]] = call float @dx.op.unary.f32(i32 25, float [[dotf2]]) - ; CHECK: [[splatinsertf2:%.*]] = insertelement <2 x float> poison, float [[rsqrt]], i64 0 - ; CHECK: [[splat:%.*]] shufflevector <2 x float> [[splatinsertf2]], <2 x float> poison, <2 x i32> zeroinitializer - ; CHECK: fmul <2 x float> %p0, %.splat - - %hlsl.normalize = call <2 x float> @llvm.dx.normalize.v2f32(<2 x float> %p0) - ret <2 x float> %hlsl.normalize -} - -define noundef <3 x float> @test_normalize_float3(<3 x float> noundef %p0) { -entry: - ; EXPCHECK: [[dotf3:%.*]] = call float @llvm.dx.dot3.v3f32(<3 x float> %{{.*}}, <3 x float> %{{.*}}) - ; DOPCHECK: [[dotf3:%.*]] = call float @dx.op.dot3.f32(i32 55, float %{{.*}}, float %{{.*}}, float %{{.*}}, float %{{.*}}) - ; EXPCHECK: [[rsqrt:%.*]] = call float @llvm.dx.rsqrt.f32(float [[dotf3]]) - ; DOPCHECK: [[rsqrt:%.*]] = call float @dx.op.unary.f32(i32 25, float [[dotf3]]) - ; CHECK: [[splatinsertf3:%.*]] = insertelement <3 x float> poison, float [[rsqrt]], i64 0 - ; CHECK: [[splat:%.*]] shufflevector <3 x float> [[splatinsertf3]], <3 x float> poison, <3 x i32> zeroinitializer - ; CHECK: fmul <3 x float> %p0, %.splat - - %hlsl.normalize = call <3 x float> @llvm.dx.normalize.v3f32(<3 x float> %p0) - ret <3 x float> %hlsl.normalize -} - -define noundef <4 x float> @test_normalize_float4(<4 x float> noundef %p0) { -entry: - ; EXPCHECK: [[dotf4:%.*]] = call float @llvm.dx.dot4.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}) - ; DOPCHECK: [[dotf4:%.*]] = call float @dx.op.dot4.f32(i32 56, float %{{.*}}, float %{{.*}}, float %{{.*}}, float %{{.*}}) - ; EXPCHECK: [[rsqrt:%.*]] = call float @llvm.dx.rsqrt.f32(float [[dotf4]]) - ; DOPCHECK: [[rsqrt:%.*]] = call float @dx.op.unary.f32(i32 25, float [[dotf4]]) - ; CHECK: [[splatinsertf4:%.*]] = insertelement <4 x float> poison, float [[rsqrt]], i64 0 - ; CHECK: [[splat:%.*]] shufflevector <4 x float> [[splatinsertf4]], <4 x float> poison, <4 x i32> zeroinitializer - ; CHECK: fmul <4 x float> %p0, %.splat - - %hlsl.normalize = call <4 x float> @llvm.dx.normalize.v4f32(<4 x float> %p0) - ret <4 x float> %hlsl.normalize -} +; RUN: opt -S -dxil-intrinsic-expansion < %s | FileCheck %s --check-prefixes=CHECK,EXPCHECK +; RUN: opt -S -dxil-intrinsic-expansion -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library < %s | FileCheck %s --check-prefixes=CHECK,DOPCHECK + +; Make sure dxil operation function calls for normalize are generated for half/float. 
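+;
+; The expansion implements normalize(v) = v * rsqrt(dot(v, v)). As an
+; illustrative sketch (value names hypothetical), the <2 x float> case
+; expands to:
+;
+;   %dot   = call float @llvm.dx.dot2.v2f32(<2 x float> %v, <2 x float> %v)
+;   %rsqrt = call float @llvm.dx.rsqrt.f32(float %dot)
+;   %ins   = insertelement <2 x float> poison, float %rsqrt, i64 0
+;   %splat = shufflevector <2 x float> %ins, <2 x float> poison, <2 x i32> zeroinitializer
+;   %res   = fmul <2 x float> %v, %splat
+;
+; -dxil-op-lower then maps the dot and rsqrt intrinsics onto dx.op.dot2 /
+; dx.op.unary calls, as the DOPCHECK lines below verify.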
+ +declare half @llvm.dx.normalize.f16(half) +declare <2 x half> @llvm.dx.normalize.v2f16(<2 x half>) +declare <3 x half> @llvm.dx.normalize.v3f16(<3 x half>) +declare <4 x half> @llvm.dx.normalize.v4f16(<4 x half>) + +declare float @llvm.dx.normalize.f32(float) +declare <2 x float> @llvm.dx.normalize.v2f32(<2 x float>) +declare <3 x float> @llvm.dx.normalize.v3f32(<3 x float>) +declare <4 x float> @llvm.dx.normalize.v4f32(<4 x float>) + +define noundef half @test_normalize_half(half noundef %p0) { +entry: + ; CHECK: fdiv half %p0, %p0 + %hlsl.normalize = call half @llvm.dx.normalize.f16(half %p0) + ret half %hlsl.normalize +} + +define noundef <2 x half> @test_normalize_half2(<2 x half> noundef %p0) { +entry: + ; EXPCHECK: [[doth2:%.*]] = call half @llvm.dx.dot2.v2f16(<2 x half> %{{.*}}, <2 x half> %{{.*}}) + ; DOPCHECK: [[doth2:%.*]] = call half @dx.op.dot2.f16(i32 54, half %{{.*}}, half %{{.*}}, half %{{.*}}, half %{{.*}}) + ; EXPCHECK: [[rsqrt:%.*]] = call half @llvm.dx.rsqrt.f16(half [[doth2]]) + ; DOPCHECK: [[rsqrt:%.*]] = call half @dx.op.unary.f16(i32 25, half [[doth2]]) + ; CHECK: [[splatinserth2:%.*]] = insertelement <2 x half> poison, half [[rsqrt]], i64 0 + ; CHECK: [[splat:%.*]] = shufflevector <2 x half> [[splatinserth2]], <2 x half> poison, <2 x i32> zeroinitializer + ; CHECK: fmul <2 x half> %p0, [[splat]] + + %hlsl.normalize = call <2 x half> @llvm.dx.normalize.v2f16(<2 x half> %p0) + ret <2 x half> %hlsl.normalize +} + +define noundef <3 x half> @test_normalize_half3(<3 x half> noundef %p0) { +entry: + ; EXPCHECK: [[doth3:%.*]] = call half @llvm.dx.dot3.v3f16(<3 x half> %{{.*}}, <3 x half> %{{.*}}) + ; DOPCHECK: [[doth3:%.*]] = call half @dx.op.dot3.f16(i32 55, half %{{.*}}, half %{{.*}}, half %{{.*}}, half %{{.*}}) + ; EXPCHECK: [[rsqrt:%.*]] = call half @llvm.dx.rsqrt.f16(half [[doth3]]) + ; DOPCHECK: [[rsqrt:%.*]] = call half @dx.op.unary.f16(i32 25, half [[doth3]]) + ; CHECK: [[splatinserth3:%.*]] = insertelement <3 x half> poison, half [[rsqrt]], i64 0 + ; CHECK: [[splat:%.*]] shufflevector <3 x half> [[splatinserth3]], <3 x half> poison, <3 x i32> zeroinitializer + ; CHECK: fmul <3 x half> %p0, %.splat + + %hlsl.normalize = call <3 x half> @llvm.dx.normalize.v3f16(<3 x half> %p0) + ret <3 x half> %hlsl.normalize +} + +define noundef <4 x half> @test_normalize_half4(<4 x half> noundef %p0) { +entry: + ; EXPCHECK: [[doth4:%.*]] = call half @llvm.dx.dot4.v4f16(<4 x half> %{{.*}}, <4 x half> %{{.*}}) + ; DOPCHECK: [[doth4:%.*]] = call half @dx.op.dot4.f16(i32 56, half %{{.*}}, half %{{.*}}, half %{{.*}}, half %{{.*}}) + ; EXPCHECK: [[rsqrt:%.*]] = call half @llvm.dx.rsqrt.f16(half [[doth4]]) + ; DOPCHECK: [[rsqrt:%.*]] = call half @dx.op.unary.f16(i32 25, half [[doth4]]) + ; CHECK: [[splatinserth4:%.*]] = insertelement <4 x half> poison, half [[rsqrt]], i64 0 + ; CHECK: [[splat:%.*]] shufflevector <4 x half> [[splatinserth4]], <4 x half> poison, <4 x i32> zeroinitializer + ; CHECK: fmul <4 x half> %p0, %.splat + + %hlsl.normalize = call <4 x half> @llvm.dx.normalize.v4f16(<4 x half> %p0) + ret <4 x half> %hlsl.normalize +} + +define noundef float @test_normalize_float(float noundef %p0) { +entry: + ; CHECK: fdiv float %p0, %p0 + %hlsl.normalize = call float @llvm.dx.normalize.f32(float %p0) + ret float %hlsl.normalize +} + +define noundef <2 x float> @test_normalize_float2(<2 x float> noundef %p0) { +entry: + ; EXPCHECK: [[dotf2:%.*]] = call float @llvm.dx.dot2.v2f32(<2 x float> %{{.*}}, <2 x float> %{{.*}}) + ; DOPCHECK: [[dotf2:%.*]] = call float @dx.op.dot2.f32(i32 
54, float %{{.*}}, float %{{.*}}, float %{{.*}}, float %{{.*}}) + ; EXPCHECK: [[rsqrt:%.*]] = call float @llvm.dx.rsqrt.f32(float [[dotf2]]) + ; DOPCHECK: [[rsqrt:%.*]] = call float @dx.op.unary.f32(i32 25, float [[dotf2]]) + ; CHECK: [[splatinsertf2:%.*]] = insertelement <2 x float> poison, float [[rsqrt]], i64 0 + ; CHECK: [[splat:%.*]] shufflevector <2 x float> [[splatinsertf2]], <2 x float> poison, <2 x i32> zeroinitializer + ; CHECK: fmul <2 x float> %p0, %.splat + + %hlsl.normalize = call <2 x float> @llvm.dx.normalize.v2f32(<2 x float> %p0) + ret <2 x float> %hlsl.normalize +} + +define noundef <3 x float> @test_normalize_float3(<3 x float> noundef %p0) { +entry: + ; EXPCHECK: [[dotf3:%.*]] = call float @llvm.dx.dot3.v3f32(<3 x float> %{{.*}}, <3 x float> %{{.*}}) + ; DOPCHECK: [[dotf3:%.*]] = call float @dx.op.dot3.f32(i32 55, float %{{.*}}, float %{{.*}}, float %{{.*}}, float %{{.*}}) + ; EXPCHECK: [[rsqrt:%.*]] = call float @llvm.dx.rsqrt.f32(float [[dotf3]]) + ; DOPCHECK: [[rsqrt:%.*]] = call float @dx.op.unary.f32(i32 25, float [[dotf3]]) + ; CHECK: [[splatinsertf3:%.*]] = insertelement <3 x float> poison, float [[rsqrt]], i64 0 + ; CHECK: [[splat:%.*]] shufflevector <3 x float> [[splatinsertf3]], <3 x float> poison, <3 x i32> zeroinitializer + ; CHECK: fmul <3 x float> %p0, %.splat + + %hlsl.normalize = call <3 x float> @llvm.dx.normalize.v3f32(<3 x float> %p0) + ret <3 x float> %hlsl.normalize +} + +define noundef <4 x float> @test_normalize_float4(<4 x float> noundef %p0) { +entry: + ; EXPCHECK: [[dotf4:%.*]] = call float @llvm.dx.dot4.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}) + ; DOPCHECK: [[dotf4:%.*]] = call float @dx.op.dot4.f32(i32 56, float %{{.*}}, float %{{.*}}, float %{{.*}}, float %{{.*}}) + ; EXPCHECK: [[rsqrt:%.*]] = call float @llvm.dx.rsqrt.f32(float [[dotf4]]) + ; DOPCHECK: [[rsqrt:%.*]] = call float @dx.op.unary.f32(i32 25, float [[dotf4]]) + ; CHECK: [[splatinsertf4:%.*]] = insertelement <4 x float> poison, float [[rsqrt]], i64 0 + ; CHECK: [[splat:%.*]] shufflevector <4 x float> [[splatinsertf4]], <4 x float> poison, <4 x i32> zeroinitializer + ; CHECK: fmul <4 x float> %p0, %.splat + + %hlsl.normalize = call <4 x float> @llvm.dx.normalize.v4f32(<4 x float> %p0) + ret <4 x float> %hlsl.normalize +} diff --git a/llvm/test/CodeGen/DirectX/normalize_error.ll b/llvm/test/CodeGen/DirectX/normalize_error.ll index 35a91c0cdc24df744405b6188966678f02eed554..3041d2ecdd923a40bcf522dc75a1fd1eda4a334f 100644 --- a/llvm/test/CodeGen/DirectX/normalize_error.ll +++ b/llvm/test/CodeGen/DirectX/normalize_error.ll @@ -1,10 +1,10 @@ -; RUN: not opt -S -dxil-intrinsic-expansion -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s 2>&1 | FileCheck %s - -; DXIL operation normalize does not support double overload type -; CHECK: Cannot create Dot2 operation: Invalid overload type - -define noundef <2 x double> @test_normalize_double2(<2 x double> noundef %p0) { -entry: - %hlsl.normalize = call <2 x double> @llvm.dx.normalize.v2f32(<2 x double> %p0) - ret <2 x double> %hlsl.normalize -} +; RUN: not opt -S -dxil-intrinsic-expansion -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s 2>&1 | FileCheck %s + +; DXIL operation normalize does not support double overload type +; CHECK: Cannot create Dot2 operation: Invalid overload type + +define noundef <2 x double> @test_normalize_double2(<2 x double> noundef %p0) { +entry: + %hlsl.normalize = call <2 x double> @llvm.dx.normalize.v2f32(<2 x double> %p0) + ret <2 x double> %hlsl.normalize +} diff --git 
a/llvm/test/CodeGen/DirectX/omit-bitcast-insert.ll b/llvm/test/CodeGen/DirectX/omit-bitcast-insert.ll index 6066a0033e45dee347619f82faa12990aa1b813d..734ff1b4dd2d57e74955196cfd7bb5f12a4bd31d 100644 --- a/llvm/test/CodeGen/DirectX/omit-bitcast-insert.ll +++ b/llvm/test/CodeGen/DirectX/omit-bitcast-insert.ll @@ -1,32 +1,34 @@ ; RUN: llc --filetype=asm %s -o - | FileCheck %s target triple = "dxil-unknown-shadermodel6.7-library" -define i64 @test(ptr %p) { +define i64 @test(ptr %p) #0 { %v = load i64, ptr %p ret i64 %v } -; CHECK: define internal i64 @test(ptr %p) { +; CHECK: define i64 @test(ptr %p) #0 { ; CHECK-NEXT: %v = load i64, ptr %p, align 8 ; CHECK-NEXT: ret i64 %v -define i64 @test2(ptr %p) { +define i64 @test2(ptr %p) #0 { store i64 0, ptr %p %v = load i64, ptr %p ret i64 %v } -; CHECK: define internal i64 @test2(ptr %p) { +; CHECK: define i64 @test2(ptr %p) #0 { ; CHECK-NEXT: store i64 0, ptr %p ; CHECK-NEXT: %v = load i64, ptr %p, align 8 ; CHECK-NEXT: ret i64 %v -define i32 @test3(ptr %0) { +define i32 @test3(ptr %0) #0 { %2 = getelementptr i32, ptr %0, i32 4 %3 = load i32, ptr %2 ret i32 %3 } -; CHECK: define internal i32 @test3(ptr %0) { +attributes #0 = { convergent norecurse nounwind "hlsl.export"} + +; CHECK: define i32 @test3(ptr %0) #0 { ; CHECK-NEXT: %2 = getelementptr i32, ptr %0, i32 4 ; CHECK-NEXT: %3 = load i32, ptr %2 diff --git a/llvm/test/CodeGen/DirectX/scalar-load.ll b/llvm/test/CodeGen/DirectX/scalar-load.ll index 612e5646f5a0811278d1209b3292bf863905081a..b911a8f7855bb86be1db80f1adc51c36860ca779 100644 --- a/llvm/test/CodeGen/DirectX/scalar-load.ll +++ b/llvm/test/CodeGen/DirectX/scalar-load.ll @@ -20,7 +20,7 @@ ; CHECK-LABEL: load_array_vec_test -define <4 x i32> @load_array_vec_test() { +define <4 x i32> @load_array_vec_test() #0 { ; CHECK-COUNT-8: load i32, ptr addrspace(3) {{(.*@arrayofVecData.scalarized.*|%.*)}}, align 4 ; CHECK-NOT: load i32, ptr addrspace(3) {{.*}}, align 4 %1 = load <4 x i32>, <4 x i32> addrspace(3)* getelementptr inbounds ([2 x <4 x i32>], [2 x <4 x i32>] addrspace(3)* @"arrayofVecData", i32 0, i32 0), align 4 @@ -30,7 +30,7 @@ define <4 x i32> @load_array_vec_test() { } ; CHECK-LABEL: load_vec_test -define <4 x i32> @load_vec_test() { +define <4 x i32> @load_vec_test() #0 { ; CHECK-COUNT-4: load i32, ptr addrspace(3) {{(@vecData.scalarized|getelementptr \(i32, ptr addrspace\(3\) @vecData.scalarized, i32 .*\)|%.*)}}, align {{.*}} ; CHECK-NOT: load i32, ptr addrspace(3) {{.*}}, align 4 %1 = load <4 x i32>, <4 x i32> addrspace(3)* @"vecData", align 4 @@ -38,7 +38,7 @@ define <4 x i32> @load_vec_test() { } ; CHECK-LABEL: load_static_array_of_vec_test -define <4 x i32> @load_static_array_of_vec_test(i32 %index) { +define <4 x i32> @load_static_array_of_vec_test(i32 %index) #0 { ; CHECK: getelementptr [3 x [4 x i32]], ptr @staticArrayOfVecData.scalarized, i32 0, i32 %index ; CHECK-COUNT-4: load i32, ptr {{.*}}, align 4 ; CHECK-NOT: load i32, ptr {{.*}}, align 4 @@ -48,7 +48,7 @@ define <4 x i32> @load_static_array_of_vec_test(i32 %index) { } ; CHECK-LABEL: multid_load_test -define <4 x i32> @multid_load_test() { +define <4 x i32> @multid_load_test() #0 { ; CHECK-COUNT-8: load i32, ptr addrspace(3) {{(.*@groushared2dArrayofVectors.scalarized.*|%.*)}}, align 4 ; CHECK-NOT: load i32, ptr addrspace(3) {{.*}}, align 4 %1 = load <4 x i32>, <4 x i32> addrspace(3)* getelementptr inbounds ([3 x [3 x <4 x i32>]], [3 x [3 x <4 x i32>]] addrspace(3)* @"groushared2dArrayofVectors", i32 0, i32 0, i32 0), align 4 @@ -56,3 +56,5 @@ define <4 x i32> 
@multid_load_test() { %3 = add <4 x i32> %1, %2 ret <4 x i32> %3 } + +attributes #0 = { convergent norecurse nounwind "hlsl.export"} diff --git a/llvm/test/CodeGen/DirectX/scalar-store.ll b/llvm/test/CodeGen/DirectX/scalar-store.ll index 7734d32bca58cd5a1df9cba980c28ffac1756b5b..c45481e8cae14f2819579f1433f2d9951bf99ade 100644 --- a/llvm/test/CodeGen/DirectX/scalar-store.ll +++ b/llvm/test/CodeGen/DirectX/scalar-store.ll @@ -12,7 +12,7 @@ ; CHECK-NOT: @vecData ; CHECK-LABEL: store_array_vec_test -define void @store_array_vec_test () local_unnamed_addr { +define void @store_array_vec_test () local_unnamed_addr #0 { ; CHECK-COUNT-6: store float {{1|2|3|4|6}}.000000e+00, ptr addrspace(3) {{(.*@arrayofVecData.scalarized.*|%.*)}}, align {{4|8|16}} ; CHECK-NOT: store float {{1|2|3|4|6}}.000000e+00, ptr addrspace(3) {{(.*@arrayofVecData.scalarized.*|%.*)}}, align {{4|8|16}} store <3 x float> , ptr addrspace(3) @"arrayofVecData", align 16 @@ -21,9 +21,11 @@ define void @store_array_vec_test () local_unnamed_addr { } ; CHECK-LABEL: store_vec_test -define void @store_vec_test(<4 x i32> %inputVec) { +define void @store_vec_test(<4 x i32> %inputVec) #0 { ; CHECK-COUNT-4: store i32 %inputVec.{{.*}}, ptr addrspace(3) {{(@vecData.scalarized|getelementptr \(i32, ptr addrspace\(3\) @vecData.scalarized, i32 .*\)|%.*)}}, align 4 ; CHECK-NOT: store i32 %inputVec.{{.*}}, ptr addrspace(3) store <4 x i32> %inputVec, <4 x i32> addrspace(3)* @"vecData", align 4 ret void } + +attributes #0 = { convergent norecurse nounwind "hlsl.export"} diff --git a/llvm/test/CodeGen/DirectX/scalarize-two-calls.ll b/llvm/test/CodeGen/DirectX/scalarize-two-calls.ll index a14c1de5cc4205722555810be119fc1a2c673d5f..0546a5505416f1e80a1ddc55426497e67ab76078 100644 --- a/llvm/test/CodeGen/DirectX/scalarize-two-calls.ll +++ b/llvm/test/CodeGen/DirectX/scalarize-two-calls.ll @@ -2,7 +2,7 @@ ; CHECK: target triple = "dxilv1.3-pc-shadermodel6.3-library" ; CHECK-LABEL: cos_sin_float_test -define noundef <4 x float> @cos_sin_float_test(<4 x float> noundef %a) { +define noundef <4 x float> @cos_sin_float_test(<4 x float> noundef %a) #0 { ; CHECK: [[ee0:%.*]] = extractelement <4 x float> %a, i64 0 ; CHECK: [[ie0:%.*]] = call float @dx.op.unary.f32(i32 13, float [[ee0]]) ; CHECK: [[ee1:%.*]] = extractelement <4 x float> %a, i64 1 @@ -23,3 +23,5 @@ define noundef <4 x float> @cos_sin_float_test(<4 x float> noundef %a) { %3 = tail call <4 x float> @llvm.cos.v4f32(<4 x float> %2) ret <4 x float> %3 } + +attributes #0 = { convergent norecurse nounwind "hlsl.export"} diff --git a/llvm/test/CodeGen/DirectX/step.ll b/llvm/test/CodeGen/DirectX/step.ll index 1c9894026c62ec71a7d35f849902a4cecb8ba8c4..6a9b5bf71da8993c792877bb415ae601ae6911f9 100644 --- a/llvm/test/CodeGen/DirectX/step.ll +++ b/llvm/test/CodeGen/DirectX/step.ll @@ -1,78 +1,78 @@ -; RUN: opt -S -dxil-intrinsic-expansion < %s | FileCheck %s --check-prefix=CHECK -; RUN: opt -S -dxil-intrinsic-expansion -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library < %s | FileCheck %s --check-prefix=CHECK - -; Make sure dxil operation function calls for step are generated for half/float. 
- -declare half @llvm.dx.step.f16(half, half) -declare <2 x half> @llvm.dx.step.v2f16(<2 x half>, <2 x half>) -declare <3 x half> @llvm.dx.step.v3f16(<3 x half>, <3 x half>) -declare <4 x half> @llvm.dx.step.v4f16(<4 x half>, <4 x half>) - -declare float @llvm.dx.step.f32(float, float) -declare <2 x float> @llvm.dx.step.v2f32(<2 x float>, <2 x float>) -declare <3 x float> @llvm.dx.step.v3f32(<3 x float>, <3 x float>) -declare <4 x float> @llvm.dx.step.v4f32(<4 x float>, <4 x float>) - -define noundef half @test_step_half(half noundef %p0, half noundef %p1) { -entry: - ; CHECK: %0 = fcmp olt half %p1, %p0 - ; CHECK: %1 = select i1 %0, half 0xH0000, half 0xH3C00 - %hlsl.step = call half @llvm.dx.step.f16(half %p0, half %p1) - ret half %hlsl.step -} - -define noundef <2 x half> @test_step_half2(<2 x half> noundef %p0, <2 x half> noundef %p1) { -entry: - ; CHECK: %0 = fcmp olt <2 x half> %p1, %p0 - ; CHECK: %1 = select <2 x i1> %0, <2 x half> zeroinitializer, <2 x half> - %hlsl.step = call <2 x half> @llvm.dx.step.v2f16(<2 x half> %p0, <2 x half> %p1) - ret <2 x half> %hlsl.step -} - -define noundef <3 x half> @test_step_half3(<3 x half> noundef %p0, <3 x half> noundef %p1) { -entry: - ; CHECK: %0 = fcmp olt <3 x half> %p1, %p0 - ; CHECK: %1 = select <3 x i1> %0, <3 x half> zeroinitializer, <3 x half> - %hlsl.step = call <3 x half> @llvm.dx.step.v3f16(<3 x half> %p0, <3 x half> %p1) - ret <3 x half> %hlsl.step -} - -define noundef <4 x half> @test_step_half4(<4 x half> noundef %p0, <4 x half> noundef %p1) { -entry: - ; CHECK: %0 = fcmp olt <4 x half> %p1, %p0 - ; CHECK: %1 = select <4 x i1> %0, <4 x half> zeroinitializer, <4 x half> - %hlsl.step = call <4 x half> @llvm.dx.step.v4f16(<4 x half> %p0, <4 x half> %p1) - ret <4 x half> %hlsl.step -} - -define noundef float @test_step_float(float noundef %p0, float noundef %p1) { -entry: - ; CHECK: %0 = fcmp olt float %p1, %p0 - ; CHECK: %1 = select i1 %0, float 0.000000e+00, float 1.000000e+00 - %hlsl.step = call float @llvm.dx.step.f32(float %p0, float %p1) - ret float %hlsl.step -} - -define noundef <2 x float> @test_step_float2(<2 x float> noundef %p0, <2 x float> noundef %p1) { -entry: - ; CHECK: %0 = fcmp olt <2 x float> %p1, %p0 - ; CHECK: %1 = select <2 x i1> %0, <2 x float> zeroinitializer, <2 x float> - %hlsl.step = call <2 x float> @llvm.dx.step.v2f32(<2 x float> %p0, <2 x float> %p1) - ret <2 x float> %hlsl.step -} - -define noundef <3 x float> @test_step_float3(<3 x float> noundef %p0, <3 x float> noundef %p1) { -entry: - ; CHECK: %0 = fcmp olt <3 x float> %p1, %p0 - ; CHECK: %1 = select <3 x i1> %0, <3 x float> zeroinitializer, <3 x float> - %hlsl.step = call <3 x float> @llvm.dx.step.v3f32(<3 x float> %p0, <3 x float> %p1) - ret <3 x float> %hlsl.step -} - -define noundef <4 x float> @test_step_float4(<4 x float> noundef %p0, <4 x float> noundef %p1) { -entry: - ; CHECK: %0 = fcmp olt <4 x float> %p1, %p0 - ; CHECK: %1 = select <4 x i1> %0, <4 x float> zeroinitializer, <4 x float> - %hlsl.step = call <4 x float> @llvm.dx.step.v4f32(<4 x float> %p0, <4 x float> %p1) - ret <4 x float> %hlsl.step -} +; RUN: opt -S -dxil-intrinsic-expansion < %s | FileCheck %s --check-prefix=CHECK +; RUN: opt -S -dxil-intrinsic-expansion -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library < %s | FileCheck %s --check-prefix=CHECK + +; Make sure dxil operation function calls for step are generated for half/float. 
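+;
+; step(x, y) is 0.0 where y < x and 1.0 otherwise, so the expansion is just
+; a compare plus select. Illustrative sketch for the scalar float overload:
+;
+;   %cmp = fcmp olt float %y, %x
+;   %r   = select i1 %cmp, float 0.000000e+00, float 1.000000e+00
+;
+; The vector overloads follow the same pattern with vector compares and
+; splat constants of 0.0/1.0, as checked below.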
+ +declare half @llvm.dx.step.f16(half, half) +declare <2 x half> @llvm.dx.step.v2f16(<2 x half>, <2 x half>) +declare <3 x half> @llvm.dx.step.v3f16(<3 x half>, <3 x half>) +declare <4 x half> @llvm.dx.step.v4f16(<4 x half>, <4 x half>) + +declare float @llvm.dx.step.f32(float, float) +declare <2 x float> @llvm.dx.step.v2f32(<2 x float>, <2 x float>) +declare <3 x float> @llvm.dx.step.v3f32(<3 x float>, <3 x float>) +declare <4 x float> @llvm.dx.step.v4f32(<4 x float>, <4 x float>) + +define noundef half @test_step_half(half noundef %p0, half noundef %p1) { +entry: + ; CHECK: %0 = fcmp olt half %p1, %p0 + ; CHECK: %1 = select i1 %0, half 0xH0000, half 0xH3C00 + %hlsl.step = call half @llvm.dx.step.f16(half %p0, half %p1) + ret half %hlsl.step +} + +define noundef <2 x half> @test_step_half2(<2 x half> noundef %p0, <2 x half> noundef %p1) { +entry: + ; CHECK: %0 = fcmp olt <2 x half> %p1, %p0 + ; CHECK: %1 = select <2 x i1> %0, <2 x half> zeroinitializer, <2 x half> + %hlsl.step = call <2 x half> @llvm.dx.step.v2f16(<2 x half> %p0, <2 x half> %p1) + ret <2 x half> %hlsl.step +} + +define noundef <3 x half> @test_step_half3(<3 x half> noundef %p0, <3 x half> noundef %p1) { +entry: + ; CHECK: %0 = fcmp olt <3 x half> %p1, %p0 + ; CHECK: %1 = select <3 x i1> %0, <3 x half> zeroinitializer, <3 x half> + %hlsl.step = call <3 x half> @llvm.dx.step.v3f16(<3 x half> %p0, <3 x half> %p1) + ret <3 x half> %hlsl.step +} + +define noundef <4 x half> @test_step_half4(<4 x half> noundef %p0, <4 x half> noundef %p1) { +entry: + ; CHECK: %0 = fcmp olt <4 x half> %p1, %p0 + ; CHECK: %1 = select <4 x i1> %0, <4 x half> zeroinitializer, <4 x half> + %hlsl.step = call <4 x half> @llvm.dx.step.v4f16(<4 x half> %p0, <4 x half> %p1) + ret <4 x half> %hlsl.step +} + +define noundef float @test_step_float(float noundef %p0, float noundef %p1) { +entry: + ; CHECK: %0 = fcmp olt float %p1, %p0 + ; CHECK: %1 = select i1 %0, float 0.000000e+00, float 1.000000e+00 + %hlsl.step = call float @llvm.dx.step.f32(float %p0, float %p1) + ret float %hlsl.step +} + +define noundef <2 x float> @test_step_float2(<2 x float> noundef %p0, <2 x float> noundef %p1) { +entry: + ; CHECK: %0 = fcmp olt <2 x float> %p1, %p0 + ; CHECK: %1 = select <2 x i1> %0, <2 x float> zeroinitializer, <2 x float> + %hlsl.step = call <2 x float> @llvm.dx.step.v2f32(<2 x float> %p0, <2 x float> %p1) + ret <2 x float> %hlsl.step +} + +define noundef <3 x float> @test_step_float3(<3 x float> noundef %p0, <3 x float> noundef %p1) { +entry: + ; CHECK: %0 = fcmp olt <3 x float> %p1, %p0 + ; CHECK: %1 = select <3 x i1> %0, <3 x float> zeroinitializer, <3 x float> + %hlsl.step = call <3 x float> @llvm.dx.step.v3f32(<3 x float> %p0, <3 x float> %p1) + ret <3 x float> %hlsl.step +} + +define noundef <4 x float> @test_step_float4(<4 x float> noundef %p0, <4 x float> noundef %p1) { +entry: + ; CHECK: %0 = fcmp olt <4 x float> %p1, %p0 + ; CHECK: %1 = select <4 x i1> %0, <4 x float> zeroinitializer, <4 x float> + %hlsl.step = call <4 x float> @llvm.dx.step.v4f32(<4 x float> %p0, <4 x float> %p1) + ret <4 x float> %hlsl.step +} diff --git a/llvm/test/CodeGen/DirectX/strip-fn-attrs.ll b/llvm/test/CodeGen/DirectX/strip-fn-attrs.ll index b0dd89cf90f2b2fc869c7318891b8319f1c9d88a..4223c9094319453fd525f773d6a17fb3076b0f8d 100644 --- a/llvm/test/CodeGen/DirectX/strip-fn-attrs.ll +++ b/llvm/test/CodeGen/DirectX/strip-fn-attrs.ll @@ -12,4 +12,4 @@ define dso_local float @fma(float %0, float %1, float %2) local_unnamed_addr #0 ; CHECK: attributes #0 = { nounwind memory(none) 
} ; CHECK-NOT: attributes # -attributes #0 = { norecurse nounwind readnone willreturn } +attributes #0 = { norecurse nounwind readnone willreturn "hlsl.export"} diff --git a/llvm/test/CodeGen/Hexagon/naked-fn-with-frame-pointer.ll b/llvm/test/CodeGen/Hexagon/naked-fn-with-frame-pointer.ll new file mode 100644 index 0000000000000000000000000000000000000000..c53f2d4df9b62c21a1da30f2f6f677329cbb9528 --- /dev/null +++ b/llvm/test/CodeGen/Hexagon/naked-fn-with-frame-pointer.ll @@ -0,0 +1,30 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple hexagon | FileCheck %s -check-prefixes=CHECK + +declare dso_local void @main() + +define dso_local void @naked() naked "frame-pointer"="all" { +; CHECK-LABEL: naked: +; CHECK: .cfi_startproc +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: { +; CHECK-NEXT: call main +; CHECK-NEXT: } + call void @main() + unreachable +} + +define dso_local void @normal() "frame-pointer"="all" { +; CHECK-LABEL: normal: +; CHECK: .cfi_startproc +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: .cfi_def_cfa r30, 8 +; CHECK-NEXT: .cfi_offset r31, -4 +; CHECK-NEXT: .cfi_offset r30, -8 +; CHECK-NEXT: { +; CHECK-NEXT: call main +; CHECK-NEXT: allocframe(r29,#0):raw +; CHECK-NEXT: } + call void @main() + unreachable +} diff --git a/llvm/test/CodeGen/Lanai/naked-fn-with-frame-pointer.ll b/llvm/test/CodeGen/Lanai/naked-fn-with-frame-pointer.ll new file mode 100644 index 0000000000000000000000000000000000000000..4e148764e478b794682686599bb05017a5327643 --- /dev/null +++ b/llvm/test/CodeGen/Lanai/naked-fn-with-frame-pointer.ll @@ -0,0 +1,35 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple lanai | FileCheck %s -check-prefixes=CHECK + +declare dso_local void @main() + +define dso_local void @naked() naked "frame-pointer"="all" { +; CHECK-LABEL: naked: +; CHECK: .Lnaked$local: +; CHECK-NEXT: .type .Lnaked$local,@function +; CHECK-NEXT: .cfi_startproc +; CHECK-NEXT: ! %bb.0: +; CHECK-NEXT: add %pc, 0x10, %rca +; CHECK-NEXT: st %rca, [--%sp] +; CHECK-NEXT: bt main +; CHECK-NEXT: nop + call void @main() + unreachable +} + +define dso_local void @normal() "frame-pointer"="all" { +; CHECK-LABEL: normal: +; CHECK: .Lnormal$local: +; CHECK-NEXT: .type .Lnormal$local,@function +; CHECK-NEXT: .cfi_startproc +; CHECK-NEXT: ! 
%bb.0: +; CHECK-NEXT: st %fp, [--%sp] +; CHECK-NEXT: add %sp, 0x8, %fp +; CHECK-NEXT: sub %sp, 0x8, %sp +; CHECK-NEXT: add %pc, 0x10, %rca +; CHECK-NEXT: st %rca, [--%sp] +; CHECK-NEXT: bt main +; CHECK-NEXT: nop + call void @main() + unreachable +} diff --git a/llvm/test/CodeGen/LoongArch/annotate-tablejump.ll b/llvm/test/CodeGen/LoongArch/annotate-tablejump.ll new file mode 100644 index 0000000000000000000000000000000000000000..a8c660609d9904e493864410ea9a3665518e1117 --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/annotate-tablejump.ll @@ -0,0 +1,133 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc --mtriple=loongarch32 -mattr=+d \ +; RUN: --min-jump-table-entries=4 < %s \ +; RUN: --loongarch-annotate-tablejump \ +; RUN: | FileCheck %s --check-prefix=LA32-JT +; RUN: llc --mtriple=loongarch64 -mattr=+d \ +; RUN: --min-jump-table-entries=4 < %s \ +; RUN: --loongarch-annotate-tablejump \ +; RUN: | FileCheck %s --check-prefix=LA64-JT + +define void @switch_4_arms(i32 %in, ptr %out) nounwind { +; LA32-JT-LABEL: switch_4_arms: +; LA32-JT: # %bb.0: # %entry +; LA32-JT-NEXT: addi.w $a3, $a0, -1 +; LA32-JT-NEXT: ori $a2, $zero, 3 +; LA32-JT-NEXT: bltu $a2, $a3, .LBB0_7 +; LA32-JT-NEXT: # %bb.1: # %entry +; LA32-JT-NEXT: pcalau12i $a4, %pc_hi20(.LJTI0_0) +; LA32-JT-NEXT: addi.w $a4, $a4, %pc_lo12(.LJTI0_0) +; LA32-JT-NEXT: alsl.w $a3, $a3, $a4, 2 +; LA32-JT-NEXT: ld.w $a3, $a3, 0 +; LA32-JT-NEXT: .Ljrtb_0: +; LA32-JT-NEXT: jr $a3 +; LA32-JT-NEXT: .LBB0_2: # %bb1 +; LA32-JT-NEXT: ori $a3, $zero, 4 +; LA32-JT-NEXT: b .LBB0_6 +; LA32-JT-NEXT: .LBB0_3: # %bb2 +; LA32-JT-NEXT: ori $a3, $zero, 3 +; LA32-JT-NEXT: b .LBB0_6 +; LA32-JT-NEXT: .LBB0_4: # %bb3 +; LA32-JT-NEXT: ori $a3, $zero, 2 +; LA32-JT-NEXT: b .LBB0_6 +; LA32-JT-NEXT: .LBB0_5: # %bb4 +; LA32-JT-NEXT: ori $a3, $zero, 1 +; LA32-JT-NEXT: .LBB0_6: # %exit +; LA32-JT-NEXT: st.w $a3, $a1, 0 +; LA32-JT-NEXT: .LBB0_7: # %exit +; LA32-JT-NEXT: addi.w $a3, $a0, -5 +; LA32-JT-NEXT: bltu $a2, $a3, .LBB0_9 +; LA32-JT-NEXT: # %bb.8: # %exit +; LA32-JT-NEXT: pcalau12i $a4, %pc_hi20(.LJTI0_1) +; LA32-JT-NEXT: addi.w $a4, $a4, %pc_lo12(.LJTI0_1) +; LA32-JT-NEXT: alsl.w $a3, $a3, $a4, 2 +; LA32-JT-NEXT: ld.w $a3, $a3, 0 +; LA32-JT-NEXT: .Ljrtb_1: +; LA32-JT-NEXT: jr $a3 +; LA32-JT-NEXT: .LBB0_9: # %exit2 +; LA32-JT-NEXT: ret +; +; LA64-JT-LABEL: switch_4_arms: +; LA64-JT: # %bb.0: # %entry +; LA64-JT-NEXT: addi.w $a0, $a0, 0 +; LA64-JT-NEXT: addi.d $a3, $a0, -1 +; LA64-JT-NEXT: ori $a2, $zero, 3 +; LA64-JT-NEXT: bltu $a2, $a3, .LBB0_7 +; LA64-JT-NEXT: # %bb.1: # %entry +; LA64-JT-NEXT: slli.d $a3, $a3, 3 +; LA64-JT-NEXT: pcalau12i $a4, %pc_hi20(.LJTI0_0) +; LA64-JT-NEXT: addi.d $a4, $a4, %pc_lo12(.LJTI0_0) +; LA64-JT-NEXT: ldx.d $a3, $a3, $a4 +; LA64-JT-NEXT: .Ljrtb_0: +; LA64-JT-NEXT: jr $a3 +; LA64-JT-NEXT: .LBB0_2: # %bb1 +; LA64-JT-NEXT: ori $a3, $zero, 4 +; LA64-JT-NEXT: b .LBB0_6 +; LA64-JT-NEXT: .LBB0_3: # %bb2 +; LA64-JT-NEXT: ori $a3, $zero, 3 +; LA64-JT-NEXT: b .LBB0_6 +; LA64-JT-NEXT: .LBB0_4: # %bb3 +; LA64-JT-NEXT: ori $a3, $zero, 2 +; LA64-JT-NEXT: b .LBB0_6 +; LA64-JT-NEXT: .LBB0_5: # %bb4 +; LA64-JT-NEXT: ori $a3, $zero, 1 +; LA64-JT-NEXT: .LBB0_6: # %exit +; LA64-JT-NEXT: st.w $a3, $a1, 0 +; LA64-JT-NEXT: .LBB0_7: # %exit +; LA64-JT-NEXT: addi.d $a3, $a0, -5 +; LA64-JT-NEXT: bltu $a2, $a3, .LBB0_9 +; LA64-JT-NEXT: # %bb.8: # %exit +; LA64-JT-NEXT: slli.d $a3, $a3, 3 +; LA64-JT-NEXT: pcalau12i $a4, %pc_hi20(.LJTI0_1) +; LA64-JT-NEXT: addi.d $a4, $a4, %pc_lo12(.LJTI0_1) +; 
LA64-JT-NEXT: ldx.d $a3, $a3, $a4 +; LA64-JT-NEXT: .Ljrtb_1: +; LA64-JT-NEXT: jr $a3 +; LA64-JT-NEXT: .LBB0_9: # %exit2 +; LA64-JT-NEXT: ret +entry: + switch i32 %in, label %exit [ + i32 1, label %bb1 + i32 2, label %bb2 + i32 3, label %bb3 + i32 4, label %bb4 + ] +bb1: + store i32 4, ptr %out + br label %exit +bb2: + store i32 3, ptr %out + br label %exit +bb3: + store i32 2, ptr %out + br label %exit +bb4: + store i32 1, ptr %out + br label %exit +exit: + switch i32 %in, label %exit2 [ + i32 5, label %bb1 + i32 6, label %bb2 + i32 7, label %bb3 + i32 8, label %bb4 + ] +exit2: + ret void +} + +; UTC_ARGS: --disable + +; LA32-JT-LABEL: .LJTI0_0: +; LA32-JT: .section .discard.tablejump_annotate,"",@progbits +; LA32-JT-NEXT: .word .Ljrtb_0 +; LA32-JT-NEXT: .word .LJTI0_0 +; LA32-JT-NEXT: .word .Ljrtb_1 +; LA32-JT-NEXT: .word .LJTI0_1 + +; UTC_ARGS: --disable +; LA64-JT-LABEL: .LJTI0_0: +; LA64-JT: .section .discard.tablejump_annotate,"",@progbits +; LA64-JT-NEXT: .dword .Ljrtb_0 +; LA64-JT-NEXT: .dword .LJTI0_0 +; LA64-JT-NEXT: .dword .Ljrtb_1 +; LA64-JT-NEXT: .dword .LJTI0_1 diff --git a/llvm/test/CodeGen/LoongArch/fdiv-reciprocal-estimate.ll b/llvm/test/CodeGen/LoongArch/fdiv-reciprocal-estimate.ll new file mode 100644 index 0000000000000000000000000000000000000000..3f38bbed881a3270558e1a972bafaf561f9520e9 --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/fdiv-reciprocal-estimate.ll @@ -0,0 +1,80 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc --mtriple=loongarch32 --mattr=+f,-d,-frecipe < %s | FileCheck %s --check-prefix=LA32F +; RUN: llc --mtriple=loongarch32 --mattr=+f,-d,+frecipe < %s | FileCheck %s --check-prefix=LA32F-FRECIPE +; RUN: llc --mtriple=loongarch64 --mattr=+d,-frecipe < %s | FileCheck %s --check-prefix=LA64D +; RUN: llc --mtriple=loongarch64 --mattr=+d,+frecipe < %s | FileCheck %s --check-prefix=LA64D-FRECIPE + +;; Exercise the 'fdiv' LLVM IR: https://llvm.org/docs/LangRef.html#fdiv-instruction + +define float @fdiv_s(float %x, float %y) { +; LA32F-LABEL: fdiv_s: +; LA32F: # %bb.0: +; LA32F-NEXT: fdiv.s $fa0, $fa0, $fa1 +; LA32F-NEXT: ret +; +; LA32F-FRECIPE-LABEL: fdiv_s: +; LA32F-FRECIPE: # %bb.0: +; LA32F-FRECIPE-NEXT: frecipe.s $fa2, $fa1 +; LA32F-FRECIPE-NEXT: fmul.s $fa3, $fa0, $fa2 +; LA32F-FRECIPE-NEXT: fnmsub.s $fa0, $fa1, $fa3, $fa0 +; LA32F-FRECIPE-NEXT: fmadd.s $fa0, $fa2, $fa0, $fa3 +; LA32F-FRECIPE-NEXT: ret +; +; LA64D-LABEL: fdiv_s: +; LA64D: # %bb.0: +; LA64D-NEXT: fdiv.s $fa0, $fa0, $fa1 +; LA64D-NEXT: ret +; +; LA64D-FRECIPE-LABEL: fdiv_s: +; LA64D-FRECIPE: # %bb.0: +; LA64D-FRECIPE-NEXT: frecipe.s $fa2, $fa1 +; LA64D-FRECIPE-NEXT: fmul.s $fa3, $fa0, $fa2 +; LA64D-FRECIPE-NEXT: fnmsub.s $fa0, $fa1, $fa3, $fa0 +; LA64D-FRECIPE-NEXT: fmadd.s $fa0, $fa2, $fa0, $fa3 +; LA64D-FRECIPE-NEXT: ret + %div = fdiv fast float %x, %y + ret float %div +} + +define double @fdiv_d(double %x, double %y) { +; LA32F-LABEL: fdiv_d: +; LA32F: # %bb.0: +; LA32F-NEXT: addi.w $sp, $sp, -16 +; LA32F-NEXT: .cfi_def_cfa_offset 16 +; LA32F-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32F-NEXT: .cfi_offset 1, -4 +; LA32F-NEXT: bl %plt(__divdf3) +; LA32F-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32F-NEXT: addi.w $sp, $sp, 16 +; LA32F-NEXT: ret +; +; LA32F-FRECIPE-LABEL: fdiv_d: +; LA32F-FRECIPE: # %bb.0: +; LA32F-FRECIPE-NEXT: addi.w $sp, $sp, -16 +; LA32F-FRECIPE-NEXT: .cfi_def_cfa_offset 16 +; LA32F-FRECIPE-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32F-FRECIPE-NEXT: .cfi_offset 1, -4 +; LA32F-FRECIPE-NEXT: bl 
%plt(__divdf3) +; LA32F-FRECIPE-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32F-FRECIPE-NEXT: addi.w $sp, $sp, 16 +; LA32F-FRECIPE-NEXT: ret +; +; LA64D-LABEL: fdiv_d: +; LA64D: # %bb.0: +; LA64D-NEXT: fdiv.d $fa0, $fa0, $fa1 +; LA64D-NEXT: ret +; +; LA64D-FRECIPE-LABEL: fdiv_d: +; LA64D-FRECIPE: # %bb.0: +; LA64D-FRECIPE-NEXT: pcalau12i $a0, %pc_hi20(.LCPI1_0) +; LA64D-FRECIPE-NEXT: fld.d $fa2, $a0, %pc_lo12(.LCPI1_0) +; LA64D-FRECIPE-NEXT: frecipe.d $fa3, $fa1 +; LA64D-FRECIPE-NEXT: fmadd.d $fa2, $fa1, $fa3, $fa2 +; LA64D-FRECIPE-NEXT: fnmsub.d $fa2, $fa2, $fa3, $fa3 +; LA64D-FRECIPE-NEXT: fmul.d $fa3, $fa0, $fa2 +; LA64D-FRECIPE-NEXT: fnmsub.d $fa0, $fa1, $fa3, $fa0 +; LA64D-FRECIPE-NEXT: fmadd.d $fa0, $fa2, $fa0, $fa3 +; LA64D-FRECIPE-NEXT: ret + %div = fdiv fast double %x, %y + ret double %div +} diff --git a/llvm/test/CodeGen/LoongArch/fsqrt-reciprocal-estimate.ll b/llvm/test/CodeGen/LoongArch/fsqrt-reciprocal-estimate.ll new file mode 100644 index 0000000000000000000000000000000000000000..388ae6321f664a7c95854586509d63c35a2be4ec --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/fsqrt-reciprocal-estimate.ll @@ -0,0 +1,797 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc --mtriple=loongarch32 --mattr=+f,-d,-frecipe < %s | FileCheck %s --check-prefix=LA32F +; RUN: llc --mtriple=loongarch32 --mattr=+f,-d,+frecipe < %s | FileCheck %s --check-prefix=LA32F-FRECIPE +; RUN: llc --mtriple=loongarch64 --mattr=+d,-frecipe < %s | FileCheck %s --check-prefix=LA64D +; RUN: llc --mtriple=loongarch64 --mattr=+d,+frecipe < %s | FileCheck %s --check-prefix=LA64D-FRECIPE + + +declare float @llvm.sqrt.f32(float) +declare double @llvm.sqrt.f64(double) + +define float @frsqrt_f32(float %a) nounwind { +; LA32F-LABEL: frsqrt_f32: +; LA32F: # %bb.0: +; LA32F-NEXT: frsqrt.s $fa0, $fa0 +; LA32F-NEXT: ret +; +; LA32F-FRECIPE-LABEL: frsqrt_f32: +; LA32F-FRECIPE: # %bb.0: +; LA32F-FRECIPE-NEXT: frsqrte.s $fa1, $fa0 +; LA32F-FRECIPE-NEXT: pcalau12i $a0, %pc_hi20(.LCPI0_0) +; LA32F-FRECIPE-NEXT: fld.s $fa2, $a0, %pc_lo12(.LCPI0_0) +; LA32F-FRECIPE-NEXT: pcalau12i $a0, %pc_hi20(.LCPI0_1) +; LA32F-FRECIPE-NEXT: fld.s $fa3, $a0, %pc_lo12(.LCPI0_1) +; LA32F-FRECIPE-NEXT: fmul.s $fa1, $fa0, $fa1 +; LA32F-FRECIPE-NEXT: fmul.s $fa0, $fa0, $fa1 +; LA32F-FRECIPE-NEXT: fmadd.s $fa0, $fa0, $fa1, $fa2 +; LA32F-FRECIPE-NEXT: fmul.s $fa1, $fa1, $fa3 +; LA32F-FRECIPE-NEXT: fmul.s $fa0, $fa1, $fa0 +; LA32F-FRECIPE-NEXT: ret +; +; LA64D-LABEL: frsqrt_f32: +; LA64D: # %bb.0: +; LA64D-NEXT: frsqrt.s $fa0, $fa0 +; LA64D-NEXT: ret +; +; LA64D-FRECIPE-LABEL: frsqrt_f32: +; LA64D-FRECIPE: # %bb.0: +; LA64D-FRECIPE-NEXT: frsqrte.s $fa1, $fa0 +; LA64D-FRECIPE-NEXT: pcalau12i $a0, %pc_hi20(.LCPI0_0) +; LA64D-FRECIPE-NEXT: fld.s $fa2, $a0, %pc_lo12(.LCPI0_0) +; LA64D-FRECIPE-NEXT: pcalau12i $a0, %pc_hi20(.LCPI0_1) +; LA64D-FRECIPE-NEXT: fld.s $fa3, $a0, %pc_lo12(.LCPI0_1) +; LA64D-FRECIPE-NEXT: fmul.s $fa1, $fa0, $fa1 +; LA64D-FRECIPE-NEXT: fmul.s $fa0, $fa0, $fa1 +; LA64D-FRECIPE-NEXT: fmadd.s $fa0, $fa0, $fa1, $fa2 +; LA64D-FRECIPE-NEXT: fmul.s $fa1, $fa1, $fa3 +; LA64D-FRECIPE-NEXT: fmul.s $fa0, $fa1, $fa0 +; LA64D-FRECIPE-NEXT: ret + + %1 = call fast float @llvm.sqrt.f32(float %a) + %2 = fdiv fast float 1.0, %1 + ret float %2 +} + +define double @frsqrt_f64(double %a) nounwind { +; LA32F-LABEL: frsqrt_f64: +; LA32F: # %bb.0: +; LA32F-NEXT: addi.w $sp, $sp, -16 +; LA32F-NEXT: st.w $ra, $sp, 12 +; LA32F-NEXT: bl %plt(sqrt) +; LA32F-NEXT: move $a2, $a0 +; LA32F-NEXT: move $a3, $a1 +; 
LA32F-NEXT: lu12i.w $a1, 261888 +; LA32F-NEXT: move $a0, $zero +; LA32F-NEXT: bl %plt(__divdf3) +; LA32F-NEXT: ld.w $ra, $sp, 12 +; LA32F-NEXT: addi.w $sp, $sp, 16 +; LA32F-NEXT: ret +; +; LA32F-FRECIPE-LABEL: frsqrt_f64: +; LA32F-FRECIPE: # %bb.0: +; LA32F-FRECIPE-NEXT: addi.w $sp, $sp, -16 +; LA32F-FRECIPE-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32F-FRECIPE-NEXT: bl %plt(sqrt) +; LA32F-FRECIPE-NEXT: move $a2, $a0 +; LA32F-FRECIPE-NEXT: move $a3, $a1 +; LA32F-FRECIPE-NEXT: lu12i.w $a1, 261888 +; LA32F-FRECIPE-NEXT: move $a0, $zero +; LA32F-FRECIPE-NEXT: bl %plt(__divdf3) +; LA32F-FRECIPE-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32F-FRECIPE-NEXT: addi.w $sp, $sp, 16 +; LA32F-FRECIPE-NEXT: ret +; +; LA64D-LABEL: frsqrt_f64: +; LA64D: # %bb.0: +; LA64D-NEXT: frsqrt.d $fa0, $fa0 +; LA64D-NEXT: ret +; +; LA64D-FRECIPE-LABEL: frsqrt_f64: +; LA64D-FRECIPE: # %bb.0: +; LA64D-FRECIPE-NEXT: frsqrte.d $fa1, $fa0 +; LA64D-FRECIPE-NEXT: pcalau12i $a0, %pc_hi20(.LCPI1_0) +; LA64D-FRECIPE-NEXT: fld.d $fa2, $a0, %pc_lo12(.LCPI1_0) +; LA64D-FRECIPE-NEXT: pcalau12i $a0, %pc_hi20(.LCPI1_1) +; LA64D-FRECIPE-NEXT: fld.d $fa3, $a0, %pc_lo12(.LCPI1_1) +; LA64D-FRECIPE-NEXT: fmul.d $fa1, $fa0, $fa1 +; LA64D-FRECIPE-NEXT: fmul.d $fa4, $fa0, $fa1 +; LA64D-FRECIPE-NEXT: fmadd.d $fa4, $fa4, $fa1, $fa2 +; LA64D-FRECIPE-NEXT: fmul.d $fa1, $fa1, $fa3 +; LA64D-FRECIPE-NEXT: fmul.d $fa1, $fa1, $fa4 +; LA64D-FRECIPE-NEXT: fmul.d $fa0, $fa0, $fa1 +; LA64D-FRECIPE-NEXT: fmadd.d $fa0, $fa0, $fa1, $fa2 +; LA64D-FRECIPE-NEXT: fmul.d $fa1, $fa1, $fa3 +; LA64D-FRECIPE-NEXT: fmul.d $fa0, $fa1, $fa0 +; LA64D-FRECIPE-NEXT: ret + %1 = call fast double @llvm.sqrt.f64(double %a) + %2 = fdiv fast double 1.0, %1 + ret double %2 +} + +define double @sqrt_simplify_before_recip_3_uses_f64(double %x, ptr %p1, ptr %p2) nounwind { +; LA32F-LABEL: sqrt_simplify_before_recip_3_uses_f64: +; LA32F: # %bb.0: +; LA32F-NEXT: addi.w $sp, $sp, -32 +; LA32F-NEXT: st.w $ra, $sp, 28 # 4-byte Folded Spill +; LA32F-NEXT: st.w $fp, $sp, 24 # 4-byte Folded Spill +; LA32F-NEXT: st.w $s0, $sp, 20 # 4-byte Folded Spill +; LA32F-NEXT: st.w $s1, $sp, 16 # 4-byte Folded Spill +; LA32F-NEXT: st.w $s2, $sp, 12 # 4-byte Folded Spill +; LA32F-NEXT: st.w $s3, $sp, 8 # 4-byte Folded Spill +; LA32F-NEXT: st.w $s4, $sp, 4 # 4-byte Folded Spill +; LA32F-NEXT: move $fp, $a3 +; LA32F-NEXT: move $s0, $a2 +; LA32F-NEXT: bl %plt(sqrt) +; LA32F-NEXT: move $s1, $a0 +; LA32F-NEXT: move $s2, $a1 +; LA32F-NEXT: lu12i.w $a1, 261888 +; LA32F-NEXT: move $a0, $zero +; LA32F-NEXT: move $a2, $s1 +; LA32F-NEXT: move $a3, $s2 +; LA32F-NEXT: bl %plt(__divdf3) +; LA32F-NEXT: move $s3, $a0 +; LA32F-NEXT: move $s4, $a1 +; LA32F-NEXT: lu12i.w $a1, 263248 +; LA32F-NEXT: move $a0, $zero +; LA32F-NEXT: move $a2, $s1 +; LA32F-NEXT: move $a3, $s2 +; LA32F-NEXT: bl %plt(__divdf3) +; LA32F-NEXT: st.w $s3, $s0, 0 +; LA32F-NEXT: st.w $s4, $s0, 4 +; LA32F-NEXT: st.w $a0, $fp, 0 +; LA32F-NEXT: st.w $a1, $fp, 4 +; LA32F-NEXT: move $a0, $s1 +; LA32F-NEXT: move $a1, $s2 +; LA32F-NEXT: ld.w $s4, $sp, 4 # 4-byte Folded Reload +; LA32F-NEXT: ld.w $s3, $sp, 8 # 4-byte Folded Reload +; LA32F-NEXT: ld.w $s2, $sp, 12 # 4-byte Folded Reload +; LA32F-NEXT: ld.w $s1, $sp, 16 # 4-byte Folded Reload +; LA32F-NEXT: ld.w $s0, $sp, 20 # 4-byte Folded Reload +; LA32F-NEXT: ld.w $fp, $sp, 24 # 4-byte Folded Reload +; LA32F-NEXT: ld.w $ra, $sp, 28 # 4-byte Folded Reload +; LA32F-NEXT: addi.w $sp, $sp, 32 +; LA32F-NEXT: ret +; +; LA32F-FRECIPE-LABEL: sqrt_simplify_before_recip_3_uses_f64: +; LA32F-FRECIPE: 
# %bb.0: +; LA32F-FRECIPE-NEXT: addi.w $sp, $sp, -32 +; LA32F-FRECIPE-NEXT: st.w $ra, $sp, 28 # 4-byte Folded Spill +; LA32F-FRECIPE-NEXT: st.w $fp, $sp, 24 # 4-byte Folded Spill +; LA32F-FRECIPE-NEXT: st.w $s0, $sp, 20 # 4-byte Folded Spill +; LA32F-FRECIPE-NEXT: st.w $s1, $sp, 16 # 4-byte Folded Spill +; LA32F-FRECIPE-NEXT: st.w $s2, $sp, 12 # 4-byte Folded Spill +; LA32F-FRECIPE-NEXT: st.w $s3, $sp, 8 # 4-byte Folded Spill +; LA32F-FRECIPE-NEXT: st.w $s4, $sp, 4 # 4-byte Folded Spill +; LA32F-FRECIPE-NEXT: move $fp, $a3 +; LA32F-FRECIPE-NEXT: move $s0, $a2 +; LA32F-FRECIPE-NEXT: bl %plt(sqrt) +; LA32F-FRECIPE-NEXT: move $s1, $a0 +; LA32F-FRECIPE-NEXT: move $s2, $a1 +; LA32F-FRECIPE-NEXT: lu12i.w $a1, 261888 +; LA32F-FRECIPE-NEXT: move $a0, $zero +; LA32F-FRECIPE-NEXT: move $a2, $s1 +; LA32F-FRECIPE-NEXT: move $a3, $s2 +; LA32F-FRECIPE-NEXT: bl %plt(__divdf3) +; LA32F-FRECIPE-NEXT: move $s3, $a0 +; LA32F-FRECIPE-NEXT: move $s4, $a1 +; LA32F-FRECIPE-NEXT: lu12i.w $a1, 263248 +; LA32F-FRECIPE-NEXT: move $a0, $zero +; LA32F-FRECIPE-NEXT: move $a2, $s1 +; LA32F-FRECIPE-NEXT: move $a3, $s2 +; LA32F-FRECIPE-NEXT: bl %plt(__divdf3) +; LA32F-FRECIPE-NEXT: st.w $s3, $s0, 0 +; LA32F-FRECIPE-NEXT: st.w $s4, $s0, 4 +; LA32F-FRECIPE-NEXT: st.w $a0, $fp, 0 +; LA32F-FRECIPE-NEXT: st.w $a1, $fp, 4 +; LA32F-FRECIPE-NEXT: move $a0, $s1 +; LA32F-FRECIPE-NEXT: move $a1, $s2 +; LA32F-FRECIPE-NEXT: ld.w $s4, $sp, 4 # 4-byte Folded Reload +; LA32F-FRECIPE-NEXT: ld.w $s3, $sp, 8 # 4-byte Folded Reload +; LA32F-FRECIPE-NEXT: ld.w $s2, $sp, 12 # 4-byte Folded Reload +; LA32F-FRECIPE-NEXT: ld.w $s1, $sp, 16 # 4-byte Folded Reload +; LA32F-FRECIPE-NEXT: ld.w $s0, $sp, 20 # 4-byte Folded Reload +; LA32F-FRECIPE-NEXT: ld.w $fp, $sp, 24 # 4-byte Folded Reload +; LA32F-FRECIPE-NEXT: ld.w $ra, $sp, 28 # 4-byte Folded Reload +; LA32F-FRECIPE-NEXT: addi.w $sp, $sp, 32 +; LA32F-FRECIPE-NEXT: ret +; +; LA64D-LABEL: sqrt_simplify_before_recip_3_uses_f64: +; LA64D: # %bb.0: +; LA64D-NEXT: pcalau12i $a2, %pc_hi20(.LCPI2_0) +; LA64D-NEXT: fld.d $fa2, $a2, %pc_lo12(.LCPI2_0) +; LA64D-NEXT: fsqrt.d $fa1, $fa0 +; LA64D-NEXT: frsqrt.d $fa0, $fa0 +; LA64D-NEXT: fdiv.d $fa2, $fa2, $fa1 +; LA64D-NEXT: fst.d $fa0, $a0, 0 +; LA64D-NEXT: fst.d $fa2, $a1, 0 +; LA64D-NEXT: fmov.d $fa0, $fa1 +; LA64D-NEXT: ret +; +; LA64D-FRECIPE-LABEL: sqrt_simplify_before_recip_3_uses_f64: +; LA64D-FRECIPE: # %bb.0: +; LA64D-FRECIPE-NEXT: frsqrte.d $fa1, $fa0 +; LA64D-FRECIPE-NEXT: pcalau12i $a2, %pc_hi20(.LCPI2_0) +; LA64D-FRECIPE-NEXT: fld.d $fa2, $a2, %pc_lo12(.LCPI2_0) +; LA64D-FRECIPE-NEXT: pcalau12i $a2, %pc_hi20(.LCPI2_1) +; LA64D-FRECIPE-NEXT: fld.d $fa3, $a2, %pc_lo12(.LCPI2_1) +; LA64D-FRECIPE-NEXT: fmul.d $fa1, $fa0, $fa1 +; LA64D-FRECIPE-NEXT: fmul.d $fa4, $fa0, $fa1 +; LA64D-FRECIPE-NEXT: fmadd.d $fa4, $fa4, $fa1, $fa2 +; LA64D-FRECIPE-NEXT: fmul.d $fa1, $fa1, $fa3 +; LA64D-FRECIPE-NEXT: fmul.d $fa1, $fa1, $fa4 +; LA64D-FRECIPE-NEXT: fmul.d $fa4, $fa0, $fa1 +; LA64D-FRECIPE-NEXT: pcalau12i $a2, %pc_hi20(.LCPI2_2) +; LA64D-FRECIPE-NEXT: fld.d $fa5, $a2, %pc_lo12(.LCPI2_2) +; LA64D-FRECIPE-NEXT: fmadd.d $fa2, $fa4, $fa1, $fa2 +; LA64D-FRECIPE-NEXT: fmul.d $fa1, $fa1, $fa3 +; LA64D-FRECIPE-NEXT: fmul.d $fa1, $fa1, $fa2 +; LA64D-FRECIPE-NEXT: fmul.d $fa2, $fa1, $fa5 +; LA64D-FRECIPE-NEXT: fmul.d $fa0, $fa0, $fa1 +; LA64D-FRECIPE-NEXT: fst.d $fa1, $a0, 0 +; LA64D-FRECIPE-NEXT: fst.d $fa2, $a1, 0 +; LA64D-FRECIPE-NEXT: ret + %sqrt = tail call fast double @llvm.sqrt.f64(double %x) + %rsqrt = fdiv fast double 1.0, %sqrt + %r = fdiv fast double 42.0, 
%sqrt + %sqrt_fast = fdiv fast double %x, %sqrt + store double %rsqrt, ptr %p1, align 8 + store double %r, ptr %p2, align 8 + ret double %sqrt_fast +} + + +define double @sqrt_simplify_before_recip_3_uses_order_f64(double %x, ptr %p1, ptr %p2) nounwind { +; LA32F-LABEL: sqrt_simplify_before_recip_3_uses_order_f64: +; LA32F: # %bb.0: +; LA32F-NEXT: addi.w $sp, $sp, -32 +; LA32F-NEXT: st.w $ra, $sp, 28 # 4-byte Folded Spill +; LA32F-NEXT: st.w $fp, $sp, 24 # 4-byte Folded Spill +; LA32F-NEXT: st.w $s0, $sp, 20 # 4-byte Folded Spill +; LA32F-NEXT: st.w $s1, $sp, 16 # 4-byte Folded Spill +; LA32F-NEXT: st.w $s2, $sp, 12 # 4-byte Folded Spill +; LA32F-NEXT: st.w $s3, $sp, 8 # 4-byte Folded Spill +; LA32F-NEXT: st.w $s4, $sp, 4 # 4-byte Folded Spill +; LA32F-NEXT: move $fp, $a3 +; LA32F-NEXT: move $s0, $a2 +; LA32F-NEXT: bl %plt(sqrt) +; LA32F-NEXT: move $s1, $a0 +; LA32F-NEXT: move $s2, $a1 +; LA32F-NEXT: lu12i.w $a1, 263248 +; LA32F-NEXT: move $a0, $zero +; LA32F-NEXT: move $a2, $s1 +; LA32F-NEXT: move $a3, $s2 +; LA32F-NEXT: bl %plt(__divdf3) +; LA32F-NEXT: move $s3, $a0 +; LA32F-NEXT: move $s4, $a1 +; LA32F-NEXT: lu12i.w $a1, 263256 +; LA32F-NEXT: move $a0, $zero +; LA32F-NEXT: move $a2, $s1 +; LA32F-NEXT: move $a3, $s2 +; LA32F-NEXT: bl %plt(__divdf3) +; LA32F-NEXT: st.w $s3, $s0, 0 +; LA32F-NEXT: st.w $s4, $s0, 4 +; LA32F-NEXT: st.w $a0, $fp, 0 +; LA32F-NEXT: st.w $a1, $fp, 4 +; LA32F-NEXT: move $a0, $s1 +; LA32F-NEXT: move $a1, $s2 +; LA32F-NEXT: ld.w $s4, $sp, 4 # 4-byte Folded Reload +; LA32F-NEXT: ld.w $s3, $sp, 8 # 4-byte Folded Reload +; LA32F-NEXT: ld.w $s2, $sp, 12 # 4-byte Folded Reload +; LA32F-NEXT: ld.w $s1, $sp, 16 # 4-byte Folded Reload +; LA32F-NEXT: ld.w $s0, $sp, 20 # 4-byte Folded Reload +; LA32F-NEXT: ld.w $fp, $sp, 24 # 4-byte Folded Reload +; LA32F-NEXT: ld.w $ra, $sp, 28 # 4-byte Folded Reload +; LA32F-NEXT: addi.w $sp, $sp, 32 +; LA32F-NEXT: ret +; +; LA32F-FRECIPE-LABEL: sqrt_simplify_before_recip_3_uses_order_f64: +; LA32F-FRECIPE: # %bb.0: +; LA32F-FRECIPE-NEXT: addi.w $sp, $sp, -32 +; LA32F-FRECIPE-NEXT: st.w $ra, $sp, 28 # 4-byte Folded Spill +; LA32F-FRECIPE-NEXT: st.w $fp, $sp, 24 # 4-byte Folded Spill +; LA32F-FRECIPE-NEXT: st.w $s0, $sp, 20 # 4-byte Folded Spill +; LA32F-FRECIPE-NEXT: st.w $s1, $sp, 16 # 4-byte Folded Spill +; LA32F-FRECIPE-NEXT: st.w $s2, $sp, 12 # 4-byte Folded Spill +; LA32F-FRECIPE-NEXT: st.w $s3, $sp, 8 # 4-byte Folded Spill +; LA32F-FRECIPE-NEXT: st.w $s4, $sp, 4 # 4-byte Folded Spill +; LA32F-FRECIPE-NEXT: move $fp, $a3 +; LA32F-FRECIPE-NEXT: move $s0, $a2 +; LA32F-FRECIPE-NEXT: bl %plt(sqrt) +; LA32F-FRECIPE-NEXT: move $s1, $a0 +; LA32F-FRECIPE-NEXT: move $s2, $a1 +; LA32F-FRECIPE-NEXT: lu12i.w $a1, 263248 +; LA32F-FRECIPE-NEXT: move $a0, $zero +; LA32F-FRECIPE-NEXT: move $a2, $s1 +; LA32F-FRECIPE-NEXT: move $a3, $s2 +; LA32F-FRECIPE-NEXT: bl %plt(__divdf3) +; LA32F-FRECIPE-NEXT: move $s3, $a0 +; LA32F-FRECIPE-NEXT: move $s4, $a1 +; LA32F-FRECIPE-NEXT: lu12i.w $a1, 263256 +; LA32F-FRECIPE-NEXT: move $a0, $zero +; LA32F-FRECIPE-NEXT: move $a2, $s1 +; LA32F-FRECIPE-NEXT: move $a3, $s2 +; LA32F-FRECIPE-NEXT: bl %plt(__divdf3) +; LA32F-FRECIPE-NEXT: st.w $s3, $s0, 0 +; LA32F-FRECIPE-NEXT: st.w $s4, $s0, 4 +; LA32F-FRECIPE-NEXT: st.w $a0, $fp, 0 +; LA32F-FRECIPE-NEXT: st.w $a1, $fp, 4 +; LA32F-FRECIPE-NEXT: move $a0, $s1 +; LA32F-FRECIPE-NEXT: move $a1, $s2 +; LA32F-FRECIPE-NEXT: ld.w $s4, $sp, 4 # 4-byte Folded Reload +; LA32F-FRECIPE-NEXT: ld.w $s3, $sp, 8 # 4-byte Folded Reload +; LA32F-FRECIPE-NEXT: ld.w $s2, $sp, 12 # 4-byte Folded 
Reload +; LA32F-FRECIPE-NEXT: ld.w $s1, $sp, 16 # 4-byte Folded Reload +; LA32F-FRECIPE-NEXT: ld.w $s0, $sp, 20 # 4-byte Folded Reload +; LA32F-FRECIPE-NEXT: ld.w $fp, $sp, 24 # 4-byte Folded Reload +; LA32F-FRECIPE-NEXT: ld.w $ra, $sp, 28 # 4-byte Folded Reload +; LA32F-FRECIPE-NEXT: addi.w $sp, $sp, 32 +; LA32F-FRECIPE-NEXT: ret +; +; LA64D-LABEL: sqrt_simplify_before_recip_3_uses_order_f64: +; LA64D: # %bb.0: +; LA64D-NEXT: pcalau12i $a2, %pc_hi20(.LCPI3_0) +; LA64D-NEXT: fld.d $fa1, $a2, %pc_lo12(.LCPI3_0) +; LA64D-NEXT: pcalau12i $a2, %pc_hi20(.LCPI3_1) +; LA64D-NEXT: fld.d $fa2, $a2, %pc_lo12(.LCPI3_1) +; LA64D-NEXT: fsqrt.d $fa0, $fa0 +; LA64D-NEXT: fdiv.d $fa1, $fa1, $fa0 +; LA64D-NEXT: fdiv.d $fa2, $fa2, $fa0 +; LA64D-NEXT: fst.d $fa1, $a0, 0 +; LA64D-NEXT: fst.d $fa2, $a1, 0 +; LA64D-NEXT: ret +; +; LA64D-FRECIPE-LABEL: sqrt_simplify_before_recip_3_uses_order_f64: +; LA64D-FRECIPE: # %bb.0: +; LA64D-FRECIPE-NEXT: frsqrte.d $fa1, $fa0 +; LA64D-FRECIPE-NEXT: pcalau12i $a2, %pc_hi20(.LCPI3_0) +; LA64D-FRECIPE-NEXT: fld.d $fa2, $a2, %pc_lo12(.LCPI3_0) +; LA64D-FRECIPE-NEXT: pcalau12i $a2, %pc_hi20(.LCPI3_1) +; LA64D-FRECIPE-NEXT: fld.d $fa3, $a2, %pc_lo12(.LCPI3_1) +; LA64D-FRECIPE-NEXT: fmul.d $fa1, $fa0, $fa1 +; LA64D-FRECIPE-NEXT: fmul.d $fa4, $fa0, $fa1 +; LA64D-FRECIPE-NEXT: fmadd.d $fa4, $fa4, $fa1, $fa2 +; LA64D-FRECIPE-NEXT: fmul.d $fa1, $fa1, $fa3 +; LA64D-FRECIPE-NEXT: fmul.d $fa1, $fa1, $fa4 +; LA64D-FRECIPE-NEXT: fmul.d $fa4, $fa0, $fa1 +; LA64D-FRECIPE-NEXT: fmadd.d $fa2, $fa4, $fa1, $fa2 +; LA64D-FRECIPE-NEXT: fmul.d $fa1, $fa1, $fa3 +; LA64D-FRECIPE-NEXT: pcalau12i $a2, %pc_hi20(.LCPI3_2) +; LA64D-FRECIPE-NEXT: fld.d $fa3, $a2, %pc_lo12(.LCPI3_2) +; LA64D-FRECIPE-NEXT: pcalau12i $a2, %pc_hi20(.LCPI3_3) +; LA64D-FRECIPE-NEXT: fld.d $fa4, $a2, %pc_lo12(.LCPI3_3) +; LA64D-FRECIPE-NEXT: fmul.d $fa1, $fa1, $fa2 +; LA64D-FRECIPE-NEXT: fmul.d $fa0, $fa0, $fa1 +; LA64D-FRECIPE-NEXT: fmul.d $fa2, $fa1, $fa3 +; LA64D-FRECIPE-NEXT: fmul.d $fa1, $fa1, $fa4 +; LA64D-FRECIPE-NEXT: fst.d $fa2, $a0, 0 +; LA64D-FRECIPE-NEXT: fst.d $fa1, $a1, 0 +; LA64D-FRECIPE-NEXT: ret + %sqrt = tail call fast double @llvm.sqrt.f64(double %x) + %sqrt_fast = fdiv fast double %x, %sqrt + %r1 = fdiv fast double 42.0, %sqrt + %r2 = fdiv fast double 43.0, %sqrt + store double %r1, ptr %p1, align 8 + store double %r2, ptr %p2, align 8 + ret double %sqrt_fast +} + +define double @sqrt_simplify_before_recip_4_uses_f64(double %x, ptr %p1, ptr %p2, ptr %p3) nounwind { +; LA32F-LABEL: sqrt_simplify_before_recip_4_uses_f64: +; LA32F: # %bb.0: +; LA32F-NEXT: addi.w $sp, $sp, -48 +; LA32F-NEXT: st.w $ra, $sp, 44 # 4-byte Folded Spill +; LA32F-NEXT: st.w $fp, $sp, 40 # 4-byte Folded Spill +; LA32F-NEXT: st.w $s0, $sp, 36 # 4-byte Folded Spill +; LA32F-NEXT: st.w $s1, $sp, 32 # 4-byte Folded Spill +; LA32F-NEXT: st.w $s2, $sp, 28 # 4-byte Folded Spill +; LA32F-NEXT: st.w $s3, $sp, 24 # 4-byte Folded Spill +; LA32F-NEXT: st.w $s4, $sp, 20 # 4-byte Folded Spill +; LA32F-NEXT: st.w $s5, $sp, 16 # 4-byte Folded Spill +; LA32F-NEXT: st.w $s6, $sp, 12 # 4-byte Folded Spill +; LA32F-NEXT: st.w $s7, $sp, 8 # 4-byte Folded Spill +; LA32F-NEXT: move $fp, $a4 +; LA32F-NEXT: move $s0, $a3 +; LA32F-NEXT: move $s1, $a2 +; LA32F-NEXT: bl %plt(sqrt) +; LA32F-NEXT: move $s2, $a0 +; LA32F-NEXT: move $s3, $a1 +; LA32F-NEXT: lu12i.w $a1, 261888 +; LA32F-NEXT: move $a0, $zero +; LA32F-NEXT: move $a2, $s2 +; LA32F-NEXT: move $a3, $s3 +; LA32F-NEXT: bl %plt(__divdf3) +; LA32F-NEXT: move $s4, $a0 +; LA32F-NEXT: move $s5, $a1 +; LA32F-NEXT: 
lu12i.w $a1, 263248 +; LA32F-NEXT: move $a0, $zero +; LA32F-NEXT: move $a2, $s2 +; LA32F-NEXT: move $a3, $s3 +; LA32F-NEXT: bl %plt(__divdf3) +; LA32F-NEXT: move $s6, $a0 +; LA32F-NEXT: move $s7, $a1 +; LA32F-NEXT: lu12i.w $a1, 263256 +; LA32F-NEXT: move $a0, $zero +; LA32F-NEXT: move $a2, $s2 +; LA32F-NEXT: move $a3, $s3 +; LA32F-NEXT: bl %plt(__divdf3) +; LA32F-NEXT: st.w $s4, $s1, 0 +; LA32F-NEXT: st.w $s5, $s1, 4 +; LA32F-NEXT: st.w $s6, $s0, 0 +; LA32F-NEXT: st.w $s7, $s0, 4 +; LA32F-NEXT: st.w $a0, $fp, 0 +; LA32F-NEXT: st.w $a1, $fp, 4 +; LA32F-NEXT: move $a0, $s2 +; LA32F-NEXT: move $a1, $s3 +; LA32F-NEXT: ld.w $s7, $sp, 8 # 4-byte Folded Reload +; LA32F-NEXT: ld.w $s6, $sp, 12 # 4-byte Folded Reload +; LA32F-NEXT: ld.w $s5, $sp, 16 # 4-byte Folded Reload +; LA32F-NEXT: ld.w $s4, $sp, 20 # 4-byte Folded Reload +; LA32F-NEXT: ld.w $s3, $sp, 24 # 4-byte Folded Reload +; LA32F-NEXT: ld.w $s2, $sp, 28 # 4-byte Folded Reload +; LA32F-NEXT: ld.w $s1, $sp, 32 # 4-byte Folded Reload +; LA32F-NEXT: ld.w $s0, $sp, 36 # 4-byte Folded Reload +; LA32F-NEXT: ld.w $fp, $sp, 40 # 4-byte Folded Reload +; LA32F-NEXT: ld.w $ra, $sp, 44 # 4-byte Folded Reload +; LA32F-NEXT: addi.w $sp, $sp, 48 +; LA32F-NEXT: ret +; +; LA32F-FRECIPE-LABEL: sqrt_simplify_before_recip_4_uses_f64: +; LA32F-FRECIPE: # %bb.0: +; LA32F-FRECIPE-NEXT: addi.w $sp, $sp, -48 +; LA32F-FRECIPE-NEXT: st.w $ra, $sp, 44 # 4-byte Folded Spill +; LA32F-FRECIPE-NEXT: st.w $fp, $sp, 40 # 4-byte Folded Spill +; LA32F-FRECIPE-NEXT: st.w $s0, $sp, 36 # 4-byte Folded Spill +; LA32F-FRECIPE-NEXT: st.w $s1, $sp, 32 # 4-byte Folded Spill +; LA32F-FRECIPE-NEXT: st.w $s2, $sp, 28 # 4-byte Folded Spill +; LA32F-FRECIPE-NEXT: st.w $s3, $sp, 24 # 4-byte Folded Spill +; LA32F-FRECIPE-NEXT: st.w $s4, $sp, 20 # 4-byte Folded Spill +; LA32F-FRECIPE-NEXT: st.w $s5, $sp, 16 # 4-byte Folded Spill +; LA32F-FRECIPE-NEXT: st.w $s6, $sp, 12 # 4-byte Folded Spill +; LA32F-FRECIPE-NEXT: st.w $s7, $sp, 8 # 4-byte Folded Spill +; LA32F-FRECIPE-NEXT: move $fp, $a4 +; LA32F-FRECIPE-NEXT: move $s0, $a3 +; LA32F-FRECIPE-NEXT: move $s1, $a2 +; LA32F-FRECIPE-NEXT: bl %plt(sqrt) +; LA32F-FRECIPE-NEXT: move $s2, $a0 +; LA32F-FRECIPE-NEXT: move $s3, $a1 +; LA32F-FRECIPE-NEXT: lu12i.w $a1, 261888 +; LA32F-FRECIPE-NEXT: move $a0, $zero +; LA32F-FRECIPE-NEXT: move $a2, $s2 +; LA32F-FRECIPE-NEXT: move $a3, $s3 +; LA32F-FRECIPE-NEXT: bl %plt(__divdf3) +; LA32F-FRECIPE-NEXT: move $s4, $a0 +; LA32F-FRECIPE-NEXT: move $s5, $a1 +; LA32F-FRECIPE-NEXT: lu12i.w $a1, 263248 +; LA32F-FRECIPE-NEXT: move $a0, $zero +; LA32F-FRECIPE-NEXT: move $a2, $s2 +; LA32F-FRECIPE-NEXT: move $a3, $s3 +; LA32F-FRECIPE-NEXT: bl %plt(__divdf3) +; LA32F-FRECIPE-NEXT: move $s6, $a0 +; LA32F-FRECIPE-NEXT: move $s7, $a1 +; LA32F-FRECIPE-NEXT: lu12i.w $a1, 263256 +; LA32F-FRECIPE-NEXT: move $a0, $zero +; LA32F-FRECIPE-NEXT: move $a2, $s2 +; LA32F-FRECIPE-NEXT: move $a3, $s3 +; LA32F-FRECIPE-NEXT: bl %plt(__divdf3) +; LA32F-FRECIPE-NEXT: st.w $s4, $s1, 0 +; LA32F-FRECIPE-NEXT: st.w $s5, $s1, 4 +; LA32F-FRECIPE-NEXT: st.w $s6, $s0, 0 +; LA32F-FRECIPE-NEXT: st.w $s7, $s0, 4 +; LA32F-FRECIPE-NEXT: st.w $a0, $fp, 0 +; LA32F-FRECIPE-NEXT: st.w $a1, $fp, 4 +; LA32F-FRECIPE-NEXT: move $a0, $s2 +; LA32F-FRECIPE-NEXT: move $a1, $s3 +; LA32F-FRECIPE-NEXT: ld.w $s7, $sp, 8 # 4-byte Folded Reload +; LA32F-FRECIPE-NEXT: ld.w $s6, $sp, 12 # 4-byte Folded Reload +; LA32F-FRECIPE-NEXT: ld.w $s5, $sp, 16 # 4-byte Folded Reload +; LA32F-FRECIPE-NEXT: ld.w $s4, $sp, 20 # 4-byte Folded Reload +; LA32F-FRECIPE-NEXT: ld.w $s3, $sp, 
24 # 4-byte Folded Reload +; LA32F-FRECIPE-NEXT: ld.w $s2, $sp, 28 # 4-byte Folded Reload +; LA32F-FRECIPE-NEXT: ld.w $s1, $sp, 32 # 4-byte Folded Reload +; LA32F-FRECIPE-NEXT: ld.w $s0, $sp, 36 # 4-byte Folded Reload +; LA32F-FRECIPE-NEXT: ld.w $fp, $sp, 40 # 4-byte Folded Reload +; LA32F-FRECIPE-NEXT: ld.w $ra, $sp, 44 # 4-byte Folded Reload +; LA32F-FRECIPE-NEXT: addi.w $sp, $sp, 48 +; LA32F-FRECIPE-NEXT: ret +; +; LA64D-LABEL: sqrt_simplify_before_recip_4_uses_f64: +; LA64D: # %bb.0: +; LA64D-NEXT: pcalau12i $a3, %pc_hi20(.LCPI4_0) +; LA64D-NEXT: fld.d $fa2, $a3, %pc_lo12(.LCPI4_0) +; LA64D-NEXT: pcalau12i $a3, %pc_hi20(.LCPI4_1) +; LA64D-NEXT: fld.d $fa3, $a3, %pc_lo12(.LCPI4_1) +; LA64D-NEXT: fsqrt.d $fa1, $fa0 +; LA64D-NEXT: frsqrt.d $fa0, $fa0 +; LA64D-NEXT: fdiv.d $fa2, $fa2, $fa1 +; LA64D-NEXT: fdiv.d $fa3, $fa3, $fa1 +; LA64D-NEXT: fst.d $fa0, $a0, 0 +; LA64D-NEXT: fst.d $fa2, $a1, 0 +; LA64D-NEXT: fst.d $fa3, $a2, 0 +; LA64D-NEXT: fmov.d $fa0, $fa1 +; LA64D-NEXT: ret +; +; LA64D-FRECIPE-LABEL: sqrt_simplify_before_recip_4_uses_f64: +; LA64D-FRECIPE: # %bb.0: +; LA64D-FRECIPE-NEXT: frsqrte.d $fa1, $fa0 +; LA64D-FRECIPE-NEXT: pcalau12i $a3, %pc_hi20(.LCPI4_0) +; LA64D-FRECIPE-NEXT: fld.d $fa2, $a3, %pc_lo12(.LCPI4_0) +; LA64D-FRECIPE-NEXT: pcalau12i $a3, %pc_hi20(.LCPI4_1) +; LA64D-FRECIPE-NEXT: fld.d $fa3, $a3, %pc_lo12(.LCPI4_1) +; LA64D-FRECIPE-NEXT: fmul.d $fa1, $fa0, $fa1 +; LA64D-FRECIPE-NEXT: fmul.d $fa4, $fa0, $fa1 +; LA64D-FRECIPE-NEXT: fmadd.d $fa4, $fa4, $fa1, $fa2 +; LA64D-FRECIPE-NEXT: fmul.d $fa1, $fa1, $fa3 +; LA64D-FRECIPE-NEXT: fmul.d $fa1, $fa1, $fa4 +; LA64D-FRECIPE-NEXT: fmul.d $fa4, $fa0, $fa1 +; LA64D-FRECIPE-NEXT: fmadd.d $fa2, $fa4, $fa1, $fa2 +; LA64D-FRECIPE-NEXT: pcalau12i $a3, %pc_hi20(.LCPI4_2) +; LA64D-FRECIPE-NEXT: fld.d $fa4, $a3, %pc_lo12(.LCPI4_2) +; LA64D-FRECIPE-NEXT: pcalau12i $a3, %pc_hi20(.LCPI4_3) +; LA64D-FRECIPE-NEXT: fld.d $fa5, $a3, %pc_lo12(.LCPI4_3) +; LA64D-FRECIPE-NEXT: fmul.d $fa1, $fa1, $fa3 +; LA64D-FRECIPE-NEXT: fmul.d $fa1, $fa1, $fa2 +; LA64D-FRECIPE-NEXT: fmul.d $fa2, $fa1, $fa4 +; LA64D-FRECIPE-NEXT: fmul.d $fa3, $fa1, $fa5 +; LA64D-FRECIPE-NEXT: fmul.d $fa0, $fa0, $fa1 +; LA64D-FRECIPE-NEXT: fst.d $fa1, $a0, 0 +; LA64D-FRECIPE-NEXT: fst.d $fa2, $a1, 0 +; LA64D-FRECIPE-NEXT: fst.d $fa3, $a2, 0 +; LA64D-FRECIPE-NEXT: ret + %sqrt = tail call fast double @llvm.sqrt.f64(double %x) + %rsqrt = fdiv fast double 1.0, %sqrt + %r1 = fdiv fast double 42.0, %sqrt + %r2 = fdiv fast double 43.0, %sqrt + %sqrt_fast = fdiv fast double %x, %sqrt + store double %rsqrt, ptr %p1, align 8 + store double %r1, ptr %p2, align 8 + store double %r2, ptr %p3, align 8 + ret double %sqrt_fast +} + +define float @sqrt_simplify_before_recip_3_uses_f32(float %x, ptr %p1, ptr %p2) nounwind { +; LA32F-LABEL: sqrt_simplify_before_recip_3_uses_f32: +; LA32F: # %bb.0: +; LA32F-NEXT: pcalau12i $a2, %pc_hi20(.LCPI5_0) +; LA32F-NEXT: fld.s $fa2, $a2, %pc_lo12(.LCPI5_0) +; LA32F-NEXT: fsqrt.s $fa1, $fa0 +; LA32F-NEXT: frsqrt.s $fa0, $fa0 +; LA32F-NEXT: fdiv.s $fa2, $fa2, $fa1 +; LA32F-NEXT: fst.s $fa0, $a0, 0 +; LA32F-NEXT: fst.s $fa2, $a1, 0 +; LA32F-NEXT: fmov.s $fa0, $fa1 +; LA32F-NEXT: ret +; +; LA32F-FRECIPE-LABEL: sqrt_simplify_before_recip_3_uses_f32: +; LA32F-FRECIPE: # %bb.0: +; LA32F-FRECIPE-NEXT: frsqrte.s $fa1, $fa0 +; LA32F-FRECIPE-NEXT: fmul.s $fa1, $fa0, $fa1 +; LA32F-FRECIPE-NEXT: fmul.s $fa2, $fa0, $fa1 +; LA32F-FRECIPE-NEXT: pcalau12i $a2, %pc_hi20(.LCPI5_0) +; LA32F-FRECIPE-NEXT: fld.s $fa3, $a2, %pc_lo12(.LCPI5_0) +; LA32F-FRECIPE-NEXT: 
pcalau12i $a2, %pc_hi20(.LCPI5_1) +; LA32F-FRECIPE-NEXT: fld.s $fa4, $a2, %pc_lo12(.LCPI5_1) +; LA32F-FRECIPE-NEXT: pcalau12i $a2, %pc_hi20(.LCPI5_2) +; LA32F-FRECIPE-NEXT: fld.s $fa5, $a2, %pc_lo12(.LCPI5_2) +; LA32F-FRECIPE-NEXT: fmadd.s $fa2, $fa2, $fa1, $fa3 +; LA32F-FRECIPE-NEXT: fmul.s $fa1, $fa1, $fa4 +; LA32F-FRECIPE-NEXT: fmul.s $fa1, $fa1, $fa2 +; LA32F-FRECIPE-NEXT: fmul.s $fa2, $fa1, $fa5 +; LA32F-FRECIPE-NEXT: fmul.s $fa0, $fa0, $fa1 +; LA32F-FRECIPE-NEXT: fst.s $fa1, $a0, 0 +; LA32F-FRECIPE-NEXT: fst.s $fa2, $a1, 0 +; LA32F-FRECIPE-NEXT: ret +; +; LA64D-LABEL: sqrt_simplify_before_recip_3_uses_f32: +; LA64D: # %bb.0: +; LA64D-NEXT: pcalau12i $a2, %pc_hi20(.LCPI5_0) +; LA64D-NEXT: fld.s $fa2, $a2, %pc_lo12(.LCPI5_0) +; LA64D-NEXT: fsqrt.s $fa1, $fa0 +; LA64D-NEXT: frsqrt.s $fa0, $fa0 +; LA64D-NEXT: fdiv.s $fa2, $fa2, $fa1 +; LA64D-NEXT: fst.s $fa0, $a0, 0 +; LA64D-NEXT: fst.s $fa2, $a1, 0 +; LA64D-NEXT: fmov.s $fa0, $fa1 +; LA64D-NEXT: ret +; +; LA64D-FRECIPE-LABEL: sqrt_simplify_before_recip_3_uses_f32: +; LA64D-FRECIPE: # %bb.0: +; LA64D-FRECIPE-NEXT: frsqrte.s $fa1, $fa0 +; LA64D-FRECIPE-NEXT: fmul.s $fa1, $fa0, $fa1 +; LA64D-FRECIPE-NEXT: fmul.s $fa2, $fa0, $fa1 +; LA64D-FRECIPE-NEXT: pcalau12i $a2, %pc_hi20(.LCPI5_0) +; LA64D-FRECIPE-NEXT: fld.s $fa3, $a2, %pc_lo12(.LCPI5_0) +; LA64D-FRECIPE-NEXT: pcalau12i $a2, %pc_hi20(.LCPI5_1) +; LA64D-FRECIPE-NEXT: fld.s $fa4, $a2, %pc_lo12(.LCPI5_1) +; LA64D-FRECIPE-NEXT: pcalau12i $a2, %pc_hi20(.LCPI5_2) +; LA64D-FRECIPE-NEXT: fld.s $fa5, $a2, %pc_lo12(.LCPI5_2) +; LA64D-FRECIPE-NEXT: fmadd.s $fa2, $fa2, $fa1, $fa3 +; LA64D-FRECIPE-NEXT: fmul.s $fa1, $fa1, $fa4 +; LA64D-FRECIPE-NEXT: fmul.s $fa1, $fa1, $fa2 +; LA64D-FRECIPE-NEXT: fmul.s $fa2, $fa1, $fa5 +; LA64D-FRECIPE-NEXT: fmul.s $fa0, $fa0, $fa1 +; LA64D-FRECIPE-NEXT: fst.s $fa1, $a0, 0 +; LA64D-FRECIPE-NEXT: fst.s $fa2, $a1, 0 +; LA64D-FRECIPE-NEXT: ret +; + %sqrt = tail call fast float @llvm.sqrt.f32(float %x) + %rsqrt = fdiv fast float 1.0, %sqrt + %r = fdiv fast float 42.0, %sqrt + %sqrt_fast = fdiv fast float %x, %sqrt + store float %rsqrt, ptr %p1, align 8 + store float %r, ptr %p2, align 8 + ret float %sqrt_fast +} + +define float @sqrt_simplify_before_recip_4_uses_f32(float %x, ptr %p1, ptr %p2, ptr %p3) nounwind { +; LA32F-LABEL: sqrt_simplify_before_recip_4_uses_f32: +; LA32F: # %bb.0: +; LA32F-NEXT: pcalau12i $a3, %pc_hi20(.LCPI6_0) +; LA32F-NEXT: fld.s $fa2, $a3, %pc_lo12(.LCPI6_0) +; LA32F-NEXT: pcalau12i $a3, %pc_hi20(.LCPI6_1) +; LA32F-NEXT: fld.s $fa3, $a3, %pc_lo12(.LCPI6_1) +; LA32F-NEXT: fsqrt.s $fa1, $fa0 +; LA32F-NEXT: frsqrt.s $fa0, $fa0 +; LA32F-NEXT: fdiv.s $fa2, $fa2, $fa1 +; LA32F-NEXT: fdiv.s $fa3, $fa3, $fa1 +; LA32F-NEXT: fst.s $fa0, $a0, 0 +; LA32F-NEXT: fst.s $fa2, $a1, 0 +; LA32F-NEXT: fst.s $fa3, $a2, 0 +; LA32F-NEXT: fmov.s $fa0, $fa1 +; LA32F-NEXT: ret +; +; LA32F-FRECIPE-LABEL: sqrt_simplify_before_recip_4_uses_f32: +; LA32F-FRECIPE: # %bb.0: +; LA32F-FRECIPE-NEXT: pcalau12i $a3, %pc_hi20(.LCPI6_0) +; LA32F-FRECIPE-NEXT: fld.s $fa1, $a3, %pc_lo12(.LCPI6_0) +; LA32F-FRECIPE-NEXT: frsqrte.s $fa2, $fa0 +; LA32F-FRECIPE-NEXT: fmul.s $fa2, $fa0, $fa2 +; LA32F-FRECIPE-NEXT: fmul.s $fa3, $fa0, $fa2 +; LA32F-FRECIPE-NEXT: fmadd.s $fa1, $fa3, $fa2, $fa1 +; LA32F-FRECIPE-NEXT: pcalau12i $a3, %pc_hi20(.LCPI6_1) +; LA32F-FRECIPE-NEXT: fld.s $fa3, $a3, %pc_lo12(.LCPI6_1) +; LA32F-FRECIPE-NEXT: pcalau12i $a3, %pc_hi20(.LCPI6_2) +; LA32F-FRECIPE-NEXT: fld.s $fa4, $a3, %pc_lo12(.LCPI6_2) +; LA32F-FRECIPE-NEXT: pcalau12i $a3, %pc_hi20(.LCPI6_3) +; 
LA32F-FRECIPE-NEXT: fld.s $fa5, $a3, %pc_lo12(.LCPI6_3) +; LA32F-FRECIPE-NEXT: fmul.s $fa2, $fa2, $fa3 +; LA32F-FRECIPE-NEXT: fmul.s $fa1, $fa2, $fa1 +; LA32F-FRECIPE-NEXT: fmul.s $fa2, $fa1, $fa4 +; LA32F-FRECIPE-NEXT: fmul.s $fa3, $fa1, $fa5 +; LA32F-FRECIPE-NEXT: fmul.s $fa0, $fa0, $fa1 +; LA32F-FRECIPE-NEXT: fst.s $fa1, $a0, 0 +; LA32F-FRECIPE-NEXT: fst.s $fa2, $a1, 0 +; LA32F-FRECIPE-NEXT: fst.s $fa3, $a2, 0 +; LA32F-FRECIPE-NEXT: ret +; +; LA64D-LABEL: sqrt_simplify_before_recip_4_uses_f32: +; LA64D: # %bb.0: +; LA64D-NEXT: pcalau12i $a3, %pc_hi20(.LCPI6_0) +; LA64D-NEXT: fld.s $fa2, $a3, %pc_lo12(.LCPI6_0) +; LA64D-NEXT: pcalau12i $a3, %pc_hi20(.LCPI6_1) +; LA64D-NEXT: fld.s $fa3, $a3, %pc_lo12(.LCPI6_1) +; LA64D-NEXT: fsqrt.s $fa1, $fa0 +; LA64D-NEXT: frsqrt.s $fa0, $fa0 +; LA64D-NEXT: fdiv.s $fa2, $fa2, $fa1 +; LA64D-NEXT: fdiv.s $fa3, $fa3, $fa1 +; LA64D-NEXT: fst.s $fa0, $a0, 0 +; LA64D-NEXT: fst.s $fa2, $a1, 0 +; LA64D-NEXT: fst.s $fa3, $a2, 0 +; LA64D-NEXT: fmov.s $fa0, $fa1 +; LA64D-NEXT: ret +; +; LA64D-FRECIPE-LABEL: sqrt_simplify_before_recip_4_uses_f32: +; LA64D-FRECIPE: # %bb.0: +; LA64D-FRECIPE-NEXT: pcalau12i $a3, %pc_hi20(.LCPI6_0) +; LA64D-FRECIPE-NEXT: fld.s $fa1, $a3, %pc_lo12(.LCPI6_0) +; LA64D-FRECIPE-NEXT: frsqrte.s $fa2, $fa0 +; LA64D-FRECIPE-NEXT: fmul.s $fa2, $fa0, $fa2 +; LA64D-FRECIPE-NEXT: fmul.s $fa3, $fa0, $fa2 +; LA64D-FRECIPE-NEXT: fmadd.s $fa1, $fa3, $fa2, $fa1 +; LA64D-FRECIPE-NEXT: pcalau12i $a3, %pc_hi20(.LCPI6_1) +; LA64D-FRECIPE-NEXT: fld.s $fa3, $a3, %pc_lo12(.LCPI6_1) +; LA64D-FRECIPE-NEXT: pcalau12i $a3, %pc_hi20(.LCPI6_2) +; LA64D-FRECIPE-NEXT: fld.s $fa4, $a3, %pc_lo12(.LCPI6_2) +; LA64D-FRECIPE-NEXT: pcalau12i $a3, %pc_hi20(.LCPI6_3) +; LA64D-FRECIPE-NEXT: fld.s $fa5, $a3, %pc_lo12(.LCPI6_3) +; LA64D-FRECIPE-NEXT: fmul.s $fa2, $fa2, $fa3 +; LA64D-FRECIPE-NEXT: fmul.s $fa1, $fa2, $fa1 +; LA64D-FRECIPE-NEXT: fmul.s $fa2, $fa1, $fa4 +; LA64D-FRECIPE-NEXT: fmul.s $fa3, $fa1, $fa5 +; LA64D-FRECIPE-NEXT: fmul.s $fa0, $fa0, $fa1 +; LA64D-FRECIPE-NEXT: fst.s $fa1, $a0, 0 +; LA64D-FRECIPE-NEXT: fst.s $fa2, $a1, 0 +; LA64D-FRECIPE-NEXT: fst.s $fa3, $a2, 0 +; LA64D-FRECIPE-NEXT: ret +; + %sqrt = tail call fast float @llvm.sqrt.f32(float %x) + %rsqrt = fdiv fast float 1.0, %sqrt + %r1 = fdiv fast float 42.0, %sqrt + %r2 = fdiv fast float 43.0, %sqrt + %sqrt_fast = fdiv fast float %x, %sqrt + store float %rsqrt, ptr %p1, align 8 + store float %r1, ptr %p2, align 8 + store float %r2, ptr %p3, align 8 + ret float %sqrt_fast +} + +define float @sqrt_simplify_before_recip_3_uses_order_f32(float %x, ptr %p1, ptr %p2) nounwind { +; LA32F-LABEL: sqrt_simplify_before_recip_3_uses_order_f32: +; LA32F: # %bb.0: +; LA32F-NEXT: pcalau12i $a2, %pc_hi20(.LCPI7_0) +; LA32F-NEXT: fld.s $fa1, $a2, %pc_lo12(.LCPI7_0) +; LA32F-NEXT: pcalau12i $a2, %pc_hi20(.LCPI7_1) +; LA32F-NEXT: fld.s $fa2, $a2, %pc_lo12(.LCPI7_1) +; LA32F-NEXT: fsqrt.s $fa0, $fa0 +; LA32F-NEXT: fdiv.s $fa1, $fa1, $fa0 +; LA32F-NEXT: fdiv.s $fa2, $fa2, $fa0 +; LA32F-NEXT: fst.s $fa1, $a0, 0 +; LA32F-NEXT: fst.s $fa2, $a1, 0 +; LA32F-NEXT: ret +; +; LA32F-FRECIPE-LABEL: sqrt_simplify_before_recip_3_uses_order_f32: +; LA32F-FRECIPE: # %bb.0: +; LA32F-FRECIPE-NEXT: frsqrte.s $fa1, $fa0 +; LA32F-FRECIPE-NEXT: pcalau12i $a2, %pc_hi20(.LCPI7_0) +; LA32F-FRECIPE-NEXT: fld.s $fa2, $a2, %pc_lo12(.LCPI7_0) +; LA32F-FRECIPE-NEXT: pcalau12i $a2, %pc_hi20(.LCPI7_1) +; LA32F-FRECIPE-NEXT: fld.s $fa3, $a2, %pc_lo12(.LCPI7_1) +; LA32F-FRECIPE-NEXT: fmul.s $fa1, $fa0, $fa1 +; LA32F-FRECIPE-NEXT: fmul.s $fa4, $fa0, $fa1 
+; LA32F-FRECIPE-NEXT: fmadd.s $fa2, $fa4, $fa1, $fa2 +; LA32F-FRECIPE-NEXT: fmul.s $fa1, $fa1, $fa3 +; LA32F-FRECIPE-NEXT: pcalau12i $a2, %pc_hi20(.LCPI7_2) +; LA32F-FRECIPE-NEXT: fld.s $fa3, $a2, %pc_lo12(.LCPI7_2) +; LA32F-FRECIPE-NEXT: pcalau12i $a2, %pc_hi20(.LCPI7_3) +; LA32F-FRECIPE-NEXT: fld.s $fa4, $a2, %pc_lo12(.LCPI7_3) +; LA32F-FRECIPE-NEXT: fmul.s $fa1, $fa1, $fa2 +; LA32F-FRECIPE-NEXT: fmul.s $fa0, $fa0, $fa1 +; LA32F-FRECIPE-NEXT: fmul.s $fa2, $fa1, $fa3 +; LA32F-FRECIPE-NEXT: fmul.s $fa1, $fa1, $fa4 +; LA32F-FRECIPE-NEXT: fst.s $fa2, $a0, 0 +; LA32F-FRECIPE-NEXT: fst.s $fa1, $a1, 0 +; LA32F-FRECIPE-NEXT: ret +; +; LA64D-LABEL: sqrt_simplify_before_recip_3_uses_order_f32: +; LA64D: # %bb.0: +; LA64D-NEXT: pcalau12i $a2, %pc_hi20(.LCPI7_0) +; LA64D-NEXT: fld.s $fa1, $a2, %pc_lo12(.LCPI7_0) +; LA64D-NEXT: pcalau12i $a2, %pc_hi20(.LCPI7_1) +; LA64D-NEXT: fld.s $fa2, $a2, %pc_lo12(.LCPI7_1) +; LA64D-NEXT: fsqrt.s $fa0, $fa0 +; LA64D-NEXT: fdiv.s $fa1, $fa1, $fa0 +; LA64D-NEXT: fdiv.s $fa2, $fa2, $fa0 +; LA64D-NEXT: fst.s $fa1, $a0, 0 +; LA64D-NEXT: fst.s $fa2, $a1, 0 +; LA64D-NEXT: ret +; +; LA64D-FRECIPE-LABEL: sqrt_simplify_before_recip_3_uses_order_f32: +; LA64D-FRECIPE: # %bb.0: +; LA64D-FRECIPE-NEXT: frsqrte.s $fa1, $fa0 +; LA64D-FRECIPE-NEXT: pcalau12i $a2, %pc_hi20(.LCPI7_0) +; LA64D-FRECIPE-NEXT: fld.s $fa2, $a2, %pc_lo12(.LCPI7_0) +; LA64D-FRECIPE-NEXT: pcalau12i $a2, %pc_hi20(.LCPI7_1) +; LA64D-FRECIPE-NEXT: fld.s $fa3, $a2, %pc_lo12(.LCPI7_1) +; LA64D-FRECIPE-NEXT: fmul.s $fa1, $fa0, $fa1 +; LA64D-FRECIPE-NEXT: fmul.s $fa4, $fa0, $fa1 +; LA64D-FRECIPE-NEXT: fmadd.s $fa2, $fa4, $fa1, $fa2 +; LA64D-FRECIPE-NEXT: fmul.s $fa1, $fa1, $fa3 +; LA64D-FRECIPE-NEXT: pcalau12i $a2, %pc_hi20(.LCPI7_2) +; LA64D-FRECIPE-NEXT: fld.s $fa3, $a2, %pc_lo12(.LCPI7_2) +; LA64D-FRECIPE-NEXT: pcalau12i $a2, %pc_hi20(.LCPI7_3) +; LA64D-FRECIPE-NEXT: fld.s $fa4, $a2, %pc_lo12(.LCPI7_3) +; LA64D-FRECIPE-NEXT: fmul.s $fa1, $fa1, $fa2 +; LA64D-FRECIPE-NEXT: fmul.s $fa0, $fa0, $fa1 +; LA64D-FRECIPE-NEXT: fmul.s $fa2, $fa1, $fa3 +; LA64D-FRECIPE-NEXT: fmul.s $fa1, $fa1, $fa4 +; LA64D-FRECIPE-NEXT: fst.s $fa2, $a0, 0 +; LA64D-FRECIPE-NEXT: fst.s $fa1, $a1, 0 +; LA64D-FRECIPE-NEXT: ret +; + %sqrt = tail call fast float @llvm.sqrt.f32(float %x) + %sqrt_fast = fdiv fast float %x, %sqrt + %r1 = fdiv fast float 42.0, %sqrt + %r2 = fdiv fast float 43.0, %sqrt + store float %r1, ptr %p1, align 8 + store float %r2, ptr %p2, align 8 + ret float %sqrt_fast +} diff --git a/llvm/test/CodeGen/LoongArch/lasx/fdiv-reciprocal-estimate.ll b/llvm/test/CodeGen/LoongArch/lasx/fdiv-reciprocal-estimate.ll new file mode 100644 index 0000000000000000000000000000000000000000..769d9ef81faf3918e059fb8d333c8187acd54981 --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/lasx/fdiv-reciprocal-estimate.ll @@ -0,0 +1,114 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc --mtriple=loongarch64 --mattr=+lasx,-frecipe < %s | FileCheck %s --check-prefix=FAULT +; RUN: llc --mtriple=loongarch64 --mattr=+lasx,+frecipe < %s | FileCheck %s + +define void @fdiv_v8f32(ptr %res, ptr %a0, ptr %a1) nounwind { +; FAULT-LABEL: fdiv_v8f32: +; FAULT: # %bb.0: +; FAULT-NEXT: xvld $xr0, $a1, 0 +; FAULT-NEXT: xvld $xr1, $a2, 0 +; FAULT-NEXT: xvfdiv.s $xr0, $xr0, $xr1 +; FAULT-NEXT: xvst $xr0, $a0, 0 +; FAULT-NEXT: ret +; +; CHECK-LABEL: fdiv_v8f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a2, 0 +; CHECK-NEXT: xvld $xr1, $a1, 0 +; CHECK-NEXT: xvfrecipe.s $xr2, $xr0 +; 
CHECK-NEXT: xvfmul.s $xr3, $xr1, $xr2 +; CHECK-NEXT: xvfnmsub.s $xr0, $xr0, $xr3, $xr1 +; CHECK-NEXT: xvfmadd.s $xr0, $xr2, $xr0, $xr3 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <8 x float>, ptr %a0 + %v1 = load <8 x float>, ptr %a1 + %v2 = fdiv fast <8 x float> %v0, %v1 + store <8 x float> %v2, ptr %res + ret void +} + +define void @fdiv_v4f64(ptr %res, ptr %a0, ptr %a1) nounwind { +; FAULT-LABEL: fdiv_v4f64: +; FAULT: # %bb.0: +; FAULT-NEXT: xvld $xr0, $a1, 0 +; FAULT-NEXT: xvld $xr1, $a2, 0 +; FAULT-NEXT: xvfdiv.d $xr0, $xr0, $xr1 +; FAULT-NEXT: xvst $xr0, $a0, 0 +; FAULT-NEXT: ret +; +; CHECK-LABEL: fdiv_v4f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a2, 0 +; CHECK-NEXT: xvld $xr1, $a1, 0 +; CHECK-NEXT: lu52i.d $a1, $zero, -1025 +; CHECK-NEXT: xvreplgr2vr.d $xr2, $a1 +; CHECK-NEXT: xvfrecipe.d $xr3, $xr0 +; CHECK-NEXT: xvfmadd.d $xr2, $xr0, $xr3, $xr2 +; CHECK-NEXT: xvfnmsub.d $xr2, $xr2, $xr3, $xr3 +; CHECK-NEXT: xvfmul.d $xr3, $xr1, $xr2 +; CHECK-NEXT: xvfnmsub.d $xr0, $xr0, $xr3, $xr1 +; CHECK-NEXT: xvfmadd.d $xr0, $xr2, $xr0, $xr3 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <4 x double>, ptr %a0 + %v1 = load <4 x double>, ptr %a1 + %v2 = fdiv fast <4 x double> %v0, %v1 + store <4 x double> %v2, ptr %res + ret void +} + +;; 1.0 / vec +define void @one_fdiv_v8f32(ptr %res, ptr %a0) nounwind { +; FAULT-LABEL: one_fdiv_v8f32: +; FAULT: # %bb.0: +; FAULT-NEXT: xvld $xr0, $a1, 0 +; FAULT-NEXT: xvfrecip.s $xr0, $xr0 +; FAULT-NEXT: xvst $xr0, $a0, 0 +; FAULT-NEXT: ret +; +; CHECK-LABEL: one_fdiv_v8f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvfrecipe.s $xr1, $xr0 +; CHECK-NEXT: lu12i.w $a1, -264192 +; CHECK-NEXT: xvreplgr2vr.w $xr2, $a1 +; CHECK-NEXT: xvfmadd.s $xr0, $xr0, $xr1, $xr2 +; CHECK-NEXT: xvfnmsub.s $xr0, $xr0, $xr1, $xr1 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <8 x float>, ptr %a0 + %div = fdiv fast <8 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, %v0 + store <8 x float> %div, ptr %res + ret void +} + +define void @one_fdiv_v4f64(ptr %res, ptr %a0) nounwind { +; FAULT-LABEL: one_fdiv_v4f64: +; FAULT: # %bb.0: +; FAULT-NEXT: xvld $xr0, $a1, 0 +; FAULT-NEXT: xvfrecip.d $xr0, $xr0 +; FAULT-NEXT: xvst $xr0, $a0, 0 +; FAULT-NEXT: ret +; +; CHECK-LABEL: one_fdiv_v4f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvfrecipe.d $xr1, $xr0 +; CHECK-NEXT: lu52i.d $a1, $zero, 1023 +; CHECK-NEXT: xvreplgr2vr.d $xr2, $a1 +; CHECK-NEXT: xvfnmsub.d $xr3, $xr0, $xr1, $xr2 +; CHECK-NEXT: xvfmadd.d $xr1, $xr1, $xr3, $xr1 +; CHECK-NEXT: xvfnmsub.d $xr0, $xr0, $xr1, $xr2 +; CHECK-NEXT: xvfmadd.d $xr0, $xr1, $xr0, $xr1 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <4 x double>, ptr %a0 + %div = fdiv fast <4 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>, %v0 + store <4 x double> %div, ptr %res + ret void +} diff --git a/llvm/test/CodeGen/LoongArch/lasx/fsqrt-reciprocal-estimate.ll b/llvm/test/CodeGen/LoongArch/lasx/fsqrt-reciprocal-estimate.ll new file mode 100644 index 0000000000000000000000000000000000000000..48fd12697417acfe130a8928ba4fae6392c576e3 --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/lasx/fsqrt-reciprocal-estimate.ll @@ -0,0 +1,75 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc --mtriple=loongarch64 --mattr=+lasx,-frecipe < %s | FileCheck %s --check-prefix=FAULT +; RUN: llc --mtriple=loongarch64 --mattr=+lasx,+frecipe < %s | FileCheck %s + +;; 1.0 / (fsqrt vec) +define void @one_div_sqrt_v8f32(ptr %res, ptr %a0)
nounwind { +; FAULT-LABEL: one_div_sqrt_v8f32: +; FAULT: # %bb.0: # %entry +; FAULT-NEXT: xvld $xr0, $a1, 0 +; FAULT-NEXT: xvfrsqrt.s $xr0, $xr0 +; FAULT-NEXT: xvst $xr0, $a0, 0 +; FAULT-NEXT: ret +; +; CHECK-LABEL: one_div_sqrt_v8f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvfrsqrte.s $xr1, $xr0 +; CHECK-NEXT: xvfmul.s $xr1, $xr0, $xr1 +; CHECK-NEXT: xvfmul.s $xr0, $xr0, $xr1 +; CHECK-NEXT: lu12i.w $a1, -261120 +; CHECK-NEXT: xvreplgr2vr.w $xr2, $a1 +; CHECK-NEXT: xvfmadd.s $xr0, $xr0, $xr1, $xr2 +; CHECK-NEXT: lu12i.w $a1, -266240 +; CHECK-NEXT: xvreplgr2vr.w $xr2, $a1 +; CHECK-NEXT: xvfmul.s $xr1, $xr1, $xr2 +; CHECK-NEXT: xvfmul.s $xr0, $xr1, $xr0 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <8 x float>, ptr %a0, align 16 + %sqrt = call fast <8 x float> @llvm.sqrt.v8f32 (<8 x float> %v0) + %div = fdiv fast <8 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, %sqrt + store <8 x float> %div, ptr %res, align 16 + ret void +} + +define void @one_div_sqrt_v4f64(ptr %res, ptr %a0) nounwind { +; FAULT-LABEL: one_div_sqrt_v4f64: +; FAULT: # %bb.0: # %entry +; FAULT-NEXT: xvld $xr0, $a1, 0 +; FAULT-NEXT: xvfrsqrt.d $xr0, $xr0 +; FAULT-NEXT: xvst $xr0, $a0, 0 +; FAULT-NEXT: ret +; +; CHECK-LABEL: one_div_sqrt_v4f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvfrsqrte.d $xr1, $xr0 +; CHECK-NEXT: xvfmul.d $xr1, $xr0, $xr1 +; CHECK-NEXT: xvfmul.d $xr2, $xr0, $xr1 +; CHECK-NEXT: ori $a1, $zero, 0 +; CHECK-NEXT: lu32i.d $a1, -524288 +; CHECK-NEXT: lu52i.d $a1, $a1, -1024 +; CHECK-NEXT: xvreplgr2vr.d $xr3, $a1 +; CHECK-NEXT: xvfmadd.d $xr2, $xr2, $xr1, $xr3 +; CHECK-NEXT: lu52i.d $a1, $zero, -1026 +; CHECK-NEXT: xvreplgr2vr.d $xr4, $a1 +; CHECK-NEXT: xvfmul.d $xr1, $xr1, $xr4 +; CHECK-NEXT: xvfmul.d $xr1, $xr1, $xr2 +; CHECK-NEXT: xvfmul.d $xr0, $xr0, $xr1 +; CHECK-NEXT: xvfmadd.d $xr0, $xr0, $xr1, $xr3 +; CHECK-NEXT: xvfmul.d $xr1, $xr1, $xr4 +; CHECK-NEXT: xvfmul.d $xr0, $xr1, $xr0 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <4 x double>, ptr %a0, align 16 + %sqrt = call fast <4 x double> @llvm.sqrt.v4f64 (<4 x double> %v0) + %div = fdiv fast <4 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>, %sqrt + store <4 x double> %div, ptr %res, align 16 + ret void +} + +declare <8 x float> @llvm.sqrt.v8f32(<8 x float>) +declare <4 x double> @llvm.sqrt.v4f64(<4 x double>) diff --git a/llvm/test/CodeGen/LoongArch/lsx/fdiv-reciprocal-estimate.ll b/llvm/test/CodeGen/LoongArch/lsx/fdiv-reciprocal-estimate.ll new file mode 100644 index 0000000000000000000000000000000000000000..21dbbf310ad8703a45d3e7228db9566144043f2c --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/lsx/fdiv-reciprocal-estimate.ll @@ -0,0 +1,114 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc --mtriple=loongarch64 --mattr=+lsx,-frecipe < %s | FileCheck %s --check-prefix=FAULT +; RUN: llc --mtriple=loongarch64 --mattr=+lsx,+frecipe < %s | FileCheck %s + +define void @fdiv_v4f32(ptr %res, ptr %a0, ptr %a1) nounwind { +; FAULT-LABEL: fdiv_v4f32: +; FAULT: # %bb.0: # %entry +; FAULT-NEXT: vld $vr0, $a1, 0 +; FAULT-NEXT: vld $vr1, $a2, 0 +; FAULT-NEXT: vfdiv.s $vr0, $vr0, $vr1 +; FAULT-NEXT: vst $vr0, $a0, 0 +; FAULT-NEXT: ret +; +; CHECK-LABEL: fdiv_v4f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a2, 0 +; CHECK-NEXT: vld $vr1, $a1, 0 +; CHECK-NEXT: vfrecipe.s $vr2, $vr0 +; CHECK-NEXT: vfmul.s $vr3, $vr1, $vr2 +; CHECK-NEXT: vfnmsub.s $vr0, $vr0, $vr3, $vr1 +; CHECK-NEXT: vfmadd.s $vr0, $vr2, $vr0, $vr3 +; CHECK-NEXT: vst
$vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <4 x float>, ptr %a0 + %v1 = load <4 x float>, ptr %a1 + %v2 = fdiv fast <4 x float> %v0, %v1 + store <4 x float> %v2, ptr %res + ret void +} + +define void @fdiv_v2f64(ptr %res, ptr %a0, ptr %a1) nounwind { +; FAULT-LABEL: fdiv_v2f64: +; FAULT: # %bb.0: # %entry +; FAULT-NEXT: vld $vr0, $a1, 0 +; FAULT-NEXT: vld $vr1, $a2, 0 +; FAULT-NEXT: vfdiv.d $vr0, $vr0, $vr1 +; FAULT-NEXT: vst $vr0, $a0, 0 +; FAULT-NEXT: ret +; +; CHECK-LABEL: fdiv_v2f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a2, 0 +; CHECK-NEXT: vld $vr1, $a1, 0 +; CHECK-NEXT: lu52i.d $a1, $zero, -1025 +; CHECK-NEXT: vreplgr2vr.d $vr2, $a1 +; CHECK-NEXT: vfrecipe.d $vr3, $vr0 +; CHECK-NEXT: vfmadd.d $vr2, $vr0, $vr3, $vr2 +; CHECK-NEXT: vfnmsub.d $vr2, $vr2, $vr3, $vr3 +; CHECK-NEXT: vfmul.d $vr3, $vr1, $vr2 +; CHECK-NEXT: vfnmsub.d $vr0, $vr0, $vr3, $vr1 +; CHECK-NEXT: vfmadd.d $vr0, $vr2, $vr0, $vr3 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <2 x double>, ptr %a0 + %v1 = load <2 x double>, ptr %a1 + %v2 = fdiv fast <2 x double> %v0, %v1 + store <2 x double> %v2, ptr %res + ret void +} + +;; 1.0 / vec +define void @one_fdiv_v4f32(ptr %res, ptr %a0) nounwind { +; FAULT-LABEL: one_fdiv_v4f32: +; FAULT: # %bb.0: # %entry +; FAULT-NEXT: vld $vr0, $a1, 0 +; FAULT-NEXT: vfrecip.s $vr0, $vr0 +; FAULT-NEXT: vst $vr0, $a0, 0 +; FAULT-NEXT: ret +; +; CHECK-LABEL: one_fdiv_v4f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vfrecipe.s $vr1, $vr0 +; CHECK-NEXT: lu12i.w $a1, -264192 +; CHECK-NEXT: vreplgr2vr.w $vr2, $a1 +; CHECK-NEXT: vfmadd.s $vr0, $vr0, $vr1, $vr2 +; CHECK-NEXT: vfnmsub.s $vr0, $vr0, $vr1, $vr1 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <4 x float>, ptr %a0 + %div = fdiv fast <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, %v0 + store <4 x float> %div, ptr %res + ret void +} + +define void @one_fdiv_v2f64(ptr %res, ptr %a0) nounwind { +; FAULT-LABEL: one_fdiv_v2f64: +; FAULT: # %bb.0: # %entry +; FAULT-NEXT: vld $vr0, $a1, 0 +; FAULT-NEXT: vfrecip.d $vr0, $vr0 +; FAULT-NEXT: vst $vr0, $a0, 0 +; FAULT-NEXT: ret +; +; CHECK-LABEL: one_fdiv_v2f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vfrecipe.d $vr1, $vr0 +; CHECK-NEXT: lu52i.d $a1, $zero, 1023 +; CHECK-NEXT: vreplgr2vr.d $vr2, $a1 +; CHECK-NEXT: vfnmsub.d $vr3, $vr0, $vr1, $vr2 +; CHECK-NEXT: vfmadd.d $vr1, $vr1, $vr3, $vr1 +; CHECK-NEXT: vfnmsub.d $vr0, $vr0, $vr1, $vr2 +; CHECK-NEXT: vfmadd.d $vr0, $vr1, $vr0, $vr1 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <2 x double>, ptr %a0 + %div = fdiv fast <2 x double> <double 1.000000e+00, double 1.000000e+00>, %v0 + store <2 x double> %div, ptr %res + ret void +} diff --git a/llvm/test/CodeGen/LoongArch/lsx/fsqrt-reciprocal-estimate.ll b/llvm/test/CodeGen/LoongArch/lsx/fsqrt-reciprocal-estimate.ll new file mode 100644 index 0000000000000000000000000000000000000000..912d06242f7d3eb5b8275ce7ef94c5dc2cb2a354 --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/lsx/fsqrt-reciprocal-estimate.ll @@ -0,0 +1,75 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc --mtriple=loongarch64 --mattr=+lsx,-frecipe < %s | FileCheck %s --check-prefix=FAULT +; RUN: llc --mtriple=loongarch64 --mattr=+lsx,+frecipe < %s | FileCheck %s + +;; 1.0 / (fsqrt vec) +define void @one_div_sqrt_v4f32(ptr %res, ptr %a0) nounwind { +; FAULT-LABEL: one_div_sqrt_v4f32: +; FAULT: # %bb.0: # %entry +; FAULT-NEXT: vld $vr0, $a1, 0 +; FAULT-NEXT: vfrsqrt.s $vr0, $vr0 +; FAULT-NEXT: vst
$vr0, $a0, 0 +; FAULT-NEXT: ret +; +; CHECK-LABEL: one_div_sqrt_v4f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vfrsqrte.s $vr1, $vr0 +; CHECK-NEXT: vfmul.s $vr1, $vr0, $vr1 +; CHECK-NEXT: vfmul.s $vr0, $vr0, $vr1 +; CHECK-NEXT: lu12i.w $a1, -261120 +; CHECK-NEXT: vreplgr2vr.w $vr2, $a1 +; CHECK-NEXT: vfmadd.s $vr0, $vr0, $vr1, $vr2 +; CHECK-NEXT: lu12i.w $a1, -266240 +; CHECK-NEXT: vreplgr2vr.w $vr2, $a1 +; CHECK-NEXT: vfmul.s $vr1, $vr1, $vr2 +; CHECK-NEXT: vfmul.s $vr0, $vr1, $vr0 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <4 x float>, ptr %a0, align 16 + %sqrt = call fast <4 x float> @llvm.sqrt.v4f32 (<4 x float> %v0) + %div = fdiv fast <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, %sqrt + store <4 x float> %div, ptr %res, align 16 + ret void +} + +define void @one_div_sqrt_v2f64(ptr %res, ptr %a0) nounwind { +; FAULT-LABEL: one_div_sqrt_v2f64: +; FAULT: # %bb.0: # %entry +; FAULT-NEXT: vld $vr0, $a1, 0 +; FAULT-NEXT: vfrsqrt.d $vr0, $vr0 +; FAULT-NEXT: vst $vr0, $a0, 0 +; FAULT-NEXT: ret +; +; CHECK-LABEL: one_div_sqrt_v2f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vfrsqrte.d $vr1, $vr0 +; CHECK-NEXT: vfmul.d $vr1, $vr0, $vr1 +; CHECK-NEXT: vfmul.d $vr2, $vr0, $vr1 +; CHECK-NEXT: ori $a1, $zero, 0 +; CHECK-NEXT: lu32i.d $a1, -524288 +; CHECK-NEXT: lu52i.d $a1, $a1, -1024 +; CHECK-NEXT: vreplgr2vr.d $vr3, $a1 +; CHECK-NEXT: vfmadd.d $vr2, $vr2, $vr1, $vr3 +; CHECK-NEXT: lu52i.d $a1, $zero, -1026 +; CHECK-NEXT: vreplgr2vr.d $vr4, $a1 +; CHECK-NEXT: vfmul.d $vr1, $vr1, $vr4 +; CHECK-NEXT: vfmul.d $vr1, $vr1, $vr2 +; CHECK-NEXT: vfmul.d $vr0, $vr0, $vr1 +; CHECK-NEXT: vfmadd.d $vr0, $vr0, $vr1, $vr3 +; CHECK-NEXT: vfmul.d $vr1, $vr1, $vr4 +; CHECK-NEXT: vfmul.d $vr0, $vr1, $vr0 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <2 x double>, ptr %a0, align 16 + %sqrt = call fast <2 x double> @llvm.sqrt.v2f64 (<2 x double> %v0) + %div = fdiv fast <2 x double> <double 1.000000e+00, double 1.000000e+00>, %sqrt + store <2 x double> %div, ptr %res, align 16 + ret void +} + +declare <4 x float> @llvm.sqrt.v4f32(<4 x float>) +declare <2 x double> @llvm.sqrt.v2f64(<2 x double>) diff --git a/llvm/test/CodeGen/LoongArch/naked-fn-with-frame-pointer.ll b/llvm/test/CodeGen/LoongArch/naked-fn-with-frame-pointer.ll new file mode 100644 index 0000000000000000000000000000000000000000..9bb449101683d6137695b8939965064b1a26f11c --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/naked-fn-with-frame-pointer.ll @@ -0,0 +1,45 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple loongarch32 -mattr +d | FileCheck %s -check-prefixes=CHECK-32 +; RUN: llc < %s -mtriple loongarch64 -mattr +d | FileCheck %s -check-prefixes=CHECK-64 + +declare dso_local void @main() + +define dso_local void @naked() naked "frame-pointer"="all" { +; CHECK-32-LABEL: naked: +; CHECK-32: # %bb.0: +; CHECK-32-NEXT: bl main +; +; CHECK-64-LABEL: naked: +; CHECK-64: # %bb.0: +; CHECK-64-NEXT: bl main + call void @main() + unreachable +} + +define dso_local void @normal() "frame-pointer"="all" { +; CHECK-32-LABEL: normal: +; CHECK-32: # %bb.0: +; CHECK-32-NEXT: addi.w $sp, $sp, -16 +; CHECK-32-NEXT: .cfi_def_cfa_offset 16 +; CHECK-32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; CHECK-32-NEXT: st.w $fp, $sp, 8 # 4-byte Folded Spill +; CHECK-32-NEXT: .cfi_offset 1, -4 +; CHECK-32-NEXT: .cfi_offset 22, -8 +; CHECK-32-NEXT: addi.w $fp, $sp, 16 +; CHECK-32-NEXT: .cfi_def_cfa 22, 0 +; CHECK-32-NEXT: bl main +; +; CHECK-64-LABEL:
normal: +; CHECK-64: # %bb.0: +; CHECK-64-NEXT: addi.d $sp, $sp, -16 +; CHECK-64-NEXT: .cfi_def_cfa_offset 16 +; CHECK-64-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; CHECK-64-NEXT: st.d $fp, $sp, 0 # 8-byte Folded Spill +; CHECK-64-NEXT: .cfi_offset 1, -8 +; CHECK-64-NEXT: .cfi_offset 22, -16 +; CHECK-64-NEXT: addi.d $fp, $sp, 16 +; CHECK-64-NEXT: .cfi_def_cfa 22, 0 +; CHECK-64-NEXT: bl main + call void @main() + unreachable +} diff --git a/llvm/test/CodeGen/M68k/naked-fn-with-frame-pointer.ll b/llvm/test/CodeGen/M68k/naked-fn-with-frame-pointer.ll new file mode 100644 index 0000000000000000000000000000000000000000..807c52c39b6e6a6906902fe4d0e38be877e68d06 --- /dev/null +++ b/llvm/test/CodeGen/M68k/naked-fn-with-frame-pointer.ll @@ -0,0 +1,26 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple m68k | FileCheck %s -check-prefixes=CHECK + +declare dso_local void @main() + +define dso_local void @naked() naked "frame-pointer"="all" { +; CHECK-LABEL: naked: +; CHECK: .cfi_startproc +; CHECK-NEXT: ; %bb.0: +; CHECK-NEXT: jsr main + call void @main() + unreachable +} + +define dso_local void @normal() "frame-pointer"="all" { +; CHECK-LABEL: normal: +; CHECK: .cfi_startproc +; CHECK-NEXT: ; %bb.0: +; CHECK-NEXT: link.w %a6, #0 +; CHECK-NEXT: .cfi_def_cfa_offset -8 +; CHECK-NEXT: .cfi_offset %a6, -8 +; CHECK-NEXT: .cfi_def_cfa_register %a6 +; CHECK-NEXT: jsr main + call void @main() + unreachable +} diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir index 51795a4fea515ec79ca272558d0c11bb4c4bcc6e..ebbb89b7816c584eddfad7550b37cf17b863d506 100644 --- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir +++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir @@ -578,18 +578,3 @@ body: | SI_RETURN ... ---- -name: vregs -# FULL: registers: -# FULL-NEXT: - { id: 0, class: vgpr_32, preferred-register: '$vgpr1', flags: [ WWM_REG ] } -# FULL-NEXT: - { id: 1, class: sgpr_64, preferred-register: '$sgpr0_sgpr1', flags: [ ] } -# FULL-NEXT: - { id: 2, class: sgpr_64, preferred-register: '', flags: [ ] } -registers: - - { id: 0, class: vgpr_32, preferred-register: $vgpr1, flags: [ WWM_REG ]} - - { id: 1, class: sgpr_64, preferred-register: $sgpr0_sgpr1 } - - { id: 2, class: sgpr_64, flags: [ ] } -body: | - bb.0: - %2:sgpr_64 = COPY %1 - %1:sgpr_64 = COPY %0 -... 
diff --git a/llvm/test/CodeGen/MSP430/naked-fn-with-frame-pointer.ll b/llvm/test/CodeGen/MSP430/naked-fn-with-frame-pointer.ll new file mode 100644 index 0000000000000000000000000000000000000000..2fdb01005bb280293211cbcfe901883e7dd2c744 --- /dev/null +++ b/llvm/test/CodeGen/MSP430/naked-fn-with-frame-pointer.ll @@ -0,0 +1,27 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple msp430 | FileCheck %s -check-prefixes=CHECK + +declare dso_local void @main() + +define dso_local void @naked() naked "frame-pointer"="all" { +; CHECK-LABEL: naked: +; CHECK: .cfi_startproc +; CHECK-NEXT: ; %bb.0: +; CHECK-NEXT: call #main + call void @main() + unreachable +} + +define dso_local void @normal() "frame-pointer"="all" { +; CHECK-LABEL: normal: +; CHECK: .cfi_startproc +; CHECK-NEXT: ; %bb.0: +; CHECK-NEXT: push r4 +; CHECK-NEXT: .cfi_def_cfa_offset 4 +; CHECK-NEXT: .cfi_offset r4, -4 +; CHECK-NEXT: mov r1, r4 +; CHECK-NEXT: .cfi_def_cfa_register r4 +; CHECK-NEXT: call #main + call void @main() + unreachable +} diff --git a/llvm/test/CodeGen/Mips/naked-fn-with-frame-pointer.ll b/llvm/test/CodeGen/Mips/naked-fn-with-frame-pointer.ll new file mode 100644 index 0000000000000000000000000000000000000000..a3820da8b221c9ab21fbd1c862658414d699c855 --- /dev/null +++ b/llvm/test/CodeGen/Mips/naked-fn-with-frame-pointer.ll @@ -0,0 +1,87 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple mips | FileCheck %s -check-prefixes=CHECK-32-BE +; RUN: llc < %s -mtriple mipsel | FileCheck %s -check-prefixes=CHECK-32-LE +; RUN: llc < %s -mtriple mips64 | FileCheck %s -check-prefixes=CHECK-64-BE +; RUN: llc < %s -mtriple mips64el | FileCheck %s -check-prefixes=CHECK-64-LE + +declare dso_local void @main() + +define dso_local void @naked() naked "frame-pointer"="all" { +; CHECK-32-BE-LABEL: naked: +; CHECK-32-BE: # %bb.0: +; CHECK-32-BE-NEXT: jal main +; CHECK-32-BE-NEXT: nop +; +; CHECK-32-LE-LABEL: naked: +; CHECK-32-LE: # %bb.0: +; CHECK-32-LE-NEXT: jal main +; CHECK-32-LE-NEXT: nop +; +; CHECK-64-BE-LABEL: naked: +; CHECK-64-BE: # %bb.0: +; CHECK-64-BE-NEXT: jal main +; CHECK-64-BE-NEXT: nop +; +; CHECK-64-LE-LABEL: naked: +; CHECK-64-LE: # %bb.0: +; CHECK-64-LE-NEXT: jal main +; CHECK-64-LE-NEXT: nop + call void @main() + unreachable +} + +define dso_local void @normal() "frame-pointer"="all" { +; CHECK-32-BE-LABEL: normal: +; CHECK-32-BE: # %bb.0: +; CHECK-32-BE-NEXT: addiu $sp, $sp, -24 +; CHECK-32-BE-NEXT: .cfi_def_cfa_offset 24 +; CHECK-32-BE-NEXT: sw $ra, 20($sp) # 4-byte Folded Spill +; CHECK-32-BE-NEXT: sw $fp, 16($sp) # 4-byte Folded Spill +; CHECK-32-BE-NEXT: .cfi_offset 31, -4 +; CHECK-32-BE-NEXT: .cfi_offset 30, -8 +; CHECK-32-BE-NEXT: move $fp, $sp +; CHECK-32-BE-NEXT: .cfi_def_cfa_register 30 +; CHECK-32-BE-NEXT: jal main +; CHECK-32-BE-NEXT: nop +; +; CHECK-32-LE-LABEL: normal: +; CHECK-32-LE: # %bb.0: +; CHECK-32-LE-NEXT: addiu $sp, $sp, -24 +; CHECK-32-LE-NEXT: .cfi_def_cfa_offset 24 +; CHECK-32-LE-NEXT: sw $ra, 20($sp) # 4-byte Folded Spill +; CHECK-32-LE-NEXT: sw $fp, 16($sp) # 4-byte Folded Spill +; CHECK-32-LE-NEXT: .cfi_offset 31, -4 +; CHECK-32-LE-NEXT: .cfi_offset 30, -8 +; CHECK-32-LE-NEXT: move $fp, $sp +; CHECK-32-LE-NEXT: .cfi_def_cfa_register 30 +; CHECK-32-LE-NEXT: jal main +; CHECK-32-LE-NEXT: nop +; +; CHECK-64-BE-LABEL: normal: +; CHECK-64-BE: # %bb.0: +; CHECK-64-BE-NEXT: daddiu $sp, $sp, -16 +; CHECK-64-BE-NEXT: .cfi_def_cfa_offset 16 +; 
CHECK-64-BE-NEXT: sd $ra, 8($sp) # 8-byte Folded Spill +; CHECK-64-BE-NEXT: sd $fp, 0($sp) # 8-byte Folded Spill +; CHECK-64-BE-NEXT: .cfi_offset 31, -8 +; CHECK-64-BE-NEXT: .cfi_offset 30, -16 +; CHECK-64-BE-NEXT: move $fp, $sp +; CHECK-64-BE-NEXT: .cfi_def_cfa_register 30 +; CHECK-64-BE-NEXT: jal main +; CHECK-64-BE-NEXT: nop +; +; CHECK-64-LE-LABEL: normal: +; CHECK-64-LE: # %bb.0: +; CHECK-64-LE-NEXT: daddiu $sp, $sp, -16 +; CHECK-64-LE-NEXT: .cfi_def_cfa_offset 16 +; CHECK-64-LE-NEXT: sd $ra, 8($sp) # 8-byte Folded Spill +; CHECK-64-LE-NEXT: sd $fp, 0($sp) # 8-byte Folded Spill +; CHECK-64-LE-NEXT: .cfi_offset 31, -8 +; CHECK-64-LE-NEXT: .cfi_offset 30, -16 +; CHECK-64-LE-NEXT: move $fp, $sp +; CHECK-64-LE-NEXT: .cfi_def_cfa_register 30 +; CHECK-64-LE-NEXT: jal main +; CHECK-64-LE-NEXT: nop + call void @main() + unreachable +} diff --git a/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll b/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll index b203a78d677308e93f0a02100209209111972797..33fa3afc94b89dea8f4cfd463a12df020992804c 100644 --- a/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll +++ b/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll @@ -2,6 +2,72 @@ ; RUN: opt < %s -S -nvptx-lower-args --mtriple nvptx64-nvidia-cuda -mcpu=sm_70 -mattr=+ptx77 | FileCheck %s --check-prefixes OPT ; RUN: llc < %s --mtriple nvptx64-nvidia-cuda -mcpu=sm_70 -mattr=+ptx77 | FileCheck %s --check-prefixes PTX +%struct.uint4 = type { i32, i32, i32, i32 } + +@gi = dso_local addrspace(1) externally_initialized global %struct.uint4 { i32 50462976, i32 117835012, i32 185207048, i32 252579084 }, align 16 + +; Function Attrs: mustprogress nofree noinline norecurse nosync nounwind willreturn memory(read, inaccessiblemem: none) +; Regular functions must still make a copy. `cvta.param` does not always work there.
+define dso_local noundef i32 @non_kernel_function(ptr nocapture noundef readonly byval(%struct.uint4) align 16 %a, i1 noundef zeroext %b, i32 noundef %c) local_unnamed_addr #0 { +; OPT-LABEL: define dso_local noundef i32 @non_kernel_function( +; OPT-SAME: ptr nocapture noundef readonly byval([[STRUCT_UINT4:%.*]]) align 16 [[A:%.*]], i1 noundef zeroext [[B:%.*]], i32 noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +; OPT-NEXT: [[ENTRY:.*:]] +; OPT-NEXT: [[A1:%.*]] = alloca [[STRUCT_UINT4]], align 16 +; OPT-NEXT: [[A2:%.*]] = addrspacecast ptr [[A]] to ptr addrspace(101) +; OPT-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 16 [[A1]], ptr addrspace(101) align 16 [[A2]], i64 16, i1 false) +; OPT-NEXT: [[A_:%.*]] = select i1 [[B]], ptr [[A1]], ptr addrspacecast (ptr addrspace(1) @gi to ptr) +; OPT-NEXT: [[IDX_EXT:%.*]] = sext i32 [[C]] to i64 +; OPT-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i8, ptr [[A_]], i64 [[IDX_EXT]] +; OPT-NEXT: [[TMP0:%.*]] = load i32, ptr [[ADD_PTR]], align 1 +; OPT-NEXT: ret i32 [[TMP0]] +; +; PTX-LABEL: non_kernel_function( +; PTX: { +; PTX-NEXT: .local .align 16 .b8 __local_depot0[16]; +; PTX-NEXT: .reg .b64 %SP; +; PTX-NEXT: .reg .b64 %SPL; +; PTX-NEXT: .reg .pred %p<2>; +; PTX-NEXT: .reg .b16 %rs<3>; +; PTX-NEXT: .reg .b32 %r<11>; +; PTX-NEXT: .reg .b64 %rd<10>; +; PTX-EMPTY: +; PTX-NEXT: // %bb.0: // %entry +; PTX-NEXT: mov.u64 %SPL, __local_depot0; +; PTX-NEXT: cvta.local.u64 %SP, %SPL; +; PTX-NEXT: ld.param.u8 %rs1, [non_kernel_function_param_1]; +; PTX-NEXT: and.b16 %rs2, %rs1, 1; +; PTX-NEXT: setp.eq.b16 %p1, %rs2, 1; +; PTX-NEXT: ld.param.s32 %rd1, [non_kernel_function_param_2]; +; PTX-NEXT: add.u64 %rd2, %SP, 0; +; PTX-NEXT: or.b64 %rd3, %rd2, 8; +; PTX-NEXT: ld.param.u64 %rd4, [non_kernel_function_param_0+8]; +; PTX-NEXT: st.u64 [%rd3], %rd4; +; PTX-NEXT: ld.param.u64 %rd5, [non_kernel_function_param_0]; +; PTX-NEXT: st.u64 [%SP+0], %rd5; +; PTX-NEXT: mov.u64 %rd6, gi; +; PTX-NEXT: cvta.global.u64 %rd7, %rd6; +; PTX-NEXT: selp.b64 %rd8, %rd2, %rd7, %p1; +; PTX-NEXT: add.s64 %rd9, %rd8, %rd1; +; PTX-NEXT: ld.u8 %r1, [%rd9]; +; PTX-NEXT: ld.u8 %r2, [%rd9+1]; +; PTX-NEXT: shl.b32 %r3, %r2, 8; +; PTX-NEXT: or.b32 %r4, %r3, %r1; +; PTX-NEXT: ld.u8 %r5, [%rd9+2]; +; PTX-NEXT: shl.b32 %r6, %r5, 16; +; PTX-NEXT: ld.u8 %r7, [%rd9+3]; +; PTX-NEXT: shl.b32 %r8, %r7, 24; +; PTX-NEXT: or.b32 %r9, %r8, %r6; +; PTX-NEXT: or.b32 %r10, %r9, %r4; +; PTX-NEXT: st.param.b32 [func_retval0+0], %r10; +; PTX-NEXT: ret; +entry: + %a. 
= select i1 %b, ptr %a, ptr addrspacecast (ptr addrspace(1) @gi to ptr), !dbg !17 + %idx.ext = sext i32 %c to i64, !dbg !18 + %add.ptr = getelementptr inbounds i8, ptr %a., i64 %idx.ext, !dbg !18 + %0 = load i32, ptr %add.ptr, align 1, !dbg !19 + ret i32 %0, !dbg !23 +} + define void @grid_const_int(ptr byval(i32) align 4 %input1, i32 %input2, ptr %out, i32 %n) { ; PTX-LABEL: grid_const_int( ; PTX: { @@ -17,7 +83,7 @@ define void @grid_const_int(ptr byval(i32) align 4 %input1, i32 %input2, ptr %ou ; PTX-NEXT: st.global.u32 [%rd2], %r3; ; PTX-NEXT: ret; ; OPT-LABEL: define void @grid_const_int( -; OPT-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], i32 [[INPUT2:%.*]], ptr [[OUT:%.*]], i32 [[N:%.*]]) #[[ATTR0:[0-9]+]] { +; OPT-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], i32 [[INPUT2:%.*]], ptr [[OUT:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { ; OPT-NEXT: [[OUT2:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) ; OPT-NEXT: [[OUT3:%.*]] = addrspacecast ptr addrspace(1) [[OUT2]] to ptr ; OPT-NEXT: [[INPUT11:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101) @@ -106,14 +172,14 @@ define void @grid_const_escape(ptr byval(%struct.s) align 4 %input) { define void @multiple_grid_const_escape(ptr byval(%struct.s) align 4 %input, i32 %a, ptr byval(i32) align 4 %b) { ; PTX-LABEL: multiple_grid_const_escape( ; PTX: { -; PTX-NEXT: .local .align 4 .b8 __local_depot3[4]; +; PTX-NEXT: .local .align 4 .b8 __local_depot4[4]; ; PTX-NEXT: .reg .b64 %SP; ; PTX-NEXT: .reg .b64 %SPL; ; PTX-NEXT: .reg .b32 %r<4>; ; PTX-NEXT: .reg .b64 %rd<10>; ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: -; PTX-NEXT: mov.u64 %SPL, __local_depot3; +; PTX-NEXT: mov.u64 %SPL, __local_depot4; ; PTX-NEXT: cvta.local.u64 %SP, %SPL; ; PTX-NEXT: mov.b64 %rd2, multiple_grid_const_escape_param_0; ; PTX-NEXT: mov.b64 %rd3, multiple_grid_const_escape_param_2; @@ -342,10 +408,10 @@ define void @grid_const_phi(ptr byval(%struct.s) align 4 %input1, ptr %inout) { ; PTX-NEXT: cvta.param.u64 %rd8, %rd7; ; PTX-NEXT: ld.global.u32 %r1, [%rd1]; ; PTX-NEXT: setp.lt.s32 %p1, %r1, 0; -; PTX-NEXT: @%p1 bra $L__BB8_2; +; PTX-NEXT: @%p1 bra $L__BB9_2; ; PTX-NEXT: // %bb.1: // %second ; PTX-NEXT: add.s64 %rd8, %rd8, 4; -; PTX-NEXT: $L__BB8_2: // %merge +; PTX-NEXT: $L__BB9_2: // %merge ; PTX-NEXT: ld.u32 %r2, [%rd8]; ; PTX-NEXT: st.global.u32 [%rd1], %r2; ; PTX-NEXT: ret; @@ -402,13 +468,13 @@ define void @grid_const_phi_ngc(ptr byval(%struct.s) align 4 %input1, ptr byval( ; PTX-NEXT: cvta.param.u64 %rd11, %rd10; ; PTX-NEXT: ld.global.u32 %r1, [%rd1]; ; PTX-NEXT: setp.lt.s32 %p1, %r1, 0; -; PTX-NEXT: @%p1 bra $L__BB9_2; +; PTX-NEXT: @%p1 bra $L__BB10_2; ; PTX-NEXT: // %bb.1: // %second ; PTX-NEXT: mov.b64 %rd8, grid_const_phi_ngc_param_1; ; PTX-NEXT: mov.u64 %rd9, %rd8; ; PTX-NEXT: cvta.param.u64 %rd2, %rd9; ; PTX-NEXT: add.s64 %rd11, %rd2, 4; -; PTX-NEXT: $L__BB9_2: // %merge +; PTX-NEXT: $L__BB10_2: // %merge ; PTX-NEXT: ld.u32 %r2, [%rd11]; ; PTX-NEXT: st.global.u32 [%rd1], %r2; ; PTX-NEXT: ret; @@ -567,3 +633,5 @@ declare dso_local ptr @escape3(ptr, ptr, ptr) local_unnamed_addr !22 = !{ptr @grid_const_ptrtoint, !"kernel", i32 1, !"grid_constant", !23} !23 = !{i32 1} + + diff --git a/llvm/test/CodeGen/NVPTX/lower-byval-args.ll b/llvm/test/CodeGen/NVPTX/lower-byval-args.ll index a7dbc4c1620a5f68ccd509137abdf69a8c5e75b6..013694277039f61c997ab76117cc9935efe3f3ea 100644 --- a/llvm/test/CodeGen/NVPTX/lower-byval-args.ll +++ b/llvm/test/CodeGen/NVPTX/lower-byval-args.ll @@ -219,6 +219,22 @@ entry: ret void } +; Function Attrs: mustprogress nofree norecurse nosync 
nounwind willreturn memory(argmem: readwrite) +define dso_local void @memcpy_from_param_noalign (ptr nocapture noundef writeonly %out, ptr nocapture noundef readonly byval(%struct.S) %s) local_unnamed_addr #0 { +; COMMON-LABEL: define dso_local void @memcpy_from_param_noalign( +; COMMON-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr nocapture noundef readonly byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { +; COMMON-NEXT: [[ENTRY:.*:]] +; COMMON-NEXT: [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) +; COMMON-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) +; COMMON-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr +; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr [[OUT2]], ptr addrspace(101) [[S3]], i64 16, i1 true) +; COMMON-NEXT: ret void +; +entry: + tail call void @llvm.memcpy.p0.p0.i64(ptr %out, ptr %s, i64 16, i1 true) + ret void +} + ; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) define dso_local void @memcpy_to_param(ptr nocapture noundef readonly %in, ptr nocapture noundef readnone byval(%struct.S) align 4 %s) local_unnamed_addr #0 { ; COMMON-LABEL: define dso_local void @memcpy_to_param( @@ -426,7 +442,7 @@ attributes #1 = { nocallback nofree nounwind willreturn memory(argmem: readwrite attributes #2 = { nocallback nofree nounwind willreturn memory(argmem: write) } !llvm.module.flags = !{!0, !1, !2, !3} -!nvvm.annotations = !{!4, !5, !6, !7, !8, !9, !10, !11, !12, !13, !14, !15, !16, !17, !18, !19} +!nvvm.annotations = !{!4, !5, !6, !7, !8, !9, !10, !11, !12, !13, !14, !15, !16, !17, !18, !19, !23} !llvm.ident = !{!20, !21} !0 = !{i32 2, !"SDK Version", [2 x i32] [i32 11, i32 8]} @@ -451,3 +467,4 @@ attributes #2 = { nocallback nofree nounwind willreturn memory(argmem: write) } !19 = !{ptr @test_select_write, !"kernel", i32 1} !20 = !{!"clang version 20.0.0git"} !21 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"} +!23 = !{ptr @memcpy_from_param_noalign, !"kernel", i32 1} diff --git a/llvm/test/CodeGen/NVPTX/naked-fn-with-frame-pointer.ll b/llvm/test/CodeGen/NVPTX/naked-fn-with-frame-pointer.ll new file mode 100644 index 0000000000000000000000000000000000000000..a1f0577c2218bd7bc6699e4e402a36eb67adb413 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/naked-fn-with-frame-pointer.ll @@ -0,0 +1,73 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple nvptx | FileCheck %s -check-prefixes=CHECK-32 +; RUN: llc < %s -mtriple nvptx64 | FileCheck %s -check-prefixes=CHECK-64 + +declare dso_local void @main() + +define dso_local void @naked() naked "frame-pointer"="all" { +; CHECK-32-LABEL: naked( +; CHECK-32: { +; CHECK-32-EMPTY: +; CHECK-32-EMPTY: +; CHECK-32-NEXT: // %bb.0: +; CHECK-32-NEXT: { // callseq 0, 0 +; CHECK-32-NEXT: call.uni +; CHECK-32-NEXT: main, +; CHECK-32-NEXT: ( +; CHECK-32-NEXT: ); +; CHECK-32-NEXT: } // callseq 0 +; CHECK-32-NEXT: // begin inline asm +; CHECK-32-NEXT: exit; +; CHECK-32-NEXT: // end inline asm +; +; CHECK-64-LABEL: naked( +; CHECK-64: { +; CHECK-64-EMPTY: +; CHECK-64-EMPTY: +; CHECK-64-NEXT: // %bb.0: +; CHECK-64-NEXT: { // callseq 0, 0 +; CHECK-64-NEXT: call.uni +; CHECK-64-NEXT: main, +; CHECK-64-NEXT: ( +; CHECK-64-NEXT: ); +; CHECK-64-NEXT: } // callseq 0 +; CHECK-64-NEXT: // begin inline asm +; CHECK-64-NEXT: exit; +; CHECK-64-NEXT: // end inline asm + call void @main() + unreachable +} + +define dso_local void @normal() 
"frame-pointer"="all" { +; CHECK-32-LABEL: normal( +; CHECK-32: { +; CHECK-32-EMPTY: +; CHECK-32-EMPTY: +; CHECK-32-NEXT: // %bb.0: +; CHECK-32-NEXT: { // callseq 1, 0 +; CHECK-32-NEXT: call.uni +; CHECK-32-NEXT: main, +; CHECK-32-NEXT: ( +; CHECK-32-NEXT: ); +; CHECK-32-NEXT: } // callseq 1 +; CHECK-32-NEXT: // begin inline asm +; CHECK-32-NEXT: exit; +; CHECK-32-NEXT: // end inline asm +; +; CHECK-64-LABEL: normal( +; CHECK-64: { +; CHECK-64-EMPTY: +; CHECK-64-EMPTY: +; CHECK-64-NEXT: // %bb.0: +; CHECK-64-NEXT: { // callseq 1, 0 +; CHECK-64-NEXT: call.uni +; CHECK-64-NEXT: main, +; CHECK-64-NEXT: ( +; CHECK-64-NEXT: ); +; CHECK-64-NEXT: } // callseq 1 +; CHECK-64-NEXT: // begin inline asm +; CHECK-64-NEXT: exit; +; CHECK-64-NEXT: // end inline asm + call void @main() + unreachable +} diff --git a/llvm/test/CodeGen/PowerPC/early-ifcvt-no-isel.mir b/llvm/test/CodeGen/PowerPC/early-ifcvt-no-isel.mir index 99a3f80ff81b4d025dc4c4530c84bc3b52279d2d..794480bfc6cec1b24c3fe84bdce1fc346c30e3ee 100644 --- a/llvm/test/CodeGen/PowerPC/early-ifcvt-no-isel.mir +++ b/llvm/test/CodeGen/PowerPC/early-ifcvt-no-isel.mir @@ -1,6 +1,8 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4 # RUN: llc -mtriple=powerpc64-ibm-aix -mcpu=pwr7 -simplify-mir -verify-machineinstrs \ # RUN: -run-pass=early-ifcvt %s -o - | FileCheck %s +# RUN: llc -mtriple=powerpc64-ibm-aix -mcpu=pwr7 -simplify-mir -verify-each \ +# RUN: -passes=early-ifcvt %s -o - | FileCheck %s --- | source_filename = "" diff --git a/llvm/test/CodeGen/PowerPC/gcov_ctr_ref_init.ll b/llvm/test/CodeGen/PowerPC/gcov_ctr_ref_init.ll new file mode 100644 index 0000000000000000000000000000000000000000..4710d5c14e5b190d6722f63410a3f1d06b980683 --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/gcov_ctr_ref_init.ll @@ -0,0 +1,138 @@ +; Tests if the __llvm_gcov_ctr section contains a .ref pseudo-op +; referring to the __llvm_covinit section. 
+; RUN: llc < %s | FileCheck --check-prefixes=CHECK,CHECK-RW %s +; RUN: llc -mxcoff-roptr < %s | FileCheck --check-prefixes=CHECK,CHECK-RO %s + +target datalayout = "E-m:a-p:32:32-Fi32-i64:64-n32" +target triple = "powerpc-ibm-aix" + +; CHECK-RW: .csect __llvm_covinit[RW],3 +; CHECK-RO: .csect __llvm_covinit[RO],3 +; CHECK-NEXT: .align 3 # @__llvm_covinit_functions +; CHECK-NEXT: L..__llvm_covinit_functions: +; CHECK-NEXT: .vbyte 4, __llvm_gcov_writeout[DS] +; CHECK-NEXT: .vbyte 4, __llvm_gcov_reset[DS] +; CHECK: .csect __llvm_gcov_ctr_section[RW],3 +; CHECK-NEXT: .lglobl __llvm_gcov_ctr # @_MergedGlobals +; CHECK-NEXT: .lglobl __llvm_gcov_ctr.1 +; CHECK-NEXT: .align 3 +; CHECK-NEXT: L.._MergedGlobals: +; CHECK-NEXT: __llvm_gcov_ctr: +; CHECK-NEXT: .space 16 +; CHECK-NEXT: __llvm_gcov_ctr.1: +; CHECK-NEXT: .extern .llvm_gcda_start_file[PR] +; CHECK-NEXT: .extern .llvm_gcda_emit_function[PR] +; CHECK-NEXT: .extern .llvm_gcda_emit_arcs[PR] +; CHECK-NEXT: .extern .llvm_gcda_summary_info[PR] +; CHECK-NEXT: .extern .llvm_gcda_end_file[PR] +; CHECK-RW-NEXT: .ref __llvm_covinit[RW] +; CHECK-RO-NEXT: .ref __llvm_covinit[RO] + +%emit_function_args_ty = type { i32, i32, i32 } +%emit_arcs_args_ty = type { i32, ptr } +%file_info = type { %start_file_args_ty, i32, ptr, ptr } +%start_file_args_ty = type { ptr, i32, i32 } + +@__llvm_gcov_ctr = internal global [1 x i64] zeroinitializer, section "__llvm_gcov_ctr_section" +@__llvm_gcov_ctr.1 = internal global [1 x i64] zeroinitializer, section "__llvm_gcov_ctr_section" +@0 = private unnamed_addr constant [10 x i8] c"test.gcda\00", align 1 +@__llvm_internal_gcov_emit_function_args.0 = internal unnamed_addr constant [2 x %emit_function_args_ty] [%emit_function_args_ty { i32 0, i32 1961870044, i32 -801444649 }, %emit_function_args_ty { i32 1, i32 1795396728, i32 -801444649 }] +@__llvm_internal_gcov_emit_arcs_args.0 = internal unnamed_addr constant [2 x %emit_arcs_args_ty] [%emit_arcs_args_ty { i32 1, ptr @__llvm_gcov_ctr }, %emit_arcs_args_ty { i32 1, ptr @__llvm_gcov_ctr.1 }] +@__llvm_internal_gcov_emit_file_info = internal unnamed_addr constant [1 x %file_info] [%file_info { %start_file_args_ty { ptr @0, i32 875575338, i32 -801444649 }, i32 2, ptr @__llvm_internal_gcov_emit_function_args.0, ptr @__llvm_internal_gcov_emit_arcs_args.0 }] +@__llvm_covinit_functions = private constant { ptr, ptr } { ptr @__llvm_gcov_writeout, ptr @__llvm_gcov_reset }, section "__llvm_covinit", align 8 + +define i32 @bar() { +entry: + %gcov_ctr = load i64, ptr @__llvm_gcov_ctr, align 8 + %0 = add i64 %gcov_ctr, 1 + store i64 %0, ptr @__llvm_gcov_ctr, align 8 + ret i32 1 +} + +define i32 @main() { +entry: + %gcov_ctr = load i64, ptr @__llvm_gcov_ctr.1, align 8 + %0 = add i64 %gcov_ctr, 1 + store i64 %0, ptr @__llvm_gcov_ctr.1, align 8 + %retval = alloca i32, align 4 + store i32 0, ptr %retval, align 4 + %call = call i32 @bar() + %sub = sub nsw i32 %call, 1 + ret i32 %sub +} + +define internal void @__llvm_gcov_writeout() unnamed_addr { +entry: + br label %file.loop.header + +file.loop.header: ; preds = %file.loop.latch, %entry + %file_idx = phi i32 [ 0, %entry ], [ %next_file_idx, %file.loop.latch ] + %0 = getelementptr inbounds [1 x %file_info], ptr @__llvm_internal_gcov_emit_file_info, i32 0, i32 %file_idx + %start_file_args = getelementptr inbounds nuw %file_info, ptr %0, i32 0, i32 0 + %1 = getelementptr inbounds nuw %start_file_args_ty, ptr %start_file_args, i32 0, i32 0 + %filename = load ptr, ptr %1, align 4 + %2 = getelementptr inbounds nuw %start_file_args_ty, ptr 
%start_file_args, i32 0, i32 1 + %version = load i32, ptr %2, align 4 + %3 = getelementptr inbounds nuw %start_file_args_ty, ptr %start_file_args, i32 0, i32 2 + %stamp = load i32, ptr %3, align 4 + call void @llvm_gcda_start_file(ptr %filename, i32 %version, i32 %stamp) + %4 = getelementptr inbounds nuw %file_info, ptr %0, i32 0, i32 1 + %num_ctrs = load i32, ptr %4, align 4 + %5 = getelementptr inbounds nuw %file_info, ptr %0, i32 0, i32 2 + %emit_function_args = load ptr, ptr %5, align 4 + %6 = getelementptr inbounds nuw %file_info, ptr %0, i32 0, i32 3 + %emit_arcs_args = load ptr, ptr %6, align 4 + %7 = icmp slt i32 0, %num_ctrs + br i1 %7, label %counter.loop.header, label %file.loop.latch + +counter.loop.header: ; preds = %counter.loop.header, %file.loop.header + %ctr_idx = phi i32 [ 0, %file.loop.header ], [ %15, %counter.loop.header ] + %8 = getelementptr inbounds %emit_function_args_ty, ptr %emit_function_args, i32 %ctr_idx + %9 = getelementptr inbounds nuw %emit_function_args_ty, ptr %8, i32 0, i32 0 + %ident = load i32, ptr %9, align 4 + %10 = getelementptr inbounds nuw %emit_function_args_ty, ptr %8, i32 0, i32 1 + %func_checksum = load i32, ptr %10, align 4 + %11 = getelementptr inbounds nuw %emit_function_args_ty, ptr %8, i32 0, i32 2 + %cfg_checksum = load i32, ptr %11, align 4 + call void @llvm_gcda_emit_function(i32 %ident, i32 %func_checksum, i32 %cfg_checksum) + %12 = getelementptr inbounds %emit_arcs_args_ty, ptr %emit_arcs_args, i32 %ctr_idx + %13 = getelementptr inbounds nuw %emit_arcs_args_ty, ptr %12, i32 0, i32 0 + %num_counters = load i32, ptr %13, align 4 + %14 = getelementptr inbounds nuw %emit_arcs_args_ty, ptr %12, i32 0, i32 1 + %counters = load ptr, ptr %14, align 4 + call void @llvm_gcda_emit_arcs(i32 %num_counters, ptr %counters) + %15 = add i32 %ctr_idx, 1 + %16 = icmp slt i32 %15, %num_ctrs + br i1 %16, label %counter.loop.header, label %file.loop.latch + +file.loop.latch: ; preds = %counter.loop.header, %file.loop.header + call void @llvm_gcda_summary_info() + call void @llvm_gcda_end_file() + %next_file_idx = add i32 %file_idx, 1 + %17 = icmp slt i32 %next_file_idx, 1 + br i1 %17, label %file.loop.header, label %exit + +exit: ; preds = %file.loop.latch + ret void +} + +declare void @llvm_gcda_start_file(ptr, i32, i32) + +declare void @llvm_gcda_emit_function(i32, i32, i32) + +declare void @llvm_gcda_emit_arcs(i32, ptr) + +declare void @llvm_gcda_summary_info() + +declare void @llvm_gcda_end_file() + +define internal void @__llvm_gcov_reset() unnamed_addr { +entry: + call void @llvm.memset.p0.i64(ptr @__llvm_gcov_ctr, i8 0, i64 8, i1 false) + call void @llvm.memset.p0.i64(ptr @__llvm_gcov_ctr.1, i8 0, i64 8, i1 false) + ret void +} + +declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg) + + diff --git a/llvm/test/CodeGen/PowerPC/naked-fn-with-frame-pointer.ll b/llvm/test/CodeGen/PowerPC/naked-fn-with-frame-pointer.ll new file mode 100644 index 0000000000000000000000000000000000000000..59b1044084c645461ab8f2bf92797ca51282221f --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/naked-fn-with-frame-pointer.ll @@ -0,0 +1,87 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple powerpc | FileCheck %s -check-prefixes=CHECK-32-BE +; RUN: llc < %s -mtriple powerpcle | FileCheck %s -check-prefixes=CHECK-32-LE +; RUN: llc < %s -mtriple powerpc64 | FileCheck %s -check-prefixes=CHECK-64-BE +; RUN: llc < %s -mtriple powerpc64le | FileCheck %s -check-prefixes=CHECK-64-LE
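+ +; The naked attribute suppresses prologue/epilogue emission, so @naked must +; not contain any stack or frame-pointer setup even though +; "frame-pointer"="all" is in effect; @normal is the control case and does +; get a full frame.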
+ +declare dso_local void @main() + +define dso_local void @naked() naked "frame-pointer"="all" { +; CHECK-32-BE-LABEL: naked: +; CHECK-32-BE: # %bb.0: +; CHECK-32-BE-NEXT: bl main +; +; CHECK-32-LE-LABEL: naked: +; CHECK-32-LE: # %bb.0: +; CHECK-32-LE-NEXT: bl main +; +; CHECK-64-BE-LABEL: naked: +; CHECK-64-BE: # %bb.0: +; CHECK-64-BE-NEXT: bl main +; CHECK-64-BE-NEXT: nop +; +; CHECK-64-LE-LABEL: naked: +; CHECK-64-LE: # %bb.0: +; CHECK-64-LE-NEXT: bl main +; CHECK-64-LE-NEXT: nop + call void @main() + unreachable +} + +define dso_local void @normal() "frame-pointer"="all" { +; CHECK-32-BE-LABEL: normal: +; CHECK-32-BE: # %bb.0: +; CHECK-32-BE-NEXT: mflr 0 +; CHECK-32-BE-NEXT: stwu 1, -16(1) +; CHECK-32-BE-NEXT: stw 31, 12(1) +; CHECK-32-BE-NEXT: stw 0, 20(1) +; CHECK-32-BE-NEXT: .cfi_def_cfa_offset 16 +; CHECK-32-BE-NEXT: .cfi_offset r31, -4 +; CHECK-32-BE-NEXT: .cfi_offset lr, 4 +; CHECK-32-BE-NEXT: mr 31, 1 +; CHECK-32-BE-NEXT: .cfi_def_cfa_register r31 +; CHECK-32-BE-NEXT: bl main +; +; CHECK-32-LE-LABEL: normal: +; CHECK-32-LE: # %bb.0: +; CHECK-32-LE-NEXT: mflr 0 +; CHECK-32-LE-NEXT: stwu 1, -16(1) +; CHECK-32-LE-NEXT: stw 31, 12(1) +; CHECK-32-LE-NEXT: stw 0, 20(1) +; CHECK-32-LE-NEXT: .cfi_def_cfa_offset 16 +; CHECK-32-LE-NEXT: .cfi_offset r31, -4 +; CHECK-32-LE-NEXT: .cfi_offset lr, 4 +; CHECK-32-LE-NEXT: mr 31, 1 +; CHECK-32-LE-NEXT: .cfi_def_cfa_register r31 +; CHECK-32-LE-NEXT: bl main +; +; CHECK-64-BE-LABEL: normal: +; CHECK-64-BE: # %bb.0: +; CHECK-64-BE-NEXT: mflr 0 +; CHECK-64-BE-NEXT: std 31, -8(1) +; CHECK-64-BE-NEXT: stdu 1, -128(1) +; CHECK-64-BE-NEXT: std 0, 144(1) +; CHECK-64-BE-NEXT: .cfi_def_cfa_offset 128 +; CHECK-64-BE-NEXT: .cfi_offset r31, -8 +; CHECK-64-BE-NEXT: .cfi_offset lr, 16 +; CHECK-64-BE-NEXT: mr 31, 1 +; CHECK-64-BE-NEXT: .cfi_def_cfa_register r31 +; CHECK-64-BE-NEXT: bl main +; CHECK-64-BE-NEXT: nop +; +; CHECK-64-LE-LABEL: normal: +; CHECK-64-LE: # %bb.0: +; CHECK-64-LE-NEXT: mflr 0 +; CHECK-64-LE-NEXT: std 31, -8(1) +; CHECK-64-LE-NEXT: stdu 1, -48(1) +; CHECK-64-LE-NEXT: std 0, 64(1) +; CHECK-64-LE-NEXT: .cfi_def_cfa_offset 48 +; CHECK-64-LE-NEXT: .cfi_offset r31, -8 +; CHECK-64-LE-NEXT: .cfi_offset lr, 16 +; CHECK-64-LE-NEXT: mr 31, 1 +; CHECK-64-LE-NEXT: .cfi_def_cfa_register r31 +; CHECK-64-LE-NEXT: bl main +; CHECK-64-LE-NEXT: nop + call void @main() + unreachable +} diff --git a/llvm/test/CodeGen/PowerPC/scalar-rounding-ops.ll b/llvm/test/CodeGen/PowerPC/scalar-rounding-ops.ll index e950c0a2efac492578b88efc384cb146566405bb..2be370f638d5bd11a52306f0ce245b9b0e8f1e79 100644 --- a/llvm/test/CodeGen/PowerPC/scalar-rounding-ops.ll +++ b/llvm/test/CodeGen/PowerPC/scalar-rounding-ops.ll @@ -214,6 +214,48 @@ entry: declare i64 @llvm.lround.i64.f64(double) +define dso_local i32 @test_lroundi32f64(double %d) local_unnamed_addr { +; BE-LABEL: test_lroundi32f64: +; BE: # %bb.0: # %entry +; BE-NEXT: mflr r0 +; BE-NEXT: stdu r1, -112(r1) +; BE-NEXT: std r0, 128(r1) +; BE-NEXT: .cfi_def_cfa_offset 112 +; BE-NEXT: .cfi_offset lr, 16 +; BE-NEXT: bl lround +; BE-NEXT: nop +; BE-NEXT: addi r1, r1, 112 +; BE-NEXT: ld r0, 16(r1) +; BE-NEXT: mtlr r0 +; BE-NEXT: blr +; +; CHECK-LABEL: test_lroundi32f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: mflr r0 +; CHECK-NEXT: stdu r1, -32(r1) +; CHECK-NEXT: std r0, 48(r1) +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: .cfi_offset lr, 16 +; CHECK-NEXT: bl lround +; CHECK-NEXT: nop +; CHECK-NEXT: addi r1, r1, 32 +; CHECK-NEXT: ld r0, 16(r1) +; CHECK-NEXT: mtlr r0 +; CHECK-NEXT: blr +; +; FAST-LABEL: 
test_lroundi32f64: +; FAST: # %bb.0: # %entry +; FAST-NEXT: xsrdpi f0, f1 +; FAST-NEXT: fctiw f0, f0 +; FAST-NEXT: mffprwz r3, f0 +; FAST-NEXT: blr +entry: + %0 = tail call i32 @llvm.lround.i32.f64(double %d) + ret i32 %0 +} + +declare i32 @llvm.lround.i32.f64(double) + define dso_local i64 @test_lroundf(float %f) local_unnamed_addr { ; BE-LABEL: test_lroundf: ; BE: # %bb.0: # %entry @@ -256,6 +298,48 @@ entry: declare i64 @llvm.lround.i64.f32(float) +define dso_local i32 @test_lroundi32f32(float %d) local_unnamed_addr { +; BE-LABEL: test_lroundi32f32: +; BE: # %bb.0: # %entry +; BE-NEXT: mflr r0 +; BE-NEXT: stdu r1, -112(r1) +; BE-NEXT: std r0, 128(r1) +; BE-NEXT: .cfi_def_cfa_offset 112 +; BE-NEXT: .cfi_offset lr, 16 +; BE-NEXT: bl lroundf +; BE-NEXT: nop +; BE-NEXT: addi r1, r1, 112 +; BE-NEXT: ld r0, 16(r1) +; BE-NEXT: mtlr r0 +; BE-NEXT: blr +; +; CHECK-LABEL: test_lroundi32f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: mflr r0 +; CHECK-NEXT: stdu r1, -32(r1) +; CHECK-NEXT: std r0, 48(r1) +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: .cfi_offset lr, 16 +; CHECK-NEXT: bl lroundf +; CHECK-NEXT: nop +; CHECK-NEXT: addi r1, r1, 32 +; CHECK-NEXT: ld r0, 16(r1) +; CHECK-NEXT: mtlr r0 +; CHECK-NEXT: blr +; +; FAST-LABEL: test_lroundi32f32: +; FAST: # %bb.0: # %entry +; FAST-NEXT: xsrdpi f0, f1 +; FAST-NEXT: fctiw f0, f0 +; FAST-NEXT: mffprwz r3, f0 +; FAST-NEXT: blr +entry: + %0 = tail call i32 @llvm.lround.i32.f32(float %d) + ret i32 %0 +} + +declare i32 @llvm.lround.i32.f32(float) + define dso_local i64 @test_llround(double %d) local_unnamed_addr { ; BE-LABEL: test_llround: ; BE: # %bb.0: # %entry diff --git a/llvm/test/CodeGen/PowerPC/stack-guard-global.ll b/llvm/test/CodeGen/PowerPC/stack-guard-global.ll new file mode 100644 index 0000000000000000000000000000000000000000..022a62a4b0918d2cd4d36c374c889802275920b8 --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/stack-guard-global.ll @@ -0,0 +1,122 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=powerpc64 -verify-machineinstrs < %s \ +; RUN: | FileCheck %s --check-prefixes=BE64 +; RUN: llc -mtriple=powerpc64le -verify-machineinstrs < %s \ +; RUN: | FileCheck %s --check-prefixes=LE64 +; RUN: llc -mtriple=ppc32 -verify-machineinstrs < %s \ +; RUN: | FileCheck %s --check-prefixes=LE32 + +define void @foo(i64 %t) sspstrong nounwind { +; BE64-LABEL: foo: +; BE64: # %bb.0: +; BE64-NEXT: mflr 0 +; BE64-NEXT: std 31, -8(1) +; BE64-NEXT: stdu 1, -144(1) +; BE64-NEXT: mr 31, 1 +; BE64-NEXT: std 0, 160(1) +; BE64-NEXT: std 30, 128(31) # 8-byte Folded Spill +; BE64-NEXT: addis 30, 2, __stack_chk_guard@toc@ha +; BE64-NEXT: sldi 3, 3, 2 +; BE64-NEXT: ld 4, __stack_chk_guard@toc@l(30) +; BE64-NEXT: addi 3, 3, 15 +; BE64-NEXT: rldicr 3, 3, 0, 59 +; BE64-NEXT: neg 3, 3 +; BE64-NEXT: std 4, 120(31) +; BE64-NEXT: addi 4, 31, 144 +; BE64-NEXT: stdux 4, 1, 3 +; BE64-NEXT: addi 3, 1, 112 +; BE64-NEXT: bl baz +; BE64-NEXT: nop +; BE64-NEXT: ld 3, __stack_chk_guard@toc@l(30) +; BE64-NEXT: ld 4, 120(31) +; BE64-NEXT: cmpld 3, 4 +; BE64-NEXT: bne 0, .LBB0_2 +; BE64-NEXT: # %bb.1: +; BE64-NEXT: ld 30, 128(31) # 8-byte Folded Reload +; BE64-NEXT: ld 1, 0(1) +; BE64-NEXT: ld 0, 16(1) +; BE64-NEXT: ld 31, -8(1) +; BE64-NEXT: mtlr 0 +; BE64-NEXT: blr +; BE64-NEXT: .LBB0_2: +; BE64-NEXT: bl __stack_chk_fail +; BE64-NEXT: nop +; +; LE64-LABEL: foo: +; LE64: # %bb.0: +; LE64-NEXT: mflr 0 +; LE64-NEXT: std 31, -8(1) +; LE64-NEXT: stdu 1, -64(1) +; LE64-NEXT: mr 31, 1 +; LE64-NEXT: sldi 3, 3, 2 
+; LE64-NEXT: std 0, 80(1) +; LE64-NEXT: std 30, 48(31) # 8-byte Folded Spill +; LE64-NEXT: addis 30, 2, __stack_chk_guard@toc@ha +; LE64-NEXT: addi 3, 3, 15 +; LE64-NEXT: ld 4, __stack_chk_guard@toc@l(30) +; LE64-NEXT: rldicr 3, 3, 0, 59 +; LE64-NEXT: neg 3, 3 +; LE64-NEXT: std 4, 40(31) +; LE64-NEXT: addi 4, 31, 64 +; LE64-NEXT: stdux 4, 1, 3 +; LE64-NEXT: addi 3, 1, 32 +; LE64-NEXT: bl baz +; LE64-NEXT: nop +; LE64-NEXT: ld 3, __stack_chk_guard@toc@l(30) +; LE64-NEXT: ld 4, 40(31) +; LE64-NEXT: cmpld 3, 4 +; LE64-NEXT: bne 0, .LBB0_2 +; LE64-NEXT: # %bb.1: +; LE64-NEXT: ld 30, 48(31) # 8-byte Folded Reload +; LE64-NEXT: ld 1, 0(1) +; LE64-NEXT: ld 0, 16(1) +; LE64-NEXT: ld 31, -8(1) +; LE64-NEXT: mtlr 0 +; LE64-NEXT: blr +; LE64-NEXT: .LBB0_2: +; LE64-NEXT: bl __stack_chk_fail +; LE64-NEXT: nop +; +; LE32-LABEL: foo: +; LE32: # %bb.0: +; LE32-NEXT: mflr 0 +; LE32-NEXT: stwu 1, -32(1) +; LE32-NEXT: stw 31, 28(1) +; LE32-NEXT: mr 31, 1 +; LE32-NEXT: stw 0, 36(1) +; LE32-NEXT: slwi 4, 4, 2 +; LE32-NEXT: stw 30, 24(31) # 4-byte Folded Spill +; LE32-NEXT: lis 30, __stack_chk_guard@ha +; LE32-NEXT: lwz 3, __stack_chk_guard@l(30) +; LE32-NEXT: addi 4, 4, 15 +; LE32-NEXT: rlwinm 4, 4, 0, 0, 27 +; LE32-NEXT: neg 4, 4 +; LE32-NEXT: stw 3, 20(31) +; LE32-NEXT: addi 3, 31, 32 +; LE32-NEXT: stwux 3, 1, 4 +; LE32-NEXT: addi 3, 1, 16 +; LE32-NEXT: bl baz +; LE32-NEXT: lwz 3, __stack_chk_guard@l(30) +; LE32-NEXT: lwz 4, 20(31) +; LE32-NEXT: cmplw 3, 4 +; LE32-NEXT: bne 0, .LBB0_2 +; LE32-NEXT: # %bb.1: +; LE32-NEXT: lwz 30, 24(31) # 4-byte Folded Reload +; LE32-NEXT: lwz 31, 0(1) +; LE32-NEXT: lwz 0, -4(31) +; LE32-NEXT: mr 1, 31 +; LE32-NEXT: mr 31, 0 +; LE32-NEXT: lwz 0, 4(1) +; LE32-NEXT: mtlr 0 +; LE32-NEXT: blr +; LE32-NEXT: .LBB0_2: +; LE32-NEXT: bl __stack_chk_fail + %vla = alloca i32, i64 %t, align 4 + call void @baz(ptr %vla) + ret void +} + +declare void @baz(ptr) + +!llvm.module.flags = !{!1} +!1 = !{i32 2, !"stack-protector-guard", !"global"} diff --git a/llvm/test/CodeGen/PowerPC/stack-guard-tls.ll b/llvm/test/CodeGen/PowerPC/stack-guard-tls.ll new file mode 100644 index 0000000000000000000000000000000000000000..de0becc037309f9a8552fa32be16cb5c7c9af600 --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/stack-guard-tls.ll @@ -0,0 +1,114 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=powerpc64 -verify-machineinstrs < %s \ +; RUN: | FileCheck %s --check-prefixes=BE64 +; RUN: llc -mtriple=powerpc64le -verify-machineinstrs < %s \ +; RUN: | FileCheck %s --check-prefixes=LE64 +; RUN: llc -mtriple=ppc32 -verify-machineinstrs < %s \ +; RUN: | FileCheck %s --check-prefixes=LE32 + +define void @foo(i64 %t) sspstrong nounwind { +; BE64-LABEL: foo: +; BE64: # %bb.0: +; BE64-NEXT: mflr 0 +; BE64-NEXT: std 31, -8(1) +; BE64-NEXT: stdu 1, -144(1) +; BE64-NEXT: ld 4, 500(13) +; BE64-NEXT: sldi 3, 3, 2 +; BE64-NEXT: mr 31, 1 +; BE64-NEXT: addi 3, 3, 15 +; BE64-NEXT: rldicr 3, 3, 0, 59 +; BE64-NEXT: std 0, 160(1) +; BE64-NEXT: neg 3, 3 +; BE64-NEXT: std 4, 128(31) +; BE64-NEXT: addi 4, 31, 144 +; BE64-NEXT: stdux 4, 1, 3 +; BE64-NEXT: addi 3, 1, 112 +; BE64-NEXT: bl baz +; BE64-NEXT: nop +; BE64-NEXT: ld 3, 128(31) +; BE64-NEXT: ld 4, 500(13) +; BE64-NEXT: cmpld 4, 3 +; BE64-NEXT: bne 0, .LBB0_2 +; BE64-NEXT: # %bb.1: +; BE64-NEXT: ld 1, 0(1) +; BE64-NEXT: ld 0, 16(1) +; BE64-NEXT: ld 31, -8(1) +; BE64-NEXT: mtlr 0 +; BE64-NEXT: blr +; BE64-NEXT: .LBB0_2: +; BE64-NEXT: bl __stack_chk_fail +; BE64-NEXT: nop +; +; LE64-LABEL: foo: +; 
LE64: # %bb.0: +; LE64-NEXT: mflr 0 +; LE64-NEXT: std 31, -8(1) +; LE64-NEXT: stdu 1, -64(1) +; LE64-NEXT: sldi 3, 3, 2 +; LE64-NEXT: ld 4, 500(13) +; LE64-NEXT: std 0, 80(1) +; LE64-NEXT: addi 3, 3, 15 +; LE64-NEXT: mr 31, 1 +; LE64-NEXT: std 4, 48(31) +; LE64-NEXT: addi 4, 31, 64 +; LE64-NEXT: rldicr 3, 3, 0, 59 +; LE64-NEXT: neg 3, 3 +; LE64-NEXT: stdux 4, 1, 3 +; LE64-NEXT: addi 3, 1, 32 +; LE64-NEXT: bl baz +; LE64-NEXT: nop +; LE64-NEXT: ld 3, 48(31) +; LE64-NEXT: ld 4, 500(13) +; LE64-NEXT: cmpld 4, 3 +; LE64-NEXT: bne 0, .LBB0_2 +; LE64-NEXT: # %bb.1: +; LE64-NEXT: ld 1, 0(1) +; LE64-NEXT: ld 0, 16(1) +; LE64-NEXT: ld 31, -8(1) +; LE64-NEXT: mtlr 0 +; LE64-NEXT: blr +; LE64-NEXT: .LBB0_2: +; LE64-NEXT: bl __stack_chk_fail +; LE64-NEXT: nop +; +; LE32-LABEL: foo: +; LE32: # %bb.0: +; LE32-NEXT: mflr 0 +; LE32-NEXT: stwu 1, -32(1) +; LE32-NEXT: lwz 3, 500(2) +; LE32-NEXT: slwi 4, 4, 2 +; LE32-NEXT: addi 4, 4, 15 +; LE32-NEXT: stw 31, 28(1) +; LE32-NEXT: mr 31, 1 +; LE32-NEXT: rlwinm 4, 4, 0, 0, 27 +; LE32-NEXT: stw 0, 36(1) +; LE32-NEXT: neg 4, 4 +; LE32-NEXT: stw 3, 24(31) +; LE32-NEXT: addi 3, 31, 32 +; LE32-NEXT: stwux 3, 1, 4 +; LE32-NEXT: addi 3, 1, 16 +; LE32-NEXT: bl baz +; LE32-NEXT: lwz 3, 24(31) +; LE32-NEXT: lwz 4, 500(2) +; LE32-NEXT: cmplw 4, 3 +; LE32-NEXT: bne 0, .LBB0_2 +; LE32-NEXT: # %bb.1: +; LE32-NEXT: lwz 31, 0(1) +; LE32-NEXT: lwz 0, -4(31) +; LE32-NEXT: mr 1, 31 +; LE32-NEXT: mr 31, 0 +; LE32-NEXT: lwz 0, 4(1) +; LE32-NEXT: mtlr 0 +; LE32-NEXT: blr +; LE32-NEXT: .LBB0_2: +; LE32-NEXT: bl __stack_chk_fail + %vla = alloca i32, i64 %t, align 4 + call void @baz(ptr %vla) + ret void +} + +declare void @baz(ptr) + +!llvm.module.flags = !{!1, !2} +!1 = !{i32 2, !"stack-protector-guard", !"tls"} +!2 = !{i32 2, !"stack-protector-guard-offset", i32 500} diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/rotate-rv64.mir b/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/rotate-rv64.mir index b75e926bb50c4e68527ccf136b2ea3040bf6528e..50b96e0ee972e67d990b65dcee8e43884fc14a69 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/rotate-rv64.mir +++ b/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/rotate-rv64.mir @@ -117,7 +117,7 @@ body: | ; CHECK: liveins: $x10 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr = COPY $x10 - ; CHECK-NEXT: [[RORIW:%[0-9]+]]:gpr = RORIW [[COPY]], 15 + ; CHECK-NEXT: [[RORIW:%[0-9]+]]:gpr = RORIW [[COPY]], 17 ; CHECK-NEXT: $x10 = COPY [[RORIW]] ; CHECK-NEXT: PseudoRET implicit $x10 %0:gprb(s64) = COPY $x10 @@ -165,9 +165,8 @@ body: | ; CHECK: liveins: $x10 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr = COPY $x10 - ; CHECK-NEXT: [[ADDI:%[0-9]+]]:gpr = ADDI $x0, 15 - ; CHECK-NEXT: [[RORW:%[0-9]+]]:gpr = RORW [[COPY]], [[ADDI]] - ; CHECK-NEXT: $x10 = COPY [[RORW]] + ; CHECK-NEXT: [[RORIW:%[0-9]+]]:gpr = RORIW [[COPY]], 15 + ; CHECK-NEXT: $x10 = COPY [[RORIW]] ; CHECK-NEXT: PseudoRET implicit $x10 %0:gprb(s64) = COPY $x10 %1:gprb(s32) = G_TRUNC %0(s64) diff --git a/llvm/test/CodeGen/RISCV/epi-trampoline.ll b/llvm/test/CodeGen/RISCV/epi-trampoline.ll deleted file mode 100644 index 44f7b0a2d456fee3a9e41426c1d5e8823b6193e8..0000000000000000000000000000000000000000 --- a/llvm/test/CodeGen/RISCV/epi-trampoline.ll +++ /dev/null @@ -1,50 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=riscv64 -verify-machineinstrs < %s \ -; RUN: | FileCheck -check-prefix=RV64 %s - -declare void @llvm.init.trampoline(ptr, ptr, ptr) 
-declare ptr @llvm.adjust.trampoline(ptr) -declare i64 @f(ptr nest, i64) - -; Most common case -define i64 @test0(i64 %n, ptr %p) nounwind { -; RV64-LABEL: test0: -; RV64: # %bb.0: -; RV64-NEXT: addi sp, sp, -48 -; RV64-NEXT: sd ra, 40(sp) # 8-byte Folded Spill -; RV64-NEXT: sd s0, 32(sp) # 8-byte Folded Spill -; RV64-NEXT: sd s1, 24(sp) # 8-byte Folded Spill -; RV64-NEXT: mv s0, a0 -; RV64-NEXT: lui a0, %hi(f) -; RV64-NEXT: addi a0, a0, %lo(f) -; RV64-NEXT: sd a0, 24(sp) -; RV64-NEXT: li a0, 919 -; RV64-NEXT: lui a2, %hi(.LCPI0_0) -; RV64-NEXT: ld a2, %lo(.LCPI0_0)(a2) -; RV64-NEXT: lui a3, 6203 -; RV64-NEXT: addi a3, a3, 643 -; RV64-NEXT: sw a0, 0(sp) -; RV64-NEXT: sw a3, 4(sp) -; RV64-NEXT: sd a2, 8(sp) -; RV64-NEXT: sd a1, 16(sp) -; RV64-NEXT: addi a1, sp, 32 -; RV64-NEXT: mv a0, sp -; RV64-NEXT: mv s1, sp -; RV64-NEXT: li a2, 0 -; RV64-NEXT: call __riscv_flush_icache -; RV64-NEXT: mv a0, s0 -; RV64-NEXT: jalr s1 -; RV64-NEXT: ld ra, 40(sp) # 8-byte Folded Reload -; RV64-NEXT: ld s0, 32(sp) # 8-byte Folded Reload -; RV64-NEXT: ld s1, 24(sp) # 8-byte Folded Reload -; RV64-NEXT: addi sp, sp, 48 -; RV64-NEXT: ret - %alloca = alloca [10 x i8], align 16 - call void @llvm.init.trampoline(ptr %alloca, ptr @f, ptr %p) - %tramp = call ptr @llvm.adjust.trampoline(ptr %alloca) - %ret = call i64 %tramp(i64 %n) - ret i64 %ret - -} - -; RV64: .section ".note.GNU-stack","x",@progbits diff --git a/llvm/test/CodeGen/RISCV/naked-fn-with-frame-pointer.ll b/llvm/test/CodeGen/RISCV/naked-fn-with-frame-pointer.ll new file mode 100644 index 0000000000000000000000000000000000000000..de87b10d3873388279e6a359cb29af541e8e9b83 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/naked-fn-with-frame-pointer.ll @@ -0,0 +1,45 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple riscv32 | FileCheck %s -check-prefixes=CHECK-32 +; RUN: llc < %s -mtriple riscv64 | FileCheck %s -check-prefixes=CHECK-64 + +declare dso_local void @main() + +define dso_local void @naked() naked "frame-pointer"="all" { +; CHECK-32-LABEL: naked: +; CHECK-32: # %bb.0: +; CHECK-32-NEXT: call main +; +; CHECK-64-LABEL: naked: +; CHECK-64: # %bb.0: +; CHECK-64-NEXT: call main + call void @main() + unreachable +} + +define dso_local void @normal() "frame-pointer"="all" { +; CHECK-32-LABEL: normal: +; CHECK-32: # %bb.0: +; CHECK-32-NEXT: addi sp, sp, -16 +; CHECK-32-NEXT: .cfi_def_cfa_offset 16 +; CHECK-32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; CHECK-32-NEXT: sw s0, 8(sp) # 4-byte Folded Spill +; CHECK-32-NEXT: .cfi_offset ra, -4 +; CHECK-32-NEXT: .cfi_offset s0, -8 +; CHECK-32-NEXT: addi s0, sp, 16 +; CHECK-32-NEXT: .cfi_def_cfa s0, 0 +; CHECK-32-NEXT: call main +; +; CHECK-64-LABEL: normal: +; CHECK-64: # %bb.0: +; CHECK-64-NEXT: addi sp, sp, -16 +; CHECK-64-NEXT: .cfi_def_cfa_offset 16 +; CHECK-64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; CHECK-64-NEXT: sd s0, 0(sp) # 8-byte Folded Spill +; CHECK-64-NEXT: .cfi_offset ra, -8 +; CHECK-64-NEXT: .cfi_offset s0, -16 +; CHECK-64-NEXT: addi s0, sp, 16 +; CHECK-64-NEXT: .cfi_def_cfa s0, 0 +; CHECK-64-NEXT: call main + call void @main() + unreachable +} diff --git a/llvm/test/CodeGen/RISCV/rv64-trampoline.ll b/llvm/test/CodeGen/RISCV/rv64-trampoline.ll new file mode 100644 index 0000000000000000000000000000000000000000..ba1840632650985dd84f7779b4080335821b48a3 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rv64-trampoline.ll @@ -0,0 +1,80 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: 
--version 5 +; RUN: llc -mtriple=riscv64 -verify-machineinstrs < %s \ +; RUN: | FileCheck -check-prefix=RV64 %s +; RUN: llc -mtriple=riscv64-unknown-linux-gnu -verify-machineinstrs < %s \ +; RUN: | FileCheck -check-prefix=RV64-LINUX %s + +declare void @llvm.init.trampoline(ptr, ptr, ptr) +declare ptr @llvm.adjust.trampoline(ptr) +declare i64 @f(ptr nest, i64) + +define i64 @test0(i64 %n, ptr %p) nounwind { +; RV64-LABEL: test0: +; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -64 +; RV64-NEXT: sd ra, 56(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s0, 48(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s1, 40(sp) # 8-byte Folded Spill +; RV64-NEXT: mv s0, a0 +; RV64-NEXT: lui a0, %hi(f) +; RV64-NEXT: addi a0, a0, %lo(f) +; RV64-NEXT: sd a0, 32(sp) +; RV64-NEXT: li a0, 919 +; RV64-NEXT: lui a2, %hi(.LCPI0_0) +; RV64-NEXT: ld a2, %lo(.LCPI0_0)(a2) +; RV64-NEXT: lui a3, 6203 +; RV64-NEXT: addi a3, a3, 643 +; RV64-NEXT: sw a0, 8(sp) +; RV64-NEXT: sw a3, 12(sp) +; RV64-NEXT: sd a2, 16(sp) +; RV64-NEXT: sd a1, 24(sp) +; RV64-NEXT: addi a1, sp, 24 +; RV64-NEXT: addi a0, sp, 8 +; RV64-NEXT: addi s1, sp, 8 +; RV64-NEXT: call __clear_cache +; RV64-NEXT: mv a0, s0 +; RV64-NEXT: jalr s1 +; RV64-NEXT: ld ra, 56(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s0, 48(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s1, 40(sp) # 8-byte Folded Reload +; RV64-NEXT: addi sp, sp, 64 +; RV64-NEXT: ret +; +; RV64-LINUX-LABEL: test0: +; RV64-LINUX: # %bb.0: +; RV64-LINUX-NEXT: addi sp, sp, -64 +; RV64-LINUX-NEXT: sd ra, 56(sp) # 8-byte Folded Spill +; RV64-LINUX-NEXT: sd s0, 48(sp) # 8-byte Folded Spill +; RV64-LINUX-NEXT: sd s1, 40(sp) # 8-byte Folded Spill +; RV64-LINUX-NEXT: mv s0, a0 +; RV64-LINUX-NEXT: lui a0, %hi(f) +; RV64-LINUX-NEXT: addi a0, a0, %lo(f) +; RV64-LINUX-NEXT: sd a0, 32(sp) +; RV64-LINUX-NEXT: li a0, 919 +; RV64-LINUX-NEXT: lui a2, %hi(.LCPI0_0) +; RV64-LINUX-NEXT: ld a2, %lo(.LCPI0_0)(a2) +; RV64-LINUX-NEXT: lui a3, 6203 +; RV64-LINUX-NEXT: addi a3, a3, 643 +; RV64-LINUX-NEXT: sw a0, 8(sp) +; RV64-LINUX-NEXT: sw a3, 12(sp) +; RV64-LINUX-NEXT: sd a2, 16(sp) +; RV64-LINUX-NEXT: sd a1, 24(sp) +; RV64-LINUX-NEXT: addi a1, sp, 24 +; RV64-LINUX-NEXT: addi a0, sp, 8 +; RV64-LINUX-NEXT: addi s1, sp, 8 +; RV64-LINUX-NEXT: li a2, 0 +; RV64-LINUX-NEXT: call __riscv_flush_icache +; RV64-LINUX-NEXT: mv a0, s0 +; RV64-LINUX-NEXT: jalr s1 +; RV64-LINUX-NEXT: ld ra, 56(sp) # 8-byte Folded Reload +; RV64-LINUX-NEXT: ld s0, 48(sp) # 8-byte Folded Reload +; RV64-LINUX-NEXT: ld s1, 40(sp) # 8-byte Folded Reload +; RV64-LINUX-NEXT: addi sp, sp, 64 +; RV64-LINUX-NEXT: ret + %alloca = alloca [32 x i8], align 8 + call void @llvm.init.trampoline(ptr %alloca, ptr @f, ptr %p) + %tramp = call ptr @llvm.adjust.trampoline(ptr %alloca) + %ret = call i64 %tramp(i64 %n) + ret i64 %ret + +} diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp.ll index 7ecf8af54c8dc0966aaff36e79b7848eede8ed63..c24ade1e6d8eff93e5a17ee13cdd78590c99ef8f 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp.ll @@ -1,8 +1,52 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -target-abi=ilp32d -mattr=+v,+zvfh -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv64 -target-abi=lp64d -mattr=+v,+zvfh -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv32 -target-abi=ilp32d -mattr=+v,+zvfhmin -verify-machineinstrs < 
%s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN -; RUN: llc -mtriple=riscv64 -target-abi=lp64d -mattr=+v,+zvfhmin -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN +; RUN: llc -mtriple=riscv32 -target-abi=ilp32d -mattr=+v,+zvfh,+zvfbfmin -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH +; RUN: llc -mtriple=riscv64 -target-abi=lp64d -mattr=+v,+zvfh,+zvfbfmin -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH +; RUN: llc -mtriple=riscv32 -target-abi=ilp32d -mattr=+v,+zvfhmin,+zvfbfmin -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN +; RUN: llc -mtriple=riscv64 -target-abi=lp64d -mattr=+v,+zvfhmin,+zvfbfmin -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN + + +define void @fadd_v8bf16(ptr %x, ptr %y) { +; CHECK-LABEL: fadd_v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v8, (a1) +; CHECK-NEXT: vle16.v v9, (a0) +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfadd.vv v8, v12, v10 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v10, v8 +; CHECK-NEXT: vse16.v v10, (a0) +; CHECK-NEXT: ret + %a = load <8 x bfloat>, ptr %x + %b = load <8 x bfloat>, ptr %y + %c = fadd <8 x bfloat> %a, %b + store <8 x bfloat> %c, ptr %x + ret void +} + +define void @fadd_v6bf16(ptr %x, ptr %y) { +; CHECK-LABEL: fadd_v6bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v8, (a1) +; CHECK-NEXT: vle16.v v9, (a0) +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfadd.vv v8, v12, v10 +; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v10, v8 +; CHECK-NEXT: vse16.v v10, (a0) +; CHECK-NEXT: ret + %a = load <6 x bfloat>, ptr %x + %b = load <6 x bfloat>, ptr %y + %c = fadd <6 x bfloat> %a, %b + store <6 x bfloat> %c, ptr %x + ret void +} define void @fadd_v8f16(ptr %x, ptr %y) { ; ZVFH-LABEL: fadd_v8f16: @@ -97,6 +141,49 @@ define void @fadd_v2f64(ptr %x, ptr %y) { ret void } +define void @fsub_v8bf16(ptr %x, ptr %y) { +; CHECK-LABEL: fsub_v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v8, (a1) +; CHECK-NEXT: vle16.v v9, (a0) +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfsub.vv v8, v12, v10 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v10, v8 +; CHECK-NEXT: vse16.v v10, (a0) +; CHECK-NEXT: ret + %a = load <8 x bfloat>, ptr %x + %b = load <8 x bfloat>, ptr %y + %c = fsub <8 x bfloat> %a, %b + store <8 x bfloat> %c, ptr %x + ret void +} + +define void @fsub_v6bf16(ptr %x, ptr %y) { +; CHECK-LABEL: fsub_v6bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v8, (a1) +; CHECK-NEXT: vle16.v v9, (a0) +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfsub.vv v8, v12, v10 +; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v10, v8 +; CHECK-NEXT: vse16.v v10, (a0) +; CHECK-NEXT: ret + %a = load <6 x bfloat>, ptr %x + %b = 
load <6 x bfloat>, ptr %y + %c = fsub <6 x bfloat> %a, %b + store <6 x bfloat> %c, ptr %x + ret void +} + define void @fsub_v8f16(ptr %x, ptr %y) { ; ZVFH-LABEL: fsub_v8f16: ; ZVFH: # %bb.0: @@ -190,6 +277,49 @@ define void @fsub_v2f64(ptr %x, ptr %y) { ret void } +define void @fmul_v8bf16(ptr %x, ptr %y) { +; CHECK-LABEL: fmul_v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v8, (a1) +; CHECK-NEXT: vle16.v v9, (a0) +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfmul.vv v8, v12, v10 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v10, v8 +; CHECK-NEXT: vse16.v v10, (a0) +; CHECK-NEXT: ret + %a = load <8 x bfloat>, ptr %x + %b = load <8 x bfloat>, ptr %y + %c = fmul <8 x bfloat> %a, %b + store <8 x bfloat> %c, ptr %x + ret void +} + +define void @fmul_v6bf16(ptr %x, ptr %y) { +; CHECK-LABEL: fmul_v6bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v8, (a1) +; CHECK-NEXT: vle16.v v9, (a0) +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfmul.vv v8, v12, v10 +; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v10, v8 +; CHECK-NEXT: vse16.v v10, (a0) +; CHECK-NEXT: ret + %a = load <6 x bfloat>, ptr %x + %b = load <6 x bfloat>, ptr %y + %c = fmul <6 x bfloat> %a, %b + store <6 x bfloat> %c, ptr %x + ret void +} + define void @fmul_v8f16(ptr %x, ptr %y) { ; ZVFH-LABEL: fmul_v8f16: ; ZVFH: # %bb.0: @@ -283,6 +413,49 @@ define void @fmul_v2f64(ptr %x, ptr %y) { ret void } +define void @fdiv_v8bf16(ptr %x, ptr %y) { +; CHECK-LABEL: fdiv_v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v8, (a1) +; CHECK-NEXT: vle16.v v9, (a0) +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfdiv.vv v8, v12, v10 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v10, v8 +; CHECK-NEXT: vse16.v v10, (a0) +; CHECK-NEXT: ret + %a = load <8 x bfloat>, ptr %x + %b = load <8 x bfloat>, ptr %y + %c = fdiv <8 x bfloat> %a, %b + store <8 x bfloat> %c, ptr %x + ret void +} + +define void @fdiv_v6bf16(ptr %x, ptr %y) { +; CHECK-LABEL: fdiv_v6bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v8, (a1) +; CHECK-NEXT: vle16.v v9, (a0) +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfdiv.vv v8, v12, v10 +; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v10, v8 +; CHECK-NEXT: vse16.v v10, (a0) +; CHECK-NEXT: ret + %a = load <6 x bfloat>, ptr %x + %b = load <6 x bfloat>, ptr %y + %c = fdiv <6 x bfloat> %a, %b + store <6 x bfloat> %c, ptr %x + ret void +} + define void @fdiv_v8f16(ptr %x, ptr %y) { ; ZVFH-LABEL: fdiv_v8f16: ; ZVFH: # %bb.0: @@ -376,6 +549,36 @@ define void @fdiv_v2f64(ptr %x, ptr %y) { ret void } +define void @fneg_v8bf16(ptr %x) { +; CHECK-LABEL: fneg_v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: lui a1, 8 +; CHECK-NEXT: vxor.vx v8, v8, a1 +; 
CHECK-NEXT: vse16.v v8, (a0) +; CHECK-NEXT: ret + %a = load <8 x bfloat>, ptr %x + %b = fneg <8 x bfloat> %a + store <8 x bfloat> %b, ptr %x + ret void +} + +define void @fneg_v6bf16(ptr %x) { +; CHECK-LABEL: fneg_v6bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: lui a1, 8 +; CHECK-NEXT: vxor.vx v8, v8, a1 +; CHECK-NEXT: vse16.v v8, (a0) +; CHECK-NEXT: ret + %a = load <6 x bfloat>, ptr %x + %b = fneg <6 x bfloat> %a + store <6 x bfloat> %b, ptr %x + ret void +} + define void @fneg_v8f16(ptr %x) { ; ZVFH-LABEL: fneg_v8f16: ; ZVFH: # %bb.0: @@ -450,6 +653,38 @@ define void @fneg_v2f64(ptr %x) { ret void } +define void @fabs_v8bf16(ptr %x) { +; CHECK-LABEL: fabs_v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: lui a1, 8 +; CHECK-NEXT: addi a1, a1, -1 +; CHECK-NEXT: vand.vx v8, v8, a1 +; CHECK-NEXT: vse16.v v8, (a0) +; CHECK-NEXT: ret + %a = load <8 x bfloat>, ptr %x + %b = call <8 x bfloat> @llvm.fabs.v8bf16(<8 x bfloat> %a) + store <8 x bfloat> %b, ptr %x + ret void +} + +define void @fabs_v6bf16(ptr %x) { +; CHECK-LABEL: fabs_v6bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: lui a1, 8 +; CHECK-NEXT: addi a1, a1, -1 +; CHECK-NEXT: vand.vx v8, v8, a1 +; CHECK-NEXT: vse16.v v8, (a0) +; CHECK-NEXT: ret + %a = load <6 x bfloat>, ptr %x + %b = call <6 x bfloat> @llvm.fabs.v6bf16(<6 x bfloat> %a) + store <6 x bfloat> %b, ptr %x + ret void +} + define void @fabs_v8f16(ptr %x) { ; ZVFH-LABEL: fabs_v8f16: ; ZVFH: # %bb.0: @@ -473,7 +708,6 @@ define void @fabs_v8f16(ptr %x) { store <8 x half> %b, ptr %x ret void } -declare <8 x half> @llvm.fabs.v8f16(<8 x half>) define void @fabs_v6f16(ptr %x) { ; ZVFH-LABEL: fabs_v6f16: @@ -498,7 +732,6 @@ define void @fabs_v6f16(ptr %x) { store <6 x half> %b, ptr %x ret void } -declare <6 x half> @llvm.fabs.v6f16(<6 x half>) define void @fabs_v4f32(ptr %x) { ; CHECK-LABEL: fabs_v4f32: @@ -513,7 +746,6 @@ define void @fabs_v4f32(ptr %x) { store <4 x float> %b, ptr %x ret void } -declare <4 x float> @llvm.fabs.v4f32(<4 x float>) define void @fabs_v2f64(ptr %x) { ; CHECK-LABEL: fabs_v2f64: @@ -528,7 +760,48 @@ define void @fabs_v2f64(ptr %x) { store <2 x double> %b, ptr %x ret void } -declare <2 x double> @llvm.fabs.v2f64(<2 x double>) + +define void @copysign_v8bf16(ptr %x, ptr %y) { +; CHECK-LABEL: copysign_v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v8, (a1) +; CHECK-NEXT: vle16.v v9, (a0) +; CHECK-NEXT: lui a1, 8 +; CHECK-NEXT: vand.vx v8, v8, a1 +; CHECK-NEXT: addi a1, a1, -1 +; CHECK-NEXT: vand.vx v9, v9, a1 +; CHECK-NEXT: vor.vv v8, v9, v8 +; CHECK-NEXT: vse16.v v8, (a0) +; CHECK-NEXT: ret + %a = load <8 x bfloat>, ptr %x + %b = load <8 x bfloat>, ptr %y + %c = call <8 x bfloat> @llvm.copysign.v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) + store <8 x bfloat> %c, ptr %x + ret void +} + +define void @copysign_v6bf16(ptr %x, ptr %y) { +; CHECK-LABEL: copysign_v6bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v8, (a1) +; CHECK-NEXT: vle16.v v9, (a0) +; CHECK-NEXT: lui a1, 8 +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vand.vx v8, v8, a1 +; CHECK-NEXT: addi a1, a1, -1 +; CHECK-NEXT: vand.vx v9, v9, a1 +; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma +; CHECK-NEXT: vor.vv v8, v9, v8 +; CHECK-NEXT: vse16.v v8, (a0) +; CHECK-NEXT: ret + 
%a = load <6 x bfloat>, ptr %x + %b = load <6 x bfloat>, ptr %y + %c = call <6 x bfloat> @llvm.copysign.v6bf16(<6 x bfloat> %a, <6 x bfloat> %b) + store <6 x bfloat> %c, ptr %x + ret void +} define void @copysign_v8f16(ptr %x, ptr %y) { ; ZVFH-LABEL: copysign_v8f16: @@ -558,7 +831,6 @@ define void @copysign_v8f16(ptr %x, ptr %y) { store <8 x half> %c, ptr %x ret void } -declare <8 x half> @llvm.copysign.v8f16(<8 x half>, <8 x half>) define void @copysign_v6f16(ptr %x, ptr %y) { ; ZVFH-LABEL: copysign_v6f16: @@ -590,7 +862,6 @@ define void @copysign_v6f16(ptr %x, ptr %y) { store <6 x half> %c, ptr %x ret void } -declare <6 x half> @llvm.copysign.v6f16(<6 x half>, <6 x half>) define void @copysign_v4f32(ptr %x, ptr %y) { ; CHECK-LABEL: copysign_v4f32: @@ -607,7 +878,6 @@ define void @copysign_v4f32(ptr %x, ptr %y) { store <4 x float> %c, ptr %x ret void } -declare <4 x float> @llvm.copysign.v4f32(<4 x float>, <4 x float>) define void @copysign_v2f64(ptr %x, ptr %y) { ; CHECK-LABEL: copysign_v2f64: @@ -624,7 +894,52 @@ define void @copysign_v2f64(ptr %x, ptr %y) { store <2 x double> %c, ptr %x ret void } -declare <2 x double> @llvm.copysign.v2f64(<2 x double>, <2 x double>) + +define void @copysign_vf_v8bf16(ptr %x, bfloat %y) { +; CHECK-LABEL: copysign_vf_v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.w a1, fa0 +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: lui a1, 8 +; CHECK-NEXT: addi a2, a1, -1 +; CHECK-NEXT: vand.vx v8, v8, a2 +; CHECK-NEXT: vand.vx v9, v9, a1 +; CHECK-NEXT: vor.vv v8, v8, v9 +; CHECK-NEXT: vse16.v v8, (a0) +; CHECK-NEXT: ret + %a = load <8 x bfloat>, ptr %x + %b = insertelement <8 x bfloat> poison, bfloat %y, i32 0 + %c = shufflevector <8 x bfloat> %b, <8 x bfloat> poison, <8 x i32> zeroinitializer + %d = call <8 x bfloat> @llvm.copysign.v8bf16(<8 x bfloat> %a, <8 x bfloat> %c) + store <8 x bfloat> %d, ptr %x + ret void +} + +define void @copysign_vf_v6bf16(ptr %x, bfloat %y) { +; CHECK-LABEL: copysign_vf_v6bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.w a1, fa0 +; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: lui a1, 8 +; CHECK-NEXT: addi a2, a1, -1 +; CHECK-NEXT: vand.vx v8, v8, a2 +; CHECK-NEXT: vand.vx v9, v9, a1 +; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma +; CHECK-NEXT: vor.vv v8, v8, v9 +; CHECK-NEXT: vse16.v v8, (a0) +; CHECK-NEXT: ret + %a = load <6 x bfloat>, ptr %x + %b = insertelement <6 x bfloat> poison, bfloat %y, i32 0 + %c = shufflevector <6 x bfloat> %b, <6 x bfloat> poison, <6 x i32> zeroinitializer + %d = call <6 x bfloat> @llvm.copysign.v6bf16(<6 x bfloat> %a, <6 x bfloat> %c) + store <6 x bfloat> %d, ptr %x + ret void +} define void @copysign_vf_v8f16(ptr %x, half %y) { ; ZVFH-LABEL: copysign_vf_v8f16: @@ -720,6 +1035,52 @@ define void @copysign_vf_v2f64(ptr %x, double %y) { ret void } +define void @copysign_neg_v8bf16(ptr %x, ptr %y) { +; CHECK-LABEL: copysign_neg_v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v8, (a1) +; CHECK-NEXT: vle16.v v9, (a0) +; CHECK-NEXT: lui a1, 8 +; CHECK-NEXT: vxor.vx v8, v8, a1 +; CHECK-NEXT: addi a2, a1, -1 +; CHECK-NEXT: vand.vx v9, v9, a2 +; CHECK-NEXT: vand.vx v8, v8, a1 +; CHECK-NEXT: vor.vv v8, v9, v8 +; CHECK-NEXT: vse16.v v8, (a0) +; CHECK-NEXT: ret + %a = load <8 x bfloat>, ptr %x + %b = load <8 x bfloat>, ptr %y + %c = fneg <8 x bfloat> %b + %d = call 
<8 x bfloat> @llvm.copysign.v8bf16(<8 x bfloat> %a, <8 x bfloat> %c) + store <8 x bfloat> %d, ptr %x + ret void +} + +define void @copysign_neg_v6bf16(ptr %x, ptr %y) { +; CHECK-LABEL: copysign_neg_v6bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v8, (a1) +; CHECK-NEXT: vle16.v v9, (a0) +; CHECK-NEXT: lui a1, 8 +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vxor.vx v8, v8, a1 +; CHECK-NEXT: addi a2, a1, -1 +; CHECK-NEXT: vand.vx v9, v9, a2 +; CHECK-NEXT: vand.vx v8, v8, a1 +; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma +; CHECK-NEXT: vor.vv v8, v9, v8 +; CHECK-NEXT: vse16.v v8, (a0) +; CHECK-NEXT: ret + %a = load <6 x bfloat>, ptr %x + %b = load <6 x bfloat>, ptr %y + %c = fneg <6 x bfloat> %b + %d = call <6 x bfloat> @llvm.copysign.v6bf16(<6 x bfloat> %a, <6 x bfloat> %c) + store <6 x bfloat> %d, ptr %x + ret void +} + define void @copysign_neg_v8f16(ptr %x, ptr %y) { ; ZVFH-LABEL: copysign_neg_v8f16: ; ZVFH: # %bb.0: @@ -818,6 +1179,56 @@ define void @copysign_neg_v2f64(ptr %x, ptr %y) { ret void } +define void @copysign_neg_trunc_v4bf16_v4f32(ptr %x, ptr %y) { +; CHECK-LABEL: copysign_neg_trunc_v4bf16_v4f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vle32.v v9, (a1) +; CHECK-NEXT: lui a1, 8 +; CHECK-NEXT: addi a2, a1, -1 +; CHECK-NEXT: vand.vx v8, v8, a2 +; CHECK-NEXT: vfncvtbf16.f.f.w v10, v9 +; CHECK-NEXT: vxor.vx v9, v10, a1 +; CHECK-NEXT: vand.vx v9, v9, a1 +; CHECK-NEXT: vor.vv v8, v8, v9 +; CHECK-NEXT: vse16.v v8, (a0) +; CHECK-NEXT: ret + %a = load <4 x bfloat>, ptr %x + %b = load <4 x float>, ptr %y + %c = fneg <4 x float> %b + %d = fptrunc <4 x float> %c to <4 x bfloat> + %e = call <4 x bfloat> @llvm.copysign.v4bf16(<4 x bfloat> %a, <4 x bfloat> %d) + store <4 x bfloat> %e, ptr %x + ret void +} + +define void @copysign_neg_trunc_v3bf16_v3f32(ptr %x, ptr %y) { +; CHECK-LABEL: copysign_neg_trunc_v3bf16_v3f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 3, e16, mf2, ta, ma +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vle32.v v9, (a1) +; CHECK-NEXT: lui a1, 8 +; CHECK-NEXT: addi a2, a1, -1 +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; CHECK-NEXT: vand.vx v8, v8, a2 +; CHECK-NEXT: vfncvtbf16.f.f.w v10, v9 +; CHECK-NEXT: vxor.vx v9, v10, a1 +; CHECK-NEXT: vand.vx v9, v9, a1 +; CHECK-NEXT: vsetivli zero, 3, e16, mf2, ta, ma +; CHECK-NEXT: vor.vv v8, v8, v9 +; CHECK-NEXT: vse16.v v8, (a0) +; CHECK-NEXT: ret + %a = load <3 x bfloat>, ptr %x + %b = load <3 x float>, ptr %y + %c = fneg <3 x float> %b + %d = fptrunc <3 x float> %c to <3 x bfloat> + %e = call <3 x bfloat> @llvm.copysign.v3bf16(<3 x bfloat> %a, <3 x bfloat> %d) + store <3 x bfloat> %e, ptr %x + ret void +} + define void @copysign_neg_trunc_v4f16_v4f32(ptr %x, ptr %y) { ; ZVFH-LABEL: copysign_neg_trunc_v4f16_v4f32: ; ZVFH: # %bb.0: @@ -851,7 +1262,6 @@ define void @copysign_neg_trunc_v4f16_v4f32(ptr %x, ptr %y) { store <4 x half> %e, ptr %x ret void } -declare <4 x half> @llvm.copysign.v4f16(<4 x half>, <4 x half>) define void @copysign_neg_trunc_v3f16_v3f32(ptr %x, ptr %y) { ; ZVFH-LABEL: copysign_neg_trunc_v3f16_v3f32: @@ -890,7 +1300,6 @@ define void @copysign_neg_trunc_v3f16_v3f32(ptr %x, ptr %y) { store <3 x half> %e, ptr %x ret void } -declare <3 x half> @llvm.copysign.v3f16(<3 x half>, <3 x half>) define void @copysign_neg_ext_v2f64_v2f32(ptr %x, ptr %y) { ; CHECK-LABEL: copysign_neg_ext_v2f64_v2f32: @@ -912,6 +1321,43 @@ define void 
@copysign_neg_ext_v2f64_v2f32(ptr %x, ptr %y) { ret void } +define void @sqrt_v8bf16(ptr %x) { +; CHECK-LABEL: sqrt_v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfsqrt.v v8, v10 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v10, v8 +; CHECK-NEXT: vse16.v v10, (a0) +; CHECK-NEXT: ret + %a = load <8 x bfloat>, ptr %x + %b = call <8 x bfloat> @llvm.sqrt.v8bf16(<8 x bfloat> %a) + store <8 x bfloat> %b, ptr %x + ret void +} + +define void @sqrt_v6bf16(ptr %x) { +; CHECK-LABEL: sqrt_v6bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfsqrt.v v8, v10 +; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v10, v8 +; CHECK-NEXT: vse16.v v10, (a0) +; CHECK-NEXT: ret + %a = load <6 x bfloat>, ptr %x + %b = call <6 x bfloat> @llvm.sqrt.v6bf16(<6 x bfloat> %a) + store <6 x bfloat> %b, ptr %x + ret void +} + define void @sqrt_v8f16(ptr %x) { ; ZVFH-LABEL: sqrt_v8f16: ; ZVFH: # %bb.0: @@ -937,7 +1383,6 @@ define void @sqrt_v8f16(ptr %x) { store <8 x half> %b, ptr %x ret void } -declare <8 x half> @llvm.sqrt.v8f16(<8 x half>) define void @sqrt_v6f16(ptr %x) { ; ZVFH-LABEL: sqrt_v6f16: @@ -965,7 +1410,6 @@ define void @sqrt_v6f16(ptr %x) { store <6 x half> %b, ptr %x ret void } -declare <6 x half> @llvm.sqrt.v6f16(<6 x half>) define void @sqrt_v4f32(ptr %x) { ; CHECK-LABEL: sqrt_v4f32: @@ -980,7 +1424,6 @@ define void @sqrt_v4f32(ptr %x) { store <4 x float> %b, ptr %x ret void } -declare <4 x float> @llvm.sqrt.v4f32(<4 x float>) define void @sqrt_v2f64(ptr %x) { ; CHECK-LABEL: sqrt_v2f64: @@ -995,7 +1438,55 @@ define void @sqrt_v2f64(ptr %x) { store <2 x double> %b, ptr %x ret void } -declare <2 x double> @llvm.sqrt.v2f64(<2 x double>) + +define void @fma_v8bf16(ptr %x, ptr %y, ptr %z) { +; CHECK-LABEL: fma_v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v8, (a2) +; CHECK-NEXT: vle16.v v9, (a0) +; CHECK-NEXT: vle16.v v10, (a1) +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v14, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfmadd.vv v8, v14, v12 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v10, v8 +; CHECK-NEXT: vse16.v v10, (a0) +; CHECK-NEXT: ret + %a = load <8 x bfloat>, ptr %x + %b = load <8 x bfloat>, ptr %y + %c = load <8 x bfloat>, ptr %z + %d = call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> %a, <8 x bfloat> %b, <8 x bfloat> %c) + store <8 x bfloat> %d, ptr %x + ret void +} + +define void @fma_v6bf16(ptr %x, ptr %y, ptr %z) { +; CHECK-LABEL: fma_v6bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v8, (a2) +; CHECK-NEXT: vle16.v v9, (a0) +; CHECK-NEXT: vle16.v v10, (a1) +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v14, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfmadd.vv v8, v14, v12 +; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v10, v8 +; CHECK-NEXT: vse16.v v10, (a0) 
+; CHECK-NEXT: ret + %a = load <6 x bfloat>, ptr %x + %b = load <6 x bfloat>, ptr %y + %c = load <6 x bfloat>, ptr %z + %d = call <6 x bfloat> @llvm.fma.v6bf16(<6 x bfloat> %a, <6 x bfloat> %b, <6 x bfloat> %c) + store <6 x bfloat> %d, ptr %x + ret void +} define void @fma_v8f16(ptr %x, ptr %y, ptr %z) { ; ZVFH-LABEL: fma_v8f16: @@ -1030,7 +1521,6 @@ define void @fma_v8f16(ptr %x, ptr %y, ptr %z) { store <8 x half> %d, ptr %x ret void } -declare <8 x half> @llvm.fma.v8f16(<8 x half>, <8 x half>, <8 x half>) define void @fma_v6f16(ptr %x, ptr %y, ptr %z) { ; ZVFH-LABEL: fma_v6f16: @@ -1066,7 +1556,6 @@ define void @fma_v6f16(ptr %x, ptr %y, ptr %z) { store <6 x half> %d, ptr %x ret void } -declare <6 x half> @llvm.fma.v6f16(<6 x half>, <6 x half>, <6 x half>) define void @fma_v4f32(ptr %x, ptr %y, ptr %z) { ; CHECK-LABEL: fma_v4f32: @@ -1085,7 +1574,6 @@ define void @fma_v4f32(ptr %x, ptr %y, ptr %z) { store <4 x float> %d, ptr %x ret void } -declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>) define void @fma_v2f64(ptr %x, ptr %y, ptr %z) { ; CHECK-LABEL: fma_v2f64: @@ -1104,7 +1592,61 @@ define void @fma_v2f64(ptr %x, ptr %y, ptr %z) { store <2 x double> %d, ptr %x ret void } -declare <2 x double> @llvm.fma.v2f64(<2 x double>, <2 x double>, <2 x double>) + +define void @fmsub_v8bf16(ptr %x, ptr %y, ptr %z) { +; CHECK-LABEL: fmsub_v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v8, (a2) +; CHECK-NEXT: vle16.v v9, (a0) +; CHECK-NEXT: vle16.v v10, (a1) +; CHECK-NEXT: lui a1, 8 +; CHECK-NEXT: vxor.vx v8, v8, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v14, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfmadd.vv v8, v14, v12 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v10, v8 +; CHECK-NEXT: vse16.v v10, (a0) +; CHECK-NEXT: ret + %a = load <8 x bfloat>, ptr %x + %b = load <8 x bfloat>, ptr %y + %c = load <8 x bfloat>, ptr %z + %neg = fneg <8 x bfloat> %c + %d = call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> %a, <8 x bfloat> %b, <8 x bfloat> %neg) + store <8 x bfloat> %d, ptr %x + ret void +} + +define void @fmsub_v6bf16(ptr %x, ptr %y, ptr %z) { +; CHECK-LABEL: fmsub_v6bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v8, (a2) +; CHECK-NEXT: vle16.v v9, (a0) +; CHECK-NEXT: vle16.v v10, (a1) +; CHECK-NEXT: lui a1, 8 +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vxor.vx v8, v8, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v14, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfmadd.vv v8, v14, v12 +; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v10, v8 +; CHECK-NEXT: vse16.v v10, (a0) +; CHECK-NEXT: ret + %a = load <6 x bfloat>, ptr %x + %b = load <6 x bfloat>, ptr %y + %c = load <6 x bfloat>, ptr %z + %neg = fneg <6 x bfloat> %c + %d = call <6 x bfloat> @llvm.fma.v6bf16(<6 x bfloat> %a, <6 x bfloat> %b, <6 x bfloat> %neg) + store <6 x bfloat> %d, ptr %x + ret void +} define void @fmsub_v8f16(ptr %x, ptr %y, ptr %z) { ; ZVFH-LABEL: fmsub_v8f16: @@ -1220,6 +1762,27 @@ define void @fnmadd_v2f64(ptr %x, ptr %y, ptr %z) { ret void } +define void @fadd_v16bf16(ptr %x, ptr %y) { +; CHECK-LABEL: fadd_v16bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; CHECK-NEXT: vle16.v v8, (a1) +; 
CHECK-NEXT: vle16.v v10, (a0) +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vfadd.vv v8, v16, v12 +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v12, v8 +; CHECK-NEXT: vse16.v v12, (a0) +; CHECK-NEXT: ret + %a = load <16 x bfloat>, ptr %x + %b = load <16 x bfloat>, ptr %y + %c = fadd <16 x bfloat> %a, %b + store <16 x bfloat> %c, ptr %x + ret void +} + define void @fadd_v16f16(ptr %x, ptr %y) { ; ZVFH-LABEL: fadd_v16f16: ; ZVFH: # %bb.0: @@ -1282,6 +1845,27 @@ define void @fadd_v4f64(ptr %x, ptr %y) { ret void } +define void @fsub_v16bf16(ptr %x, ptr %y) { +; CHECK-LABEL: fsub_v16bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; CHECK-NEXT: vle16.v v8, (a1) +; CHECK-NEXT: vle16.v v10, (a0) +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vfsub.vv v8, v16, v12 +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v12, v8 +; CHECK-NEXT: vse16.v v12, (a0) +; CHECK-NEXT: ret + %a = load <16 x bfloat>, ptr %x + %b = load <16 x bfloat>, ptr %y + %c = fsub <16 x bfloat> %a, %b + store <16 x bfloat> %c, ptr %x + ret void +} + define void @fsub_v16f16(ptr %x, ptr %y) { ; ZVFH-LABEL: fsub_v16f16: ; ZVFH: # %bb.0: @@ -1344,6 +1928,27 @@ define void @fsub_v4f64(ptr %x, ptr %y) { ret void } +define void @fmul_v16bf16(ptr %x, ptr %y) { +; CHECK-LABEL: fmul_v16bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; CHECK-NEXT: vle16.v v8, (a1) +; CHECK-NEXT: vle16.v v10, (a0) +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vfmul.vv v8, v16, v12 +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v12, v8 +; CHECK-NEXT: vse16.v v12, (a0) +; CHECK-NEXT: ret + %a = load <16 x bfloat>, ptr %x + %b = load <16 x bfloat>, ptr %y + %c = fmul <16 x bfloat> %a, %b + store <16 x bfloat> %c, ptr %x + ret void +} + define void @fmul_v16f16(ptr %x, ptr %y) { ; ZVFH-LABEL: fmul_v16f16: ; ZVFH: # %bb.0: @@ -1406,6 +2011,27 @@ define void @fmul_v4f64(ptr %x, ptr %y) { ret void } +define void @fdiv_v16bf16(ptr %x, ptr %y) { +; CHECK-LABEL: fdiv_v16bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; CHECK-NEXT: vle16.v v8, (a1) +; CHECK-NEXT: vle16.v v10, (a0) +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vfdiv.vv v8, v16, v12 +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v12, v8 +; CHECK-NEXT: vse16.v v12, (a0) +; CHECK-NEXT: ret + %a = load <16 x bfloat>, ptr %x + %b = load <16 x bfloat>, ptr %y + %c = fdiv <16 x bfloat> %a, %b + store <16 x bfloat> %c, ptr %x + ret void +} + define void @fdiv_v16f16(ptr %x, ptr %y) { ; ZVFH-LABEL: fdiv_v16f16: ; ZVFH: # %bb.0: @@ -1468,6 +2094,21 @@ define void @fdiv_v4f64(ptr %x, ptr %y) { ret void } +define void @fneg_v16bf16(ptr %x) { +; CHECK-LABEL: fneg_v16bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: lui a1, 8 +; CHECK-NEXT: vxor.vx v8, v8, a1 +; CHECK-NEXT: vse16.v v8, (a0) +; CHECK-NEXT: ret + %a = load <16 x bfloat>, ptr %x + %b = fneg <16 x bfloat> %a + store <16 x bfloat> %b, ptr %x + ret void +} + define 
void @fneg_v16f16(ptr %x) { ; ZVFH-LABEL: fneg_v16f16: ; ZVFH: # %bb.0: @@ -1519,6 +2160,30 @@ define void @fneg_v4f64(ptr %x) { ret void } +define void @fma_v16bf16(ptr %x, ptr %y, ptr %z) { +; CHECK-LABEL: fma_v16bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; CHECK-NEXT: vle16.v v8, (a2) +; CHECK-NEXT: vle16.v v10, (a0) +; CHECK-NEXT: vle16.v v12, (a1) +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v20, v10 +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v12 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vfmadd.vv v8, v20, v16 +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v12, v8 +; CHECK-NEXT: vse16.v v12, (a0) +; CHECK-NEXT: ret + %a = load <16 x bfloat>, ptr %x + %b = load <16 x bfloat>, ptr %y + %c = load <16 x bfloat>, ptr %z + %d = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> %a, <16 x bfloat> %b, <16 x bfloat> %c) + store <16 x bfloat> %d, ptr %x + ret void +} + define void @fma_v16f16(ptr %x, ptr %y, ptr %z) { ; ZVFH-LABEL: fma_v16f16: ; ZVFH: # %bb.0: @@ -1552,7 +2217,6 @@ define void @fma_v16f16(ptr %x, ptr %y, ptr %z) { store <16 x half> %d, ptr %x ret void } -declare <16 x half> @llvm.fma.v16f16(<16 x half>, <16 x half>, <16 x half>) define void @fma_v8f32(ptr %x, ptr %y, ptr %z) { ; CHECK-LABEL: fma_v8f32: @@ -1571,7 +2235,6 @@ define void @fma_v8f32(ptr %x, ptr %y, ptr %z) { store <8 x float> %d, ptr %x ret void } -declare <8 x float> @llvm.fma.v8f32(<8 x float>, <8 x float>, <8 x float>) define void @fma_v4f64(ptr %x, ptr %y, ptr %z) { ; CHECK-LABEL: fma_v4f64: @@ -1590,7 +2253,53 @@ define void @fma_v4f64(ptr %x, ptr %y, ptr %z) { store <4 x double> %d, ptr %x ret void } -declare <4 x double> @llvm.fma.v4f64(<4 x double>, <4 x double>, <4 x double>) + +define void @fadd_vf_v8bf16(ptr %x, bfloat %y) { +; CHECK-LABEL: fadd_vf_v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: fmv.x.w a1, fa0 +; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfadd.vv v8, v10, v12 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v10, v8 +; CHECK-NEXT: vse16.v v10, (a0) +; CHECK-NEXT: ret + %a = load <8 x bfloat>, ptr %x + %b = insertelement <8 x bfloat> poison, bfloat %y, i32 0 + %c = shufflevector <8 x bfloat> %b, <8 x bfloat> poison, <8 x i32> zeroinitializer + %d = fadd <8 x bfloat> %a, %c + store <8 x bfloat> %d, ptr %x + ret void +} + +define void @fadd_vf_v6bf16(ptr %x, bfloat %y) { +; CHECK-LABEL: fadd_vf_v6bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: fmv.x.w a1, fa0 +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfadd.vv v8, v10, v12 +; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v10, v8 +; CHECK-NEXT: vse16.v v10, (a0) +; CHECK-NEXT: ret + %a = load <6 x bfloat>, ptr %x + %b = insertelement <6 x bfloat> poison, bfloat %y, i32 0 + %c = shufflevector <6 x bfloat> %b, <6 x bfloat> poison, <6 x i32> zeroinitializer + %d = fadd <6 x bfloat> %a, %c + store <6 x bfloat> %d, ptr %x + ret void +} define void @fadd_vf_v8f16(ptr %x, half %y) { ; ZVFH-LABEL: fadd_vf_v8f16: @@ 
-1687,6 +2396,53 @@ define void @fadd_vf_v2f64(ptr %x, double %y) { ret void } +define void @fadd_fv_v8bf16(ptr %x, bfloat %y) { +; CHECK-LABEL: fadd_fv_v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: fmv.x.w a1, fa0 +; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfadd.vv v8, v12, v10 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v10, v8 +; CHECK-NEXT: vse16.v v10, (a0) +; CHECK-NEXT: ret + %a = load <8 x bfloat>, ptr %x + %b = insertelement <8 x bfloat> poison, bfloat %y, i32 0 + %c = shufflevector <8 x bfloat> %b, <8 x bfloat> poison, <8 x i32> zeroinitializer + %d = fadd <8 x bfloat> %c, %a + store <8 x bfloat> %d, ptr %x + ret void +} + +define void @fadd_fv_v6bf16(ptr %x, bfloat %y) { +; CHECK-LABEL: fadd_fv_v6bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: fmv.x.w a1, fa0 +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfadd.vv v8, v12, v10 +; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v10, v8 +; CHECK-NEXT: vse16.v v10, (a0) +; CHECK-NEXT: ret + %a = load <6 x bfloat>, ptr %x + %b = insertelement <6 x bfloat> poison, bfloat %y, i32 0 + %c = shufflevector <6 x bfloat> %b, <6 x bfloat> poison, <6 x i32> zeroinitializer + %d = fadd <6 x bfloat> %c, %a + store <6 x bfloat> %d, ptr %x + ret void +} + define void @fadd_fv_v8f16(ptr %x, half %y) { ; ZVFH-LABEL: fadd_fv_v8f16: ; ZVFH: # %bb.0: @@ -1782,6 +2538,53 @@ define void @fadd_fv_v2f64(ptr %x, double %y) { ret void } +define void @fsub_vf_v8bf16(ptr %x, bfloat %y) { +; CHECK-LABEL: fsub_vf_v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: fmv.x.w a1, fa0 +; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfsub.vv v8, v10, v12 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v10, v8 +; CHECK-NEXT: vse16.v v10, (a0) +; CHECK-NEXT: ret + %a = load <8 x bfloat>, ptr %x + %b = insertelement <8 x bfloat> poison, bfloat %y, i32 0 + %c = shufflevector <8 x bfloat> %b, <8 x bfloat> poison, <8 x i32> zeroinitializer + %d = fsub <8 x bfloat> %a, %c + store <8 x bfloat> %d, ptr %x + ret void +} + +define void @fsub_vf_v6bf16(ptr %x, bfloat %y) { +; CHECK-LABEL: fsub_vf_v6bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: fmv.x.w a1, fa0 +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfsub.vv v8, v10, v12 +; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v10, v8 +; CHECK-NEXT: vse16.v v10, (a0) +; CHECK-NEXT: ret + %a = load <6 x bfloat>, ptr %x + %b = insertelement <6 x bfloat> poison, bfloat %y, i32 0 + %c = shufflevector <6 x bfloat> %b, <6 x bfloat> poison, <6 x i32> zeroinitializer + %d = fsub <6 x bfloat> %a, %c + store <6 
x bfloat> %d, ptr %x + ret void +} + define void @fsub_vf_v8f16(ptr %x, half %y) { ; ZVFH-LABEL: fsub_vf_v8f16: ; ZVFH: # %bb.0: @@ -1877,6 +2680,53 @@ define void @fsub_vf_v2f64(ptr %x, double %y) { ret void } +define void @fsub_fv_v8bf16(ptr %x, bfloat %y) { +; CHECK-LABEL: fsub_fv_v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: fmv.x.w a1, fa0 +; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfsub.vv v8, v12, v10 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v10, v8 +; CHECK-NEXT: vse16.v v10, (a0) +; CHECK-NEXT: ret + %a = load <8 x bfloat>, ptr %x + %b = insertelement <8 x bfloat> poison, bfloat %y, i32 0 + %c = shufflevector <8 x bfloat> %b, <8 x bfloat> poison, <8 x i32> zeroinitializer + %d = fsub <8 x bfloat> %c, %a + store <8 x bfloat> %d, ptr %x + ret void +} + +define void @fsub_fv_v6bf16(ptr %x, bfloat %y) { +; CHECK-LABEL: fsub_fv_v6bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: fmv.x.w a1, fa0 +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfsub.vv v8, v12, v10 +; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v10, v8 +; CHECK-NEXT: vse16.v v10, (a0) +; CHECK-NEXT: ret + %a = load <6 x bfloat>, ptr %x + %b = insertelement <6 x bfloat> poison, bfloat %y, i32 0 + %c = shufflevector <6 x bfloat> %b, <6 x bfloat> poison, <6 x i32> zeroinitializer + %d = fsub <6 x bfloat> %c, %a + store <6 x bfloat> %d, ptr %x + ret void +} + define void @fsub_fv_v8f16(ptr %x, half %y) { ; ZVFH-LABEL: fsub_fv_v8f16: ; ZVFH: # %bb.0: @@ -1972,6 +2822,53 @@ define void @fsub_fv_v2f64(ptr %x, double %y) { ret void } +define void @fmul_vf_v8bf16(ptr %x, bfloat %y) { +; CHECK-LABEL: fmul_vf_v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: fmv.x.w a1, fa0 +; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfmul.vv v8, v10, v12 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v10, v8 +; CHECK-NEXT: vse16.v v10, (a0) +; CHECK-NEXT: ret + %a = load <8 x bfloat>, ptr %x + %b = insertelement <8 x bfloat> poison, bfloat %y, i32 0 + %c = shufflevector <8 x bfloat> %b, <8 x bfloat> poison, <8 x i32> zeroinitializer + %d = fmul <8 x bfloat> %a, %c + store <8 x bfloat> %d, ptr %x + ret void +} + +define void @fmul_vf_v6bf16(ptr %x, bfloat %y) { +; CHECK-LABEL: fmul_vf_v6bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: fmv.x.w a1, fa0 +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfmul.vv v8, v10, v12 +; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v10, v8 +; CHECK-NEXT: vse16.v v10, (a0) +; CHECK-NEXT: ret + %a = load <6 x bfloat>, ptr %x + %b = insertelement <6 x bfloat> poison, bfloat %y, 
i32 0 + %c = shufflevector <6 x bfloat> %b, <6 x bfloat> poison, <6 x i32> zeroinitializer + %d = fmul <6 x bfloat> %a, %c + store <6 x bfloat> %d, ptr %x + ret void +} + define void @fmul_vf_v8f16(ptr %x, half %y) { ; ZVFH-LABEL: fmul_vf_v8f16: ; ZVFH: # %bb.0: @@ -2067,6 +2964,53 @@ define void @fmul_vf_v2f64(ptr %x, double %y) { ret void } +define void @fmul_fv_v8bf16(ptr %x, bfloat %y) { +; CHECK-LABEL: fmul_fv_v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: fmv.x.w a1, fa0 +; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfmul.vv v8, v12, v10 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v10, v8 +; CHECK-NEXT: vse16.v v10, (a0) +; CHECK-NEXT: ret + %a = load <8 x bfloat>, ptr %x + %b = insertelement <8 x bfloat> poison, bfloat %y, i32 0 + %c = shufflevector <8 x bfloat> %b, <8 x bfloat> poison, <8 x i32> zeroinitializer + %d = fmul <8 x bfloat> %c, %a + store <8 x bfloat> %d, ptr %x + ret void +} + +define void @fmul_fv_v6bf16(ptr %x, bfloat %y) { +; CHECK-LABEL: fmul_fv_v6bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: fmv.x.w a1, fa0 +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfmul.vv v8, v12, v10 +; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v10, v8 +; CHECK-NEXT: vse16.v v10, (a0) +; CHECK-NEXT: ret + %a = load <6 x bfloat>, ptr %x + %b = insertelement <6 x bfloat> poison, bfloat %y, i32 0 + %c = shufflevector <6 x bfloat> %b, <6 x bfloat> poison, <6 x i32> zeroinitializer + %d = fmul <6 x bfloat> %c, %a + store <6 x bfloat> %d, ptr %x + ret void +} + define void @fmul_fv_v8f16(ptr %x, half %y) { ; ZVFH-LABEL: fmul_fv_v8f16: ; ZVFH: # %bb.0: @@ -2162,6 +3106,53 @@ define void @fmul_fv_v2f64(ptr %x, double %y) { ret void } +define void @fdiv_vf_v8bf16(ptr %x, bfloat %y) { +; CHECK-LABEL: fdiv_vf_v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: fmv.x.w a1, fa0 +; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfdiv.vv v8, v10, v12 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v10, v8 +; CHECK-NEXT: vse16.v v10, (a0) +; CHECK-NEXT: ret + %a = load <8 x bfloat>, ptr %x + %b = insertelement <8 x bfloat> poison, bfloat %y, i32 0 + %c = shufflevector <8 x bfloat> %b, <8 x bfloat> poison, <8 x i32> zeroinitializer + %d = fdiv <8 x bfloat> %a, %c + store <8 x bfloat> %d, ptr %x + ret void +} + +define void @fdiv_vf_v6bf16(ptr %x, bfloat %y) { +; CHECK-LABEL: fdiv_vf_v6bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: fmv.x.w a1, fa0 +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfdiv.vv v8, v10, v12 +; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v10, v8 +; 
CHECK-NEXT: vse16.v v10, (a0) +; CHECK-NEXT: ret + %a = load <6 x bfloat>, ptr %x + %b = insertelement <6 x bfloat> poison, bfloat %y, i32 0 + %c = shufflevector <6 x bfloat> %b, <6 x bfloat> poison, <6 x i32> zeroinitializer + %d = fdiv <6 x bfloat> %a, %c + store <6 x bfloat> %d, ptr %x + ret void +} + define void @fdiv_vf_v8f16(ptr %x, half %y) { ; ZVFH-LABEL: fdiv_vf_v8f16: ; ZVFH: # %bb.0: @@ -2257,6 +3248,53 @@ define void @fdiv_vf_v2f64(ptr %x, double %y) { ret void } +define void @fdiv_fv_v8bf16(ptr %x, bfloat %y) { +; CHECK-LABEL: fdiv_fv_v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: fmv.x.w a1, fa0 +; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfdiv.vv v8, v12, v10 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v10, v8 +; CHECK-NEXT: vse16.v v10, (a0) +; CHECK-NEXT: ret + %a = load <8 x bfloat>, ptr %x + %b = insertelement <8 x bfloat> poison, bfloat %y, i32 0 + %c = shufflevector <8 x bfloat> %b, <8 x bfloat> poison, <8 x i32> zeroinitializer + %d = fdiv <8 x bfloat> %c, %a + store <8 x bfloat> %d, ptr %x + ret void +} + +define void @fdiv_fv_v6bf16(ptr %x, bfloat %y) { +; CHECK-LABEL: fdiv_fv_v6bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: fmv.x.w a1, fa0 +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfdiv.vv v8, v12, v10 +; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v10, v8 +; CHECK-NEXT: vse16.v v10, (a0) +; CHECK-NEXT: ret + %a = load <6 x bfloat>, ptr %x + %b = insertelement <6 x bfloat> poison, bfloat %y, i32 0 + %c = shufflevector <6 x bfloat> %b, <6 x bfloat> poison, <6 x i32> zeroinitializer + %d = fdiv <6 x bfloat> %c, %a + store <6 x bfloat> %d, ptr %x + ret void +} + define void @fdiv_fv_v8f16(ptr %x, half %y) { ; ZVFH-LABEL: fdiv_fv_v8f16: ; ZVFH: # %bb.0: @@ -2352,6 +3390,59 @@ define void @fdiv_fv_v2f64(ptr %x, double %y) { ret void } +define void @fma_vf_v8bf16(ptr %x, ptr %y, bfloat %z) { +; CHECK-LABEL: fma_vf_v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v8, (a1) +; CHECK-NEXT: vle16.v v9, (a0) +; CHECK-NEXT: fmv.x.w a1, fa0 +; CHECK-NEXT: vmv.v.x v10, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v14, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfmadd.vv v8, v14, v12 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v10, v8 +; CHECK-NEXT: vse16.v v10, (a0) +; CHECK-NEXT: ret + %a = load <8 x bfloat>, ptr %x + %b = load <8 x bfloat>, ptr %y + %c = insertelement <8 x bfloat> poison, bfloat %z, i32 0 + %d = shufflevector <8 x bfloat> %c, <8 x bfloat> poison, <8 x i32> zeroinitializer + %e = call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> %a, <8 x bfloat> %d, <8 x bfloat> %b) + store <8 x bfloat> %e, ptr %x + ret void +} + +define void @fma_vf_v6bf16(ptr %x, ptr %y, bfloat %z) { +; CHECK-LABEL: fma_vf_v6bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v8, (a1) +; CHECK-NEXT: vle16.v v9, (a0) +; CHECK-NEXT: fmv.x.w a1, 
fa0 +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vmv.v.x v10, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v14, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfmadd.vv v8, v14, v12 +; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v10, v8 +; CHECK-NEXT: vse16.v v10, (a0) +; CHECK-NEXT: ret + %a = load <6 x bfloat>, ptr %x + %b = load <6 x bfloat>, ptr %y + %c = insertelement <6 x bfloat> poison, bfloat %z, i32 0 + %d = shufflevector <6 x bfloat> %c, <6 x bfloat> poison, <6 x i32> zeroinitializer + %e = call <6 x bfloat> @llvm.fma.v6bf16(<6 x bfloat> %a, <6 x bfloat> %d, <6 x bfloat> %b) + store <6 x bfloat> %e, ptr %x + ret void +} + define void @fma_vf_v8f16(ptr %x, ptr %y, half %z) { ; ZVFH-LABEL: fma_vf_v8f16: ; ZVFH: # %bb.0: @@ -2459,6 +3550,59 @@ define void @fma_vf_v2f64(ptr %x, ptr %y, double %z) { ret void } +define void @fma_fv_v8bf16(ptr %x, ptr %y, bfloat %z) { +; CHECK-LABEL: fma_fv_v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v8, (a1) +; CHECK-NEXT: vle16.v v9, (a0) +; CHECK-NEXT: fmv.x.w a1, fa0 +; CHECK-NEXT: vmv.v.x v10, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v14, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfmadd.vv v8, v14, v12 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v10, v8 +; CHECK-NEXT: vse16.v v10, (a0) +; CHECK-NEXT: ret + %a = load <8 x bfloat>, ptr %x + %b = load <8 x bfloat>, ptr %y + %c = insertelement <8 x bfloat> poison, bfloat %z, i32 0 + %d = shufflevector <8 x bfloat> %c, <8 x bfloat> poison, <8 x i32> zeroinitializer + %e = call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> %d, <8 x bfloat> %a, <8 x bfloat> %b) + store <8 x bfloat> %e, ptr %x + ret void +} + +define void @fma_fv_v6bf16(ptr %x, ptr %y, bfloat %z) { +; CHECK-LABEL: fma_fv_v6bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v8, (a1) +; CHECK-NEXT: vle16.v v9, (a0) +; CHECK-NEXT: fmv.x.w a1, fa0 +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vmv.v.x v10, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v14, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfmadd.vv v8, v14, v12 +; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v10, v8 +; CHECK-NEXT: vse16.v v10, (a0) +; CHECK-NEXT: ret + %a = load <6 x bfloat>, ptr %x + %b = load <6 x bfloat>, ptr %y + %c = insertelement <6 x bfloat> poison, bfloat %z, i32 0 + %d = shufflevector <6 x bfloat> %c, <6 x bfloat> poison, <6 x i32> zeroinitializer + %e = call <6 x bfloat> @llvm.fma.v6bf16(<6 x bfloat> %d, <6 x bfloat> %a, <6 x bfloat> %b) + store <6 x bfloat> %e, ptr %x + ret void +} + define void @fma_fv_v8f16(ptr %x, ptr %y, half %z) { ; ZVFH-LABEL: fma_fv_v8f16: ; ZVFH: # %bb.0: @@ -2566,6 +3710,65 @@ define void @fma_fv_v2f64(ptr %x, ptr %y, double %z) { ret void } +define void @fmsub_vf_v8bf16(ptr %x, ptr %y, bfloat %z) { +; CHECK-LABEL: fmsub_vf_v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.w a2, fa0 +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v8, (a1) +; CHECK-NEXT: vle16.v v9, (a0) +; CHECK-NEXT: vmv.v.x v10, a2 +; CHECK-NEXT: lui a1, 8 +; CHECK-NEXT: vxor.vx v8, v8, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v 
v12, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v14, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfmadd.vv v8, v14, v12 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v10, v8 +; CHECK-NEXT: vse16.v v10, (a0) +; CHECK-NEXT: ret + %a = load <8 x bfloat>, ptr %x + %b = load <8 x bfloat>, ptr %y + %c = insertelement <8 x bfloat> poison, bfloat %z, i32 0 + %d = shufflevector <8 x bfloat> %c, <8 x bfloat> poison, <8 x i32> zeroinitializer + %neg = fneg <8 x bfloat> %b + %e = call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> %a, <8 x bfloat> %d, <8 x bfloat> %neg) + store <8 x bfloat> %e, ptr %x + ret void +} + +define void @fmsub_vf_v6bf16(ptr %x, ptr %y, bfloat %z) { +; CHECK-LABEL: fmsub_vf_v6bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.w a2, fa0 +; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v8, (a1) +; CHECK-NEXT: vle16.v v9, (a0) +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vmv.v.x v10, a2 +; CHECK-NEXT: lui a1, 8 +; CHECK-NEXT: vxor.vx v8, v8, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v14, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfmadd.vv v8, v14, v12 +; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v10, v8 +; CHECK-NEXT: vse16.v v10, (a0) +; CHECK-NEXT: ret + %a = load <6 x bfloat>, ptr %x + %b = load <6 x bfloat>, ptr %y + %c = insertelement <6 x bfloat> poison, bfloat %z, i32 0 + %d = shufflevector <6 x bfloat> %c, <6 x bfloat> poison, <6 x i32> zeroinitializer + %neg = fneg <6 x bfloat> %b + %e = call <6 x bfloat> @llvm.fma.v6bf16(<6 x bfloat> %a, <6 x bfloat> %d, <6 x bfloat> %neg) + store <6 x bfloat> %e, ptr %x + ret void +} + define void @fmsub_vf_v8f16(ptr %x, ptr %y, half %z) { ; ZVFH-LABEL: fmsub_vf_v8f16: ; ZVFH: # %bb.0: @@ -2721,13 +3924,64 @@ define void @fnmadd_fv_v2f64(ptr %x, ptr %y, double %z) { ret void } +define void @trunc_v8bf16(ptr %x) { +; CHECK-LABEL: trunc_v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfabs.v v8, v10 +; CHECK-NEXT: lui a1, 307200 +; CHECK-NEXT: fmv.w.x fa5, a1 +; CHECK-NEXT: vmflt.vf v0, v8, fa5 +; CHECK-NEXT: vfcvt.rtz.x.f.v v8, v10, v0.t +; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu +; CHECK-NEXT: vfsgnj.vv v10, v8, v10, v0.t +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10 +; CHECK-NEXT: vse16.v v8, (a0) +; CHECK-NEXT: ret + %a = load <8 x bfloat>, ptr %x + %b = call <8 x bfloat> @llvm.trunc.v8bf16(<8 x bfloat> %a) + store <8 x bfloat> %b, ptr %x + ret void +} + +define void @trunc_v6bf16(ptr %x) { +; CHECK-LABEL: trunc_v6bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfabs.v v8, v10 +; CHECK-NEXT: lui a1, 307200 +; CHECK-NEXT: fmv.w.x fa5, a1 +; CHECK-NEXT: vmflt.vf v0, v8, fa5 +; CHECK-NEXT: vfcvt.rtz.x.f.v v8, v10, v0.t +; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu +; CHECK-NEXT: vfsgnj.vv v10, v8, v10, v0.t +; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma +; CHECK-NEXT: 
vfncvtbf16.f.f.w v8, v10 +; CHECK-NEXT: vse16.v v8, (a0) +; CHECK-NEXT: ret + %a = load <6 x bfloat>, ptr %x + %b = call <6 x bfloat> @llvm.trunc.v6bf16(<6 x bfloat> %a) + store <6 x bfloat> %b, ptr %x + ret void +} + define void @trunc_v8f16(ptr %x) { ; ZVFH-LABEL: trunc_v8f16: ; ZVFH: # %bb.0: ; ZVFH-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; ZVFH-NEXT: vle16.v v8, (a0) -; ZVFH-NEXT: lui a1, %hi(.LCPI115_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI115_0)(a1) +; ZVFH-NEXT: lui a1, %hi(.LCPI171_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI171_0)(a1) ; ZVFH-NEXT: vfabs.v v9, v8 ; ZVFH-NEXT: vmflt.vf v0, v9, fa5 ; ZVFH-NEXT: vfcvt.rtz.x.f.v v9, v8, v0.t @@ -2760,15 +4014,14 @@ define void @trunc_v8f16(ptr %x) { store <8 x half> %b, ptr %x ret void } -declare <8 x half> @llvm.trunc.v8f16(<8 x half>) define void @trunc_v6f16(ptr %x) { ; ZVFH-LABEL: trunc_v6f16: ; ZVFH: # %bb.0: ; ZVFH-NEXT: vsetivli zero, 6, e16, m1, ta, ma ; ZVFH-NEXT: vle16.v v8, (a0) -; ZVFH-NEXT: lui a1, %hi(.LCPI116_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI116_0)(a1) +; ZVFH-NEXT: lui a1, %hi(.LCPI172_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI172_0)(a1) ; ZVFH-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 ; ZVFH-NEXT: vmflt.vf v0, v9, fa5 @@ -2803,7 +4056,6 @@ define void @trunc_v6f16(ptr %x) { store <6 x half> %b, ptr %x ret void } -declare <6 x half> @llvm.trunc.v6f16(<6 x half>) define void @trunc_v4f32(ptr %x) { ; CHECK-LABEL: trunc_v4f32: @@ -2825,15 +4077,14 @@ define void @trunc_v4f32(ptr %x) { store <4 x float> %b, ptr %x ret void } -declare <4 x float> @llvm.trunc.v4f32(<4 x float>) define void @trunc_v2f64(ptr %x) { ; CHECK-LABEL: trunc_v2f64: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: lui a1, %hi(.LCPI118_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI118_0)(a1) +; CHECK-NEXT: lui a1, %hi(.LCPI174_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI174_0)(a1) ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: vfcvt.rtz.x.f.v v9, v8, v0.t @@ -2847,15 +4098,69 @@ define void @trunc_v2f64(ptr %x) { store <2 x double> %b, ptr %x ret void } -declare <2 x double> @llvm.trunc.v2f64(<2 x double>) + +define void @ceil_v8bf16(ptr %x) { +; CHECK-LABEL: ceil_v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfabs.v v8, v10 +; CHECK-NEXT: lui a1, 307200 +; CHECK-NEXT: fmv.w.x fa5, a1 +; CHECK-NEXT: vmflt.vf v0, v8, fa5 +; CHECK-NEXT: fsrmi a1, 3 +; CHECK-NEXT: vfcvt.x.f.v v8, v10, v0.t +; CHECK-NEXT: fsrm a1 +; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu +; CHECK-NEXT: vfsgnj.vv v10, v8, v10, v0.t +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10 +; CHECK-NEXT: vse16.v v8, (a0) +; CHECK-NEXT: ret + %a = load <8 x bfloat>, ptr %x + %b = call <8 x bfloat> @llvm.ceil.v8bf16(<8 x bfloat> %a) + store <8 x bfloat> %b, ptr %x + ret void +} + +define void @ceil_v6bf16(ptr %x) { +; CHECK-LABEL: ceil_v6bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfabs.v v8, v10 +; CHECK-NEXT: lui a1, 307200 +; CHECK-NEXT: fmv.w.x fa5, a1 +; CHECK-NEXT: vmflt.vf v0, v8, fa5 +; CHECK-NEXT: fsrmi a1, 3 +; CHECK-NEXT: vfcvt.x.f.v v8, v10, 
v0.t +; CHECK-NEXT: fsrm a1 +; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu +; CHECK-NEXT: vfsgnj.vv v10, v8, v10, v0.t +; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10 +; CHECK-NEXT: vse16.v v8, (a0) +; CHECK-NEXT: ret + %a = load <6 x bfloat>, ptr %x + %b = call <6 x bfloat> @llvm.ceil.v6bf16(<6 x bfloat> %a) + store <6 x bfloat> %b, ptr %x + ret void +} define void @ceil_v8f16(ptr %x) { ; ZVFH-LABEL: ceil_v8f16: ; ZVFH: # %bb.0: ; ZVFH-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; ZVFH-NEXT: vle16.v v8, (a0) -; ZVFH-NEXT: lui a1, %hi(.LCPI119_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI119_0)(a1) +; ZVFH-NEXT: lui a1, %hi(.LCPI177_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI177_0)(a1) ; ZVFH-NEXT: vfabs.v v9, v8 ; ZVFH-NEXT: vmflt.vf v0, v9, fa5 ; ZVFH-NEXT: fsrmi a1, 3 @@ -2892,15 +4197,14 @@ define void @ceil_v8f16(ptr %x) { store <8 x half> %b, ptr %x ret void } -declare <8 x half> @llvm.ceil.v8f16(<8 x half>) define void @ceil_v6f16(ptr %x) { ; ZVFH-LABEL: ceil_v6f16: ; ZVFH: # %bb.0: ; ZVFH-NEXT: vsetivli zero, 6, e16, m1, ta, ma ; ZVFH-NEXT: vle16.v v8, (a0) -; ZVFH-NEXT: lui a1, %hi(.LCPI120_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI120_0)(a1) +; ZVFH-NEXT: lui a1, %hi(.LCPI178_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI178_0)(a1) ; ZVFH-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 ; ZVFH-NEXT: vmflt.vf v0, v9, fa5 @@ -2939,7 +4243,6 @@ define void @ceil_v6f16(ptr %x) { store <6 x half> %b, ptr %x ret void } -declare <6 x half> @llvm.ceil.v6f16(<6 x half>) define void @ceil_v4f32(ptr %x) { ; CHECK-LABEL: ceil_v4f32: @@ -2963,15 +4266,14 @@ define void @ceil_v4f32(ptr %x) { store <4 x float> %b, ptr %x ret void } -declare <4 x float> @llvm.ceil.v4f32(<4 x float>) define void @ceil_v2f64(ptr %x) { ; CHECK-LABEL: ceil_v2f64: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: lui a1, %hi(.LCPI122_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI122_0)(a1) +; CHECK-NEXT: lui a1, %hi(.LCPI180_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI180_0)(a1) ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a1, 3 @@ -2987,15 +4289,69 @@ define void @ceil_v2f64(ptr %x) { store <2 x double> %b, ptr %x ret void } -declare <2 x double> @llvm.ceil.v2f64(<2 x double>) + +define void @floor_v8bf16(ptr %x) { +; CHECK-LABEL: floor_v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfabs.v v8, v10 +; CHECK-NEXT: lui a1, 307200 +; CHECK-NEXT: fmv.w.x fa5, a1 +; CHECK-NEXT: vmflt.vf v0, v8, fa5 +; CHECK-NEXT: fsrmi a1, 2 +; CHECK-NEXT: vfcvt.x.f.v v8, v10, v0.t +; CHECK-NEXT: fsrm a1 +; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu +; CHECK-NEXT: vfsgnj.vv v10, v8, v10, v0.t +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10 +; CHECK-NEXT: vse16.v v8, (a0) +; CHECK-NEXT: ret + %a = load <8 x bfloat>, ptr %x + %b = call <8 x bfloat> @llvm.floor.v8bf16(<8 x bfloat> %a) + store <8 x bfloat> %b, ptr %x + ret void +} + +define void @floor_v6bf16(ptr %x) { +; CHECK-LABEL: floor_v6bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; 
CHECK-NEXT: vfabs.v v8, v10 +; CHECK-NEXT: lui a1, 307200 +; CHECK-NEXT: fmv.w.x fa5, a1 +; CHECK-NEXT: vmflt.vf v0, v8, fa5 +; CHECK-NEXT: fsrmi a1, 2 +; CHECK-NEXT: vfcvt.x.f.v v8, v10, v0.t +; CHECK-NEXT: fsrm a1 +; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu +; CHECK-NEXT: vfsgnj.vv v10, v8, v10, v0.t +; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10 +; CHECK-NEXT: vse16.v v8, (a0) +; CHECK-NEXT: ret + %a = load <6 x bfloat>, ptr %x + %b = call <6 x bfloat> @llvm.floor.v6bf16(<6 x bfloat> %a) + store <6 x bfloat> %b, ptr %x + ret void +} define void @floor_v8f16(ptr %x) { ; ZVFH-LABEL: floor_v8f16: ; ZVFH: # %bb.0: ; ZVFH-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; ZVFH-NEXT: vle16.v v8, (a0) -; ZVFH-NEXT: lui a1, %hi(.LCPI123_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI123_0)(a1) +; ZVFH-NEXT: lui a1, %hi(.LCPI183_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI183_0)(a1) ; ZVFH-NEXT: vfabs.v v9, v8 ; ZVFH-NEXT: vmflt.vf v0, v9, fa5 ; ZVFH-NEXT: fsrmi a1, 2 @@ -3032,15 +4388,14 @@ define void @floor_v8f16(ptr %x) { store <8 x half> %b, ptr %x ret void } -declare <8 x half> @llvm.floor.v8f16(<8 x half>) define void @floor_v6f16(ptr %x) { ; ZVFH-LABEL: floor_v6f16: ; ZVFH: # %bb.0: ; ZVFH-NEXT: vsetivli zero, 6, e16, m1, ta, ma ; ZVFH-NEXT: vle16.v v8, (a0) -; ZVFH-NEXT: lui a1, %hi(.LCPI124_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI124_0)(a1) +; ZVFH-NEXT: lui a1, %hi(.LCPI184_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI184_0)(a1) ; ZVFH-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 ; ZVFH-NEXT: vmflt.vf v0, v9, fa5 @@ -3079,7 +4434,6 @@ define void @floor_v6f16(ptr %x) { store <6 x half> %b, ptr %x ret void } -declare <6 x half> @llvm.floor.v6f16(<6 x half>) define void @floor_v4f32(ptr %x) { ; CHECK-LABEL: floor_v4f32: @@ -3103,15 +4457,14 @@ define void @floor_v4f32(ptr %x) { store <4 x float> %b, ptr %x ret void } -declare <4 x float> @llvm.floor.v4f32(<4 x float>) define void @floor_v2f64(ptr %x) { ; CHECK-LABEL: floor_v2f64: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: lui a1, %hi(.LCPI126_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI126_0)(a1) +; CHECK-NEXT: lui a1, %hi(.LCPI186_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI186_0)(a1) ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a1, 2 @@ -3127,15 +4480,69 @@ define void @floor_v2f64(ptr %x) { store <2 x double> %b, ptr %x ret void } -declare <2 x double> @llvm.floor.v2f64(<2 x double>) + +define void @round_v8bf16(ptr %x) { +; CHECK-LABEL: round_v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfabs.v v8, v10 +; CHECK-NEXT: lui a1, 307200 +; CHECK-NEXT: fmv.w.x fa5, a1 +; CHECK-NEXT: vmflt.vf v0, v8, fa5 +; CHECK-NEXT: fsrmi a1, 4 +; CHECK-NEXT: vfcvt.x.f.v v8, v10, v0.t +; CHECK-NEXT: fsrm a1 +; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu +; CHECK-NEXT: vfsgnj.vv v10, v8, v10, v0.t +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10 +; CHECK-NEXT: vse16.v v8, (a0) +; CHECK-NEXT: ret + %a = load <8 x bfloat>, ptr %x + %b = call <8 x bfloat> @llvm.round.v8bf16(<8 x bfloat> %a) + store <8 x bfloat> %b, ptr %x + ret void +} + +define void @round_v6bf16(ptr %x) { +; CHECK-LABEL: round_v6bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 
6, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfabs.v v8, v10 +; CHECK-NEXT: lui a1, 307200 +; CHECK-NEXT: fmv.w.x fa5, a1 +; CHECK-NEXT: vmflt.vf v0, v8, fa5 +; CHECK-NEXT: fsrmi a1, 4 +; CHECK-NEXT: vfcvt.x.f.v v8, v10, v0.t +; CHECK-NEXT: fsrm a1 +; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu +; CHECK-NEXT: vfsgnj.vv v10, v8, v10, v0.t +; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10 +; CHECK-NEXT: vse16.v v8, (a0) +; CHECK-NEXT: ret + %a = load <6 x bfloat>, ptr %x + %b = call <6 x bfloat> @llvm.round.v6bf16(<6 x bfloat> %a) + store <6 x bfloat> %b, ptr %x + ret void +} define void @round_v8f16(ptr %x) { ; ZVFH-LABEL: round_v8f16: ; ZVFH: # %bb.0: ; ZVFH-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; ZVFH-NEXT: vle16.v v8, (a0) -; ZVFH-NEXT: lui a1, %hi(.LCPI127_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI127_0)(a1) +; ZVFH-NEXT: lui a1, %hi(.LCPI189_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI189_0)(a1) ; ZVFH-NEXT: vfabs.v v9, v8 ; ZVFH-NEXT: vmflt.vf v0, v9, fa5 ; ZVFH-NEXT: fsrmi a1, 4 @@ -3172,15 +4579,14 @@ define void @round_v8f16(ptr %x) { store <8 x half> %b, ptr %x ret void } -declare <8 x half> @llvm.round.v8f16(<8 x half>) define void @round_v6f16(ptr %x) { ; ZVFH-LABEL: round_v6f16: ; ZVFH: # %bb.0: ; ZVFH-NEXT: vsetivli zero, 6, e16, m1, ta, ma ; ZVFH-NEXT: vle16.v v8, (a0) -; ZVFH-NEXT: lui a1, %hi(.LCPI128_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI128_0)(a1) +; ZVFH-NEXT: lui a1, %hi(.LCPI190_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI190_0)(a1) ; ZVFH-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 ; ZVFH-NEXT: vmflt.vf v0, v9, fa5 @@ -3219,7 +4625,6 @@ define void @round_v6f16(ptr %x) { store <6 x half> %b, ptr %x ret void } -declare <6 x half> @llvm.round.v6f16(<6 x half>) define void @round_v4f32(ptr %x) { ; CHECK-LABEL: round_v4f32: @@ -3243,15 +4648,14 @@ define void @round_v4f32(ptr %x) { store <4 x float> %b, ptr %x ret void } -declare <4 x float> @llvm.round.v4f32(<4 x float>) define void @round_v2f64(ptr %x) { ; CHECK-LABEL: round_v2f64: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: lui a1, %hi(.LCPI130_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI130_0)(a1) +; CHECK-NEXT: lui a1, %hi(.LCPI192_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI192_0)(a1) ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a1, 4 @@ -3267,15 +4671,39 @@ define void @round_v2f64(ptr %x) { store <2 x double> %b, ptr %x ret void } -declare <2 x double> @llvm.round.v2f64(<2 x double>) + +define void @rint_v8bf16(ptr %x) { +; CHECK-LABEL: rint_v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfabs.v v8, v10 +; CHECK-NEXT: lui a1, 307200 +; CHECK-NEXT: fmv.w.x fa5, a1 +; CHECK-NEXT: vmflt.vf v0, v8, fa5 +; CHECK-NEXT: vfcvt.x.f.v v8, v10, v0.t +; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu +; CHECK-NEXT: vfsgnj.vv v10, v8, v10, v0.t +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10 +; CHECK-NEXT: vse16.v v8, (a0) +; CHECK-NEXT: ret + %a = load <8 x bfloat>, ptr %x + %b = call <8 x bfloat> @llvm.rint.v8bf16(<8 x bfloat> %a) + store <8 x bfloat> 
%b, ptr %x + ret void +} define void @rint_v8f16(ptr %x) { ; ZVFH-LABEL: rint_v8f16: ; ZVFH: # %bb.0: ; ZVFH-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; ZVFH-NEXT: vle16.v v8, (a0) -; ZVFH-NEXT: lui a1, %hi(.LCPI131_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI131_0)(a1) +; ZVFH-NEXT: lui a1, %hi(.LCPI194_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI194_0)(a1) ; ZVFH-NEXT: vfabs.v v9, v8 ; ZVFH-NEXT: vmflt.vf v0, v9, fa5 ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t @@ -3308,7 +4736,6 @@ define void @rint_v8f16(ptr %x) { store <8 x half> %b, ptr %x ret void } -declare <8 x half> @llvm.rint.v8f16(<8 x half>) define void @rint_v4f32(ptr %x) { ; CHECK-LABEL: rint_v4f32: @@ -3330,15 +4757,14 @@ define void @rint_v4f32(ptr %x) { store <4 x float> %b, ptr %x ret void } -declare <4 x float> @llvm.rint.v4f32(<4 x float>) define void @rint_v2f64(ptr %x) { ; CHECK-LABEL: rint_v2f64: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: lui a1, %hi(.LCPI133_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI133_0)(a1) +; CHECK-NEXT: lui a1, %hi(.LCPI196_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI196_0)(a1) ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t @@ -3352,15 +4778,41 @@ define void @rint_v2f64(ptr %x) { store <2 x double> %b, ptr %x ret void } -declare <2 x double> @llvm.rint.v2f64(<2 x double>) + +define void @nearbyint_v8bf16(ptr %x) { +; CHECK-LABEL: nearbyint_v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfabs.v v8, v10 +; CHECK-NEXT: lui a1, 307200 +; CHECK-NEXT: fmv.w.x fa5, a1 +; CHECK-NEXT: vmflt.vf v0, v8, fa5 +; CHECK-NEXT: frflags a1 +; CHECK-NEXT: vfcvt.x.f.v v8, v10, v0.t +; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t +; CHECK-NEXT: fsflags a1 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu +; CHECK-NEXT: vfsgnj.vv v10, v8, v10, v0.t +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10 +; CHECK-NEXT: vse16.v v8, (a0) +; CHECK-NEXT: ret + %a = load <8 x bfloat>, ptr %x + %b = call <8 x bfloat> @llvm.nearbyint.v8bf16(<8 x bfloat> %a) + store <8 x bfloat> %b, ptr %x + ret void +} define void @nearbyint_v8f16(ptr %x) { ; ZVFH-LABEL: nearbyint_v8f16: ; ZVFH: # %bb.0: ; ZVFH-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; ZVFH-NEXT: vle16.v v8, (a0) -; ZVFH-NEXT: lui a1, %hi(.LCPI134_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI134_0)(a1) +; ZVFH-NEXT: lui a1, %hi(.LCPI198_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI198_0)(a1) ; ZVFH-NEXT: vfabs.v v9, v8 ; ZVFH-NEXT: vmflt.vf v0, v9, fa5 ; ZVFH-NEXT: frflags a1 @@ -3397,7 +4849,6 @@ define void @nearbyint_v8f16(ptr %x) { store <8 x half> %b, ptr %x ret void } -declare <8 x half> @llvm.nearbyint.v8f16(<8 x half>) define void @nearbyint_v4f32(ptr %x) { ; CHECK-LABEL: nearbyint_v4f32: @@ -3421,15 +4872,14 @@ define void @nearbyint_v4f32(ptr %x) { store <4 x float> %b, ptr %x ret void } -declare <4 x float> @llvm.nearbyint.v4f32(<4 x float>) define void @nearbyint_v2f64(ptr %x) { ; CHECK-LABEL: nearbyint_v2f64: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: lui a1, %hi(.LCPI136_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI136_0)(a1) +; CHECK-NEXT: lui a1, %hi(.LCPI200_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI200_0)(a1) ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: frflags a1 @@ -3445,7 +4895,65 @@ define void 
@nearbyint_v2f64(ptr %x) { store <2 x double> %b, ptr %x ret void } -declare <2 x double> @llvm.nearbyint.v2f64(<2 x double>) + +define void @fmuladd_v8bf16(ptr %x, ptr %y, ptr %z) { +; CHECK-LABEL: fmuladd_v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v8, (a1) +; CHECK-NEXT: vle16.v v9, (a0) +; CHECK-NEXT: vle16.v v10, (a2) +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v14, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfmul.vv v8, v14, v12 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v11, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v11 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfadd.vv v8, v8, v12 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v10, v8 +; CHECK-NEXT: vse16.v v10, (a0) +; CHECK-NEXT: ret + %a = load <8 x bfloat>, ptr %x + %b = load <8 x bfloat>, ptr %y + %c = load <8 x bfloat>, ptr %z + %d = call <8 x bfloat> @llvm.fmuladd.v8bf16(<8 x bfloat> %a, <8 x bfloat> %b, <8 x bfloat> %c) + store <8 x bfloat> %d, ptr %x + ret void +} + +define void @fmuladd_v6bf16(ptr %x, ptr %y, ptr %z) { +; CHECK-LABEL: fmuladd_v6bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v8, (a1) +; CHECK-NEXT: vle16.v v9, (a0) +; CHECK-NEXT: vle16.v v10, (a2) +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v14, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfmul.vv v8, v14, v12 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v11, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v11 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfadd.vv v8, v8, v12 +; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v10, v8 +; CHECK-NEXT: vse16.v v10, (a0) +; CHECK-NEXT: ret + %a = load <6 x bfloat>, ptr %x + %b = load <6 x bfloat>, ptr %y + %c = load <6 x bfloat>, ptr %z + %d = call <6 x bfloat> @llvm.fmuladd.v6bf16(<6 x bfloat> %a, <6 x bfloat> %b, <6 x bfloat> %c) + store <6 x bfloat> %d, ptr %x + ret void +} define void @fmuladd_v8f16(ptr %x, ptr %y, ptr %z) { ; ZVFH-LABEL: fmuladd_v8f16: @@ -3485,7 +4993,6 @@ define void @fmuladd_v8f16(ptr %x, ptr %y, ptr %z) { store <8 x half> %d, ptr %x ret void } -declare <8 x half> @llvm.fmuladd.v8f16(<8 x half>, <8 x half>, <8 x half>) define void @fmuladd_v6f16(ptr %x, ptr %y, ptr %z) { ; ZVFH-LABEL: fmuladd_v6f16: @@ -3526,7 +5033,6 @@ define void @fmuladd_v6f16(ptr %x, ptr %y, ptr %z) { store <6 x half> %d, ptr %x ret void } -declare <6 x half> @llvm.fmuladd.v6f16(<6 x half>, <6 x half>, <6 x half>) define void @fmuladd_v4f32(ptr %x, ptr %y, ptr %z) { ; CHECK-LABEL: fmuladd_v4f32: @@ -3545,7 +5051,6 @@ define void @fmuladd_v4f32(ptr %x, ptr %y, ptr %z) { store <4 x float> %d, ptr %x ret void } -declare <4 x float> @llvm.fmuladd.v4f32(<4 x float>, <4 x float>, <4 x float>) define void @fmuladd_v2f64(ptr %x, ptr %y, ptr %z) { ; CHECK-LABEL: fmuladd_v2f64: @@ -3564,7 +5069,67 @@ define void @fmuladd_v2f64(ptr %x, ptr %y, ptr %z) { store <2 x double> %d, ptr %x ret void } -declare <2 x double> @llvm.fmuladd.v2f64(<2 x double>, <2 x double>, <2 x double>) + +define void @fmsub_fmuladd_v8bf16(ptr %x, ptr %y, ptr %z) { +; CHECK-LABEL: fmsub_fmuladd_v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli 
zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v8, (a1) +; CHECK-NEXT: vle16.v v9, (a0) +; CHECK-NEXT: vle16.v v10, (a2) +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v14, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfmul.vv v8, v14, v12 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v11, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v11 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfsub.vv v8, v8, v12 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v10, v8 +; CHECK-NEXT: vse16.v v10, (a0) +; CHECK-NEXT: ret + %a = load <8 x bfloat>, ptr %x + %b = load <8 x bfloat>, ptr %y + %c = load <8 x bfloat>, ptr %z + %neg = fneg <8 x bfloat> %c + %d = call <8 x bfloat> @llvm.fmuladd.v8bf16(<8 x bfloat> %a, <8 x bfloat> %b, <8 x bfloat> %neg) + store <8 x bfloat> %d, ptr %x + ret void +} + +define void @fmsub_fmuladd_v6bf16(ptr %x, ptr %y, ptr %z) { +; CHECK-LABEL: fmsub_fmuladd_v6bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v8, (a1) +; CHECK-NEXT: vle16.v v9, (a0) +; CHECK-NEXT: vle16.v v10, (a2) +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v14, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfmul.vv v8, v14, v12 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v11, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v11 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfsub.vv v8, v8, v12 +; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v10, v8 +; CHECK-NEXT: vse16.v v10, (a0) +; CHECK-NEXT: ret + %a = load <6 x bfloat>, ptr %x + %b = load <6 x bfloat>, ptr %y + %c = load <6 x bfloat>, ptr %z + %neg = fneg <6 x bfloat> %c + %d = call <6 x bfloat> @llvm.fmuladd.v6bf16(<6 x bfloat> %a, <6 x bfloat> %b, <6 x bfloat> %neg) + store <6 x bfloat> %d, ptr %x + ret void +} define void @fmsub_fmuladd_v8f16(ptr %x, ptr %y, ptr %z) { ; ZVFH-LABEL: fmsub_fmuladd_v8f16: diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-splice.ll b/llvm/test/CodeGen/RISCV/rvv/vector-splice.ll index 8cb6fed2f588a77293b696552a418460eb16b299..5460caea196cf8daf4fb1a511026a6a45b9326d1 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vector-splice.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vector-splice.ll @@ -1,6 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple riscv32 -mattr=+m,+f,+d,+v,+zfh,+zvfh < %s | FileCheck %s -; RUN: llc -mtriple riscv64 -mattr=+m,+f,+d,+v,+zfh,+zvfh < %s | FileCheck %s +; RUN: llc -mtriple riscv32 -mattr=+m,+f,+d,+v,+zvfhmin,+zvfbfmin < %s | FileCheck %s +; RUN: llc -mtriple riscv64 -mattr=+m,+f,+d,+v,+zvfhmin,+zvfbfmin < %s | FileCheck %s +; RUN: llc -mtriple riscv32 -mattr=+m,+f,+d,+v,+zvfh,+zvfbfmin < %s | FileCheck %s +; RUN: llc -mtriple riscv64 -mattr=+m,+f,+d,+v,+zvfh,+zvfbfmin < %s | FileCheck %s ; Tests assume VLEN=128 or vscale_range_min=2. 
@@ -1533,6 +1535,333 @@ define <vscale x 8 x i64> @splice_nxv8i64_offset_max(<vscale x 8 x i64> %a, <vs
   ret <vscale x 8 x i64> %res
 }
 
+declare <vscale x 1 x bfloat> @llvm.vector.splice.nxv1bf16(<vscale x 1 x bfloat>, <vscale x 1 x bfloat>, i32)
+
+define <vscale x 1 x bfloat> @splice_nxv1bf16_offset_zero(<vscale x 1 x bfloat> %a, <vscale x 1 x bfloat> %b) #0 {
+; CHECK-LABEL: splice_nxv1bf16_offset_zero:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    ret
+  %res = call <vscale x 1 x bfloat> @llvm.vector.splice.nxv1bf16(<vscale x 1 x bfloat> %a, <vscale x 1 x bfloat> %b, i32 0)
+  ret <vscale x 1 x bfloat> %res
+}
+
+define <vscale x 1 x bfloat> @splice_nxv1bf16_offset_negone(<vscale x 1 x bfloat> %a, <vscale x 1 x bfloat> %b) #0 {
+; CHECK-LABEL: splice_nxv1bf16_offset_negone:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    srli a0, a0, 3
+; CHECK-NEXT:    addi a0, a0, -1
+; CHECK-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; CHECK-NEXT:    vslidedown.vx v8, v8, a0
+; CHECK-NEXT:    vslideup.vi v8, v9, 1
+; CHECK-NEXT:    ret
+  %res = call <vscale x 1 x bfloat> @llvm.vector.splice.nxv1bf16(<vscale x 1 x bfloat> %a, <vscale x 1 x bfloat> %b, i32 -1)
+  ret <vscale x 1 x bfloat> %res
+}
+
+define <vscale x 1 x bfloat> @splice_nxv1bf16_offset_min(<vscale x 1 x bfloat> %a, <vscale x 1 x bfloat> %b) #0 {
+; CHECK-LABEL: splice_nxv1bf16_offset_min:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    srli a0, a0, 3
+; CHECK-NEXT:    addi a0, a0, -2
+; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
+; CHECK-NEXT:    vslidedown.vx v8, v8, a0
+; CHECK-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
+; CHECK-NEXT:    vslideup.vi v8, v9, 2
+; CHECK-NEXT:    ret
+  %res = call <vscale x 1 x bfloat> @llvm.vector.splice.nxv1bf16(<vscale x 1 x bfloat> %a, <vscale x 1 x bfloat> %b, i32 -2)
+  ret <vscale x 1 x bfloat> %res
+}
+
+define <vscale x 1 x bfloat> @splice_nxv1bf16_offset_max(<vscale x 1 x bfloat> %a, <vscale x 1 x bfloat> %b) #0 {
+; CHECK-LABEL: splice_nxv1bf16_offset_max:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    srli a0, a0, 3
+; CHECK-NEXT:    addi a0, a0, -1
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-NEXT:    vslidedown.vi v8, v8, 1
+; CHECK-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; CHECK-NEXT:    vslideup.vx v8, v9, a0
+; CHECK-NEXT:    ret
+  %res = call <vscale x 1 x bfloat> @llvm.vector.splice.nxv1bf16(<vscale x 1 x bfloat> %a, <vscale x 1 x bfloat> %b, i32 1)
+  ret <vscale x 1 x bfloat> %res
+}
+
+declare <vscale x 2 x bfloat> @llvm.vector.splice.nxv2bf16(<vscale x 2 x bfloat>, <vscale x 2 x bfloat>, i32)
+
+define <vscale x 2 x bfloat> @splice_nxv2bf16_offset_zero(<vscale x 2 x bfloat> %a, <vscale x 2 x bfloat> %b) #0 {
+; CHECK-LABEL: splice_nxv2bf16_offset_zero:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    ret
+  %res = call <vscale x 2 x bfloat> @llvm.vector.splice.nxv2bf16(<vscale x 2 x bfloat> %a, <vscale x 2 x bfloat> %b, i32 0)
+  ret <vscale x 2 x bfloat> %res
+}
+
+define <vscale x 2 x bfloat> @splice_nxv2bf16_offset_negone(<vscale x 2 x bfloat> %a, <vscale x 2 x bfloat> %b) #0 {
+; CHECK-LABEL: splice_nxv2bf16_offset_negone:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    srli a0, a0, 2
+; CHECK-NEXT:    addi a0, a0, -1
+; CHECK-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
+; CHECK-NEXT:    vslidedown.vx v8, v8, a0
+; CHECK-NEXT:    vslideup.vi v8, v9, 1
+; CHECK-NEXT:    ret
+  %res = call <vscale x 2 x bfloat> @llvm.vector.splice.nxv2bf16(<vscale x 2 x bfloat> %a, <vscale x 2 x bfloat> %b, i32 -1)
+  ret <vscale x 2 x bfloat> %res
+}
+
+define <vscale x 2 x bfloat> @splice_nxv2bf16_offset_min(<vscale x 2 x bfloat> %a, <vscale x 2 x bfloat> %b) #0 {
+; CHECK-LABEL: splice_nxv2bf16_offset_min:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    srli a0, a0, 2
+; CHECK-NEXT:    addi a0, a0, -4
+; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
+; CHECK-NEXT:    vslidedown.vx v8, v8, a0
+; CHECK-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
+; CHECK-NEXT:    vslideup.vi v8, v9, 4
+; CHECK-NEXT:    ret
+  %res = call <vscale x 2 x bfloat> @llvm.vector.splice.nxv2bf16(<vscale x 2 x bfloat> %a, <vscale x 2 x bfloat> %b, i32 -4)
+  ret <vscale x 2 x bfloat> %res
+}
+
+define <vscale x 2 x bfloat> @splice_nxv2bf16_offset_max(<vscale x 2 x bfloat> %a, <vscale x 2 x bfloat> %b) #0 {
+; CHECK-LABEL: splice_nxv2bf16_offset_max:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    srli a0, a0, 2
+; CHECK-NEXT:    addi a0, a0, -3
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
+; CHECK-NEXT:    vslidedown.vi v8, v8, 3
+; CHECK-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
+; CHECK-NEXT:    vslideup.vx v8, v9, a0
+; CHECK-NEXT:    ret
+  %res = call <vscale x 2 x bfloat> @llvm.vector.splice.nxv2bf16(<vscale x 2 x bfloat> %a, <vscale x 2 x bfloat> %b, i32 3)
+  ret <vscale x 2 x bfloat> %res
+}
+
+declare <vscale x 4 x bfloat> @llvm.vector.splice.nxv4bf16(<vscale x 4 x bfloat>, <vscale x 4 x bfloat>, i32)
+
+define <vscale x 4 x bfloat> @splice_nxv4bf16_offset_zero(<vscale x 4 x bfloat> %a, <vscale x 4 x bfloat> %b) #0 {
+; CHECK-LABEL: splice_nxv4bf16_offset_zero:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    ret
+  %res = call <vscale x 4 x bfloat> @llvm.vector.splice.nxv4bf16(<vscale x 4 x bfloat> %a, <vscale x 4 x bfloat> %b, i32 0)
+  ret <vscale x 4 x bfloat> %res
+}
+
+define <vscale x 4 x bfloat> @splice_nxv4bf16_offset_negone(<vscale x 4 x bfloat> %a, <vscale x 4 x bfloat> %b) #0 {
+; CHECK-LABEL: splice_nxv4bf16_offset_negone:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    srli a0, a0, 1
+; CHECK-NEXT:    addi a0, a0, -1
+; CHECK-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
+; CHECK-NEXT:    vslidedown.vx v8, v8, a0
+; CHECK-NEXT:    vslideup.vi v8, v9, 1
+; CHECK-NEXT:    ret
+  %res = call <vscale x 4 x bfloat> @llvm.vector.splice.nxv4bf16(<vscale x 4 x bfloat> %a, <vscale x 4 x bfloat> %b, i32 -1)
+  ret <vscale x 4 x bfloat> %res
+}
+
+define <vscale x 4 x bfloat> @splice_nxv4bf16_offset_min(<vscale x 4 x bfloat> %a, <vscale x 4 x bfloat> %b) #0 {
+; CHECK-LABEL: splice_nxv4bf16_offset_min:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    srli a0, a0, 1
+; CHECK-NEXT:    addi a0, a0, -8
+; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-NEXT:    vslidedown.vx v8, v8, a0
+; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-NEXT:    vslideup.vi v8, v9, 8
+; CHECK-NEXT:    ret
+  %res = call <vscale x 4 x bfloat> @llvm.vector.splice.nxv4bf16(<vscale x 4 x bfloat> %a, <vscale x 4 x bfloat> %b, i32 -8)
+  ret <vscale x 4 x bfloat> %res
+}
+
+define <vscale x 4 x bfloat> @splice_nxv4bf16_offset_max(<vscale x 4 x bfloat> %a, <vscale x 4 x bfloat> %b) #0 {
+; CHECK-LABEL: splice_nxv4bf16_offset_max:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    srli a0, a0, 1
+; CHECK-NEXT:    addi a0, a0, -7
+; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
+; CHECK-NEXT:    vslidedown.vi v8, v8, 7
+; CHECK-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
+; CHECK-NEXT:    vslideup.vx v8, v9, a0
+; CHECK-NEXT:    ret
+  %res = call <vscale x 4 x bfloat> @llvm.vector.splice.nxv4bf16(<vscale x 4 x bfloat> %a, <vscale x 4 x bfloat> %b, i32 7)
+  ret <vscale x 4 x bfloat> %res
+}
+
+declare <vscale x 8 x bfloat> @llvm.vector.splice.nxv8bf16(<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, i32)
+
+define <vscale x 8 x bfloat> @splice_nxv8bf16_offset_zero(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b) #0 {
+; CHECK-LABEL: splice_nxv8bf16_offset_zero:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    ret
+  %res = call <vscale x 8 x bfloat> @llvm.vector.splice.nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b, i32 0)
+  ret <vscale x 8 x bfloat> %res
+}
+
+define <vscale x 8 x bfloat> @splice_nxv8bf16_offset_negone(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b) #0 {
+; CHECK-LABEL: splice_nxv8bf16_offset_negone:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    addi a0, a0, -1
+; CHECK-NEXT:    vsetivli zero, 1, e16, m2, ta, ma
+; CHECK-NEXT:    vslidedown.vx v8, v8, a0
+; CHECK-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-NEXT:    vslideup.vi v8, v10, 1
+; CHECK-NEXT:    ret
+  %res = call <vscale x 8 x bfloat> @llvm.vector.splice.nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b, i32 -1)
+  ret <vscale x 8 x bfloat> %res
+}
+
+define <vscale x 8 x bfloat> @splice_nxv8bf16_offset_min(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b) #0 {
+; CHECK-LABEL: splice_nxv8bf16_offset_min:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    addi a0, a0, -16
+; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
+; CHECK-NEXT:    vslidedown.vx v8, v8, a0
+; CHECK-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-NEXT:    vslideup.vi v8, v10, 16
+; CHECK-NEXT:    ret
+  %res = call <vscale x 8 x bfloat> @llvm.vector.splice.nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b, i32 -16)
+  ret <vscale x 8 x bfloat> %res
+}
+
+define <vscale x 8 x bfloat> @splice_nxv8bf16_offset_max(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b) #0 {
+; CHECK-LABEL: splice_nxv8bf16_offset_max:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    addi a0, a0, -15
+; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
+; CHECK-NEXT:    vslidedown.vi v8, v8, 15
+; CHECK-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; CHECK-NEXT:    vslideup.vx v8, v10, a0
+; CHECK-NEXT:    ret
+  %res = call <vscale x 8 x bfloat> @llvm.vector.splice.nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b, i32 15)
+  ret <vscale x 8 x bfloat> %res
+}
+
+declare <vscale x 16 x bfloat> @llvm.vector.splice.nxv16bf16(<vscale x 16 x bfloat>, <vscale x 16 x bfloat>, i32)
+
+define <vscale x 16 x bfloat> @splice_nxv16bf16_offset_zero(<vscale x 16 x bfloat> %a, <vscale x 16 x bfloat> %b) #0 {
+; CHECK-LABEL: splice_nxv16bf16_offset_zero:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    ret
+  %res = call <vscale x 16 x bfloat> @llvm.vector.splice.nxv16bf16(<vscale x 16 x bfloat> %a, <vscale x 16 x bfloat> %b, i32 0)
+  ret <vscale x 16 x bfloat> %res
+}
+
+define <vscale x 16 x bfloat> @splice_nxv16bf16_offset_negone(<vscale x 16 x bfloat> %a, <vscale x 16 x bfloat> %b) #0 {
+; CHECK-LABEL: splice_nxv16bf16_offset_negone:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 1
+; CHECK-NEXT:    addi a0, a0, -1
+; CHECK-NEXT:    vsetivli zero, 1, e16, m4, ta, ma
e16, m4, ta, ma +; CHECK-NEXT: vslidedown.vx v8, v8, a0 +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vslideup.vi v8, v12, 1 +; CHECK-NEXT: ret + %res = call @llvm.vector.splice.nxv16bf16( %a, %b, i32 -1) + ret %res +} + +define @splice_nxv16bf16_offset_min( %a, %b) #0 { +; CHECK-LABEL: splice_nxv16bf16_offset_min: +; CHECK: # %bb.0: +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: addi a0, a0, -32 +; CHECK-NEXT: li a1, 32 +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vslidedown.vx v8, v8, a0 +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vslideup.vx v8, v12, a1 +; CHECK-NEXT: ret + %res = call @llvm.vector.splice.nxv16bf16( %a, %b, i32 -32) + ret %res +} + +define @splice_nxv16bf16_offset_max( %a, %b) #0 { +; CHECK-LABEL: splice_nxv16bf16_offset_max: +; CHECK: # %bb.0: +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: addi a0, a0, -31 +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vslidedown.vi v8, v8, 31 +; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; CHECK-NEXT: vslideup.vx v8, v12, a0 +; CHECK-NEXT: ret + %res = call @llvm.vector.splice.nxv16bf16( %a, %b, i32 31) + ret %res +} + +declare @llvm.vector.splice.nxv32bf16(, , i32) + +define @splice_nxv32bf16_offset_zero( %a, %b) #0 { +; CHECK-LABEL: splice_nxv32bf16_offset_zero: +; CHECK: # %bb.0: +; CHECK-NEXT: ret + %res = call @llvm.vector.splice.nxv32bf16( %a, %b, i32 0) + ret %res +} + +define @splice_nxv32bf16_offset_negone( %a, %b) #0 { +; CHECK-LABEL: splice_nxv32bf16_offset_negone: +; CHECK: # %bb.0: +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 2 +; CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: vsetivli zero, 1, e16, m8, ta, ma +; CHECK-NEXT: vslidedown.vx v8, v8, a0 +; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, ma +; CHECK-NEXT: vslideup.vi v8, v16, 1 +; CHECK-NEXT: ret + %res = call @llvm.vector.splice.nxv32bf16( %a, %b, i32 -1) + ret %res +} + +define @splice_nxv32bf16_offset_min( %a, %b) #0 { +; CHECK-LABEL: splice_nxv32bf16_offset_min: +; CHECK: # %bb.0: +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 2 +; CHECK-NEXT: addi a0, a0, -64 +; CHECK-NEXT: li a1, 64 +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vslidedown.vx v8, v8, a0 +; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, ma +; CHECK-NEXT: vslideup.vx v8, v16, a1 +; CHECK-NEXT: ret + %res = call @llvm.vector.splice.nxv32bf16( %a, %b, i32 -64) + ret %res +} + +define @splice_nxv32bf16_offset_max( %a, %b) #0 { +; CHECK-LABEL: splice_nxv32bf16_offset_max: +; CHECK: # %bb.0: +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 2 +; CHECK-NEXT: addi a0, a0, -63 +; CHECK-NEXT: li a1, 63 +; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma +; CHECK-NEXT: vslidedown.vx v8, v8, a1 +; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, ma +; CHECK-NEXT: vslideup.vx v8, v16, a0 +; CHECK-NEXT: ret + %res = call @llvm.vector.splice.nxv32bf16( %a, %b, i32 63) + ret %res +} + declare @llvm.vector.splice.nxv1f16(, , i32) define @splice_nxv1f16_offset_zero( %a, %b) #0 { diff --git a/llvm/test/CodeGen/RISCV/rvv/vl-opt-instrs.ll b/llvm/test/CodeGen/RISCV/rvv/vl-opt-instrs.ll index a360ae1998f77a2680f3cf3916fe51b59d39bf20..11f603b56b6e56a86855010edcb3fea589efc5e3 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vl-opt-instrs.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vl-opt-instrs.ll @@ -1122,6 +1122,132 @@ define @vrem_vx( %a, i32 %b, iXLen %vl) { ret %2 } +define @vwmul_vv( %a, %b, iXLen %vl) { +; NOVLOPT-LABEL: vwmul_vv: +; NOVLOPT: # %bb.0: +; 
NOVLOPT-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; NOVLOPT-NEXT: vwmul.vv v12, v8, v9 +; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; NOVLOPT-NEXT: vwmul.vv v8, v12, v12 +; NOVLOPT-NEXT: ret +; +; VLOPT-LABEL: vwmul_vv: +; VLOPT: # %bb.0: +; VLOPT-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; VLOPT-NEXT: vwmul.vv v12, v8, v9 +; VLOPT-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; VLOPT-NEXT: vwmul.vv v8, v12, v12 +; VLOPT-NEXT: ret + %1 = call @llvm.riscv.vwmul.nxv4i64.nxv4i32.nxv4i32( poison, %a, %b, iXLen -1) + %2 = call @llvm.riscv.vwmul.nxv4i64.nxv4i32.nxv4i32( poison, %1, %1, iXLen %vl) + ret %2 +} + +define @vwmul_vx( %a, i16 %b, i32 %c, iXLen %vl) { +; NOVLOPT-LABEL: vwmul_vx: +; NOVLOPT: # %bb.0: +; NOVLOPT-NEXT: vsetvli a3, zero, e16, m1, ta, ma +; NOVLOPT-NEXT: vwmul.vx v12, v8, a0 +; NOVLOPT-NEXT: vsetvli zero, a2, e32, m2, ta, ma +; NOVLOPT-NEXT: vwmul.vx v8, v12, a1 +; NOVLOPT-NEXT: ret +; +; VLOPT-LABEL: vwmul_vx: +; VLOPT: # %bb.0: +; VLOPT-NEXT: vsetvli zero, a2, e16, m1, ta, ma +; VLOPT-NEXT: vwmul.vx v12, v8, a0 +; VLOPT-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; VLOPT-NEXT: vwmul.vx v8, v12, a1 +; VLOPT-NEXT: ret + %1 = call @llvm.riscv.vwmul.nxv4i32.nxv4i16.i16( poison, %a, i16 %b, iXLen -1) + %2 = call @llvm.riscv.vwmul.nxv4i64.nxv4i64.i32( poison, %1, i32 %c, iXLen %vl) + ret %2 +} + +define @vwmulsu_vv( %a, %b, iXLen %vl) { +; NOVLOPT-LABEL: vwmulsu_vv: +; NOVLOPT: # %bb.0: +; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma +; NOVLOPT-NEXT: vwmulsu.vv v12, v8, v10 +; NOVLOPT-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; NOVLOPT-NEXT: vadd.vv v8, v12, v12 +; NOVLOPT-NEXT: ret +; +; VLOPT-LABEL: vwmulsu_vv: +; VLOPT: # %bb.0: +; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; VLOPT-NEXT: vwmulsu.vv v12, v8, v10 +; VLOPT-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; VLOPT-NEXT: vadd.vv v8, v12, v12 +; VLOPT-NEXT: ret + %1 = call @llvm.riscv.vwmulsu.nxv4i64.nxv4i32.nxv4i32( poison, %a, %b, iXLen -1) + %2 = call @llvm.riscv.vadd.nxv4i64.nxv4i64( poison, %1, %1, iXLen %vl) + ret %2 +} + +define @vwmulsu_vx( %a, i32 %b, iXLen %vl) { +; NOVLOPT-LABEL: vwmulsu_vx: +; NOVLOPT: # %bb.0: +; NOVLOPT-NEXT: vsetvli a2, zero, e32, m2, ta, ma +; NOVLOPT-NEXT: vwmulsu.vx v12, v8, a0 +; NOVLOPT-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; NOVLOPT-NEXT: vadd.vv v8, v12, v12 +; NOVLOPT-NEXT: ret +; +; VLOPT-LABEL: vwmulsu_vx: +; VLOPT: # %bb.0: +; VLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; VLOPT-NEXT: vwmulsu.vx v12, v8, a0 +; VLOPT-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; VLOPT-NEXT: vadd.vv v8, v12, v12 +; VLOPT-NEXT: ret + %1 = call @llvm.riscv.vwmulsu.nxv4i64.nxv4i32.i32( poison, %a, i32 %b, iXLen -1) + %2 = call @llvm.riscv.vadd.nxv4i64.nxv4i64( poison, %1, %1, iXLen %vl) + ret %2 +} + +define @vwmulu_vv( %a, %b, iXLen %vl) { +; NOVLOPT-LABEL: vwmulu_vv: +; NOVLOPT: # %bb.0: +; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma +; NOVLOPT-NEXT: vwmulu.vv v12, v8, v10 +; NOVLOPT-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; NOVLOPT-NEXT: vadd.vv v8, v12, v12 +; NOVLOPT-NEXT: ret +; +; VLOPT-LABEL: vwmulu_vv: +; VLOPT: # %bb.0: +; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; VLOPT-NEXT: vwmulu.vv v12, v8, v10 +; VLOPT-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; VLOPT-NEXT: vadd.vv v8, v12, v12 +; VLOPT-NEXT: ret + %1 = call @llvm.riscv.vwmulu.nxv4i64.nxv4i32.nxv4i32( poison, %a, %b, iXLen -1) + %2 = call @llvm.riscv.vadd.nxv4i64.nxv4i64( poison, %1, %1, iXLen %vl) + ret %2 +} + +define @vwmulu_vx( %a, i32 %b, iXLen %vl) { +; NOVLOPT-LABEL: vwmulu_vx: +; NOVLOPT: # %bb.0: +; 
+; NOVLOPT-NEXT: vsetvli a2, zero, e32, m2, ta, ma +; NOVLOPT-NEXT: vwmulu.vx v12, v8, a0 +; NOVLOPT-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; NOVLOPT-NEXT: vadd.vv v8, v12, v12 +; NOVLOPT-NEXT: ret +; +; VLOPT-LABEL: vwmulu_vx: +; VLOPT: # %bb.0: +; VLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; VLOPT-NEXT: vwmulu.vx v12, v8, a0 +; VLOPT-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; VLOPT-NEXT: vadd.vv v8, v12, v12 +; VLOPT-NEXT: ret + %1 = call <vscale x 4 x i64> @llvm.riscv.vwmulu.nxv4i64.nxv4i32.i32(<vscale x 4 x i64> poison, <vscale x 4 x i32> %a, i32 %b, iXLen -1) + %2 = call <vscale x 4 x i64> @llvm.riscv.vadd.nxv4i64.nxv4i64(<vscale x 4 x i64> poison, <vscale x 4 x i64> %1, <vscale x 4 x i64> %1, iXLen %vl) + ret <vscale x 4 x i64> %2 +} + define @vwmacc_vx( %a, i16 %b, iXLen %vl) { ; NOVLOPT-LABEL: vwmacc_vx: ; NOVLOPT: # %bb.0:
diff --git a/llvm/test/CodeGen/RISCV/rvv/vl-opt.ll b/llvm/test/CodeGen/RISCV/rvv/vl-opt.ll index 0b3e67ec8955664a5e5fa244ca730e5157b45747..1a1472fcfc66f5826e9483320da650d901378db3 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vl-opt.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vl-opt.ll @@ -11,19 +11,46 @@ declare <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, iXLen) define <vscale x 4 x i32> @different_imm_vl_with_ta(<vscale x 4 x i32> %passthru, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b, iXLen %vl1, iXLen %vl2) { -; CHECK-LABEL: different_imm_vl_with_ta: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 5, e32, m2, ta, ma -; CHECK-NEXT: vadd.vv v8, v10, v12 -; CHECK-NEXT: vsetivli zero, 4, e32, m2, ta, ma -; CHECK-NEXT: vadd.vv v8, v8, v10 -; CHECK-NEXT: ret +; NOVLOPT-LABEL: different_imm_vl_with_ta: +; NOVLOPT: # %bb.0: +; NOVLOPT-NEXT: vsetivli zero, 5, e32, m2, ta, ma +; NOVLOPT-NEXT: vadd.vv v8, v10, v12 +; NOVLOPT-NEXT: vsetivli zero, 4, e32, m2, ta, ma +; NOVLOPT-NEXT: vadd.vv v8, v8, v10 +; NOVLOPT-NEXT: ret +; +; VLOPT-LABEL: different_imm_vl_with_ta: +; VLOPT: # %bb.0: +; VLOPT-NEXT: vsetivli zero, 4, e32, m2, ta, ma +; VLOPT-NEXT: vadd.vv v8, v10, v12 +; VLOPT-NEXT: vadd.vv v8, v8, v10 +; VLOPT-NEXT: ret %v = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b, iXLen 5) %w = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %v, <vscale x 4 x i32> %a, iXLen 4) ret <vscale x 4 x i32> %w } -; No benificial to propagate VL since VL is larger in the use side. +define <vscale x 4 x i32> @vlmax_and_imm_vl_with_ta(<vscale x 4 x i32> %passthru, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b, iXLen %vl1, iXLen %vl2) { +; NOVLOPT-LABEL: vlmax_and_imm_vl_with_ta: +; NOVLOPT: # %bb.0: +; NOVLOPT-NEXT: vsetvli a0, zero, e32, m2, ta, ma +; NOVLOPT-NEXT: vadd.vv v8, v10, v12 +; NOVLOPT-NEXT: vsetivli zero, 4, e32, m2, ta, ma +; NOVLOPT-NEXT: vadd.vv v8, v8, v10 +; NOVLOPT-NEXT: ret +; +; VLOPT-LABEL: vlmax_and_imm_vl_with_ta: +; VLOPT: # %bb.0: +; VLOPT-NEXT: vsetivli zero, 4, e32, m2, ta, ma +; VLOPT-NEXT: vadd.vv v8, v10, v12 +; VLOPT-NEXT: vadd.vv v8, v8, v10 +; VLOPT-NEXT: ret + %v = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b, iXLen -1) + %w = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %v, <vscale x 4 x i32> %a, iXLen 4) + ret <vscale x 4 x i32> %w +} + +; Not beneficial to propagate VL since VL is larger in the use side. define <vscale x 4 x i32> @different_imm_vl_with_ta_larger_vl(<vscale x 4 x i32> %passthru, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b, iXLen %vl1, iXLen %vl2) { ; CHECK-LABEL: different_imm_vl_with_ta_larger_vl: ; CHECK: # %bb.0: @@ -50,8 +77,7 @@ define <vscale x 4 x i32> @different_imm_reg_vl_with_ta(<vscale x 4 x i32> %pass ret <vscale x 4 x i32> %w } - -; No benificial to propagate VL since VL is already one. +; Not beneficial to propagate VL since VL is already one. define <vscale x 4 x i32> @different_imm_vl_with_ta_1(<vscale x 4 x i32> %passthru, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b, iXLen %vl1, iXLen %vl2) { ; CHECK-LABEL: different_imm_vl_with_ta_1: ; CHECK: # %bb.0: @@ -110,7 +136,3 @@ define <vscale x 4 x i32> @different_imm_vl_with_tu(<vscale x 4 x i32> %passthru %w = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> %passthru, <vscale x 4 x i32> %v, <vscale x 4 x i32> %a,iXLen 4) ret <vscale x 4 x i32> %w } - -;; NOTE: These prefixes are unused and the list is autogenerated.
Do not add tests below this line: -; NOVLOPT: {{.*}} -; VLOPT: {{.*}} diff --git a/llvm/test/CodeGen/SPARC/naked-fn-with-frame-pointer.ll b/llvm/test/CodeGen/SPARC/naked-fn-with-frame-pointer.ll new file mode 100644 index 0000000000000000000000000000000000000000..af97c573625b52992d49afffa8d3fb6683665f64 --- /dev/null +++ b/llvm/test/CodeGen/SPARC/naked-fn-with-frame-pointer.ll @@ -0,0 +1,45 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple sparc | FileCheck %s -check-prefixes=CHECK-32 +; RUN: llc < %s -mtriple sparc64 | FileCheck %s -check-prefixes=CHECK-64 + +declare dso_local void @main() + +define dso_local void @naked() naked "frame-pointer"="all" { +; CHECK-32-LABEL: naked: +; CHECK-32: .cfi_startproc +; CHECK-32-NEXT: ! %bb.0: +; CHECK-32-NEXT: call main +; CHECK-32-NEXT: nop +; +; CHECK-64-LABEL: naked: +; CHECK-64: .cfi_startproc +; CHECK-64-NEXT: ! %bb.0: +; CHECK-64-NEXT: call main +; CHECK-64-NEXT: nop + call void @main() + unreachable +} + +define dso_local void @normal() "frame-pointer"="all" { +; CHECK-32-LABEL: normal: +; CHECK-32: .cfi_startproc +; CHECK-32-NEXT: ! %bb.0: +; CHECK-32-NEXT: save %sp, -96, %sp +; CHECK-32-NEXT: .cfi_def_cfa_register %fp +; CHECK-32-NEXT: .cfi_window_save +; CHECK-32-NEXT: .cfi_register %o7, %i7 +; CHECK-32-NEXT: call main +; CHECK-32-NEXT: nop +; +; CHECK-64-LABEL: normal: +; CHECK-64: .cfi_startproc +; CHECK-64-NEXT: ! %bb.0: +; CHECK-64-NEXT: save %sp, -176, %sp +; CHECK-64-NEXT: .cfi_def_cfa_register %fp +; CHECK-64-NEXT: .cfi_window_save +; CHECK-64-NEXT: .cfi_register %o7, %i7 +; CHECK-64-NEXT: call main +; CHECK-64-NEXT: nop + call void @main() + unreachable +} diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/WaveReadLaneAt.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/WaveReadLaneAt.ll new file mode 100644 index 0000000000000000000000000000000000000000..8ba17df30c36857d3b9c89e2deae0dd05624ba8a --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/WaveReadLaneAt.ll @@ -0,0 +1,56 @@ +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32v1.3-vulkan-unknown %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32v1.3-vulkan-unknown %s -o - -filetype=obj | spirv-val %} + +; Test lowering to spir-v backend for various types and scalar/vector + +; CHECK-DAG: %[[#uint:]] = OpTypeInt 32 0 +; CHECK-DAG: %[[#f32:]] = OpTypeFloat 32 +; CHECK-DAG: %[[#v4_float:]] = OpTypeVector %[[#f32]] 4 +; CHECK-DAG: %[[#bool:]] = OpTypeBool +; CHECK-DAG: %[[#v4_bool:]] = OpTypeVector %[[#bool]] 4 +; CHECK-DAG: %[[#scope:]] = OpConstant %[[#uint]] 3 + +; CHECK-LABEL: Begin function test_float +; CHECK: %[[#fexpr:]] = OpFunctionParameter %[[#f32]] +; CHECK: %[[#idx1:]] = OpFunctionParameter %[[#uint]] +define float @test_float(float %fexpr, i32 %idx) { +entry: +; CHECK: %[[#fret:]] = OpGroupNonUniformShuffle %[[#f32]] %[[#scope]] %[[#fexpr]] %[[#idx1]] + %0 = call float @llvm.spv.wave.readlane.f32(float %fexpr, i32 %idx) + ret float %0 +} + +; CHECK-LABEL: Begin function test_int +; CHECK: %[[#iexpr:]] = OpFunctionParameter %[[#uint]] +; CHECK: %[[#idx2:]] = OpFunctionParameter %[[#uint]] +define i32 @test_int(i32 %iexpr, i32 %idx) { +entry: +; CHECK: %[[#iret:]] = OpGroupNonUniformShuffle %[[#uint]] %[[#scope]] %[[#iexpr]] %[[#idx2]] + %0 = call i32 @llvm.spv.wave.readlane.i32(i32 %iexpr, i32 %idx) + ret i32 %0 +} + +; CHECK-LABEL: Begin function test_vbool +; CHECK: %[[#vbexpr:]] = OpFunctionParameter %[[#v4_bool]] +; CHECK: %[[#idx3:]] = 
OpFunctionParameter %[[#uint]] +define <4 x i1> @test_vbool(<4 x i1> %vbexpr, i32 %idx) { +entry: +; CHECK: %[[#vbret:]] = OpGroupNonUniformShuffle %[[#v4_bool]] %[[#scope]] %[[#vbexpr]] %[[#idx3]] + %0 = call <4 x i1> @llvm.spv.wave.readlane.v4i1(<4 x i1> %vbexpr, i32 %idx) + ret <4 x i1> %0 +} + +; CHECK-LABEL: Begin function test_vfloat +; CHECK: %[[#vfexpr:]] = OpFunctionParameter %[[#v4_float]] +; CHECK: %[[#idx4:]] = OpFunctionParameter %[[#uint]] +define <4 x float> @test_vfloat(<4 x float> %vfexpr, i32 %idx) { +entry: +; CHECK: %[[#vbret:]] = OpGroupNonUniformShuffle %[[#v4_float]] %[[#scope]] %[[#vfexpr]] %[[#idx4]] + %0 = call <4 x float> @llvm.spv.wave.readlane.v4f32(<4 x float> %vfexpr, i32 %idx) + ret <4 x float> %0 +} + +declare float @llvm.spv.wave.readlane.f32(float, i32) +declare i32 @llvm.spv.wave.readlane.i32(i32, i32) +declare <4 x i1> @llvm.spv.wave.readlane.v4i1(<4 x i1>, i32) +declare <4 x float> @llvm.spv.wave.readlane.v4f32(<4 x float>, i32) diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/atan2.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/atan2.ll index bdbfc133efa29b07c2ae8453a89dc6549ca4c50f..a0306bae4a22dee2f28dba4addc95df5f09e328b 100644 --- a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/atan2.ll +++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/atan2.ll @@ -1,49 +1,49 @@ -; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s -; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-unknown %s -o - -filetype=obj | spirv-val %} - -; CHECK-DAG: %[[#op_ext_glsl:]] = OpExtInstImport "GLSL.std.450" -; CHECK-DAG: %[[#float_32:]] = OpTypeFloat 32 -; CHECK-DAG: %[[#float_16:]] = OpTypeFloat 16 -; CHECK-DAG: %[[#vec4_float_32:]] = OpTypeVector %[[#float_32]] 4 -; CHECK-DAG: %[[#vec4_float_16:]] = OpTypeVector %[[#float_16]] 4 - -define noundef float @atan2_float(float noundef %a, float noundef %b) { -entry: -; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#]] -; CHECK: %[[#arg1:]] = OpFunctionParameter %[[#]] -; CHECK: %[[#]] = OpExtInst %[[#float_32]] %[[#op_ext_glsl]] Atan2 %[[#arg0]] %[[#arg1]] - %elt.atan2 = call float @llvm.atan2.f32(float %a, float %b) - ret float %elt.atan2 -} - -define noundef half @atan2_half(half noundef %a, half noundef %b) { -entry: -; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#]] -; CHECK: %[[#arg1:]] = OpFunctionParameter %[[#]] -; CHECK: %[[#]] = OpExtInst %[[#float_16]] %[[#op_ext_glsl]] Atan2 %[[#arg0]] %[[#arg1]] - %elt.atan2 = call half @llvm.atan2.f16(half %a, half %b) - ret half %elt.atan2 -} - -define noundef <4 x float> @atan2_float4(<4 x float> noundef %a, <4 x float> noundef %b) { -entry: - ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#]] - ; CHECK: %[[#arg1:]] = OpFunctionParameter %[[#]] - ; CHECK: %[[#]] = OpExtInst %[[#vec4_float_32]] %[[#op_ext_glsl]] Atan2 %[[#arg0]] %[[#arg1]] - %elt.atan2 = call <4 x float> @llvm.atan2.v4f32(<4 x float> %a, <4 x float> %b) - ret <4 x float> %elt.atan2 -} - -define noundef <4 x half> @atan2_half4(<4 x half> noundef %a, <4 x half> noundef %b) { -entry: - ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#]] - ; CHECK: %[[#arg1:]] = OpFunctionParameter %[[#]] - ; CHECK: %[[#]] = OpExtInst %[[#vec4_float_16]] %[[#op_ext_glsl]] Atan2 %[[#arg0]] %[[#arg1]] - %elt.atan2 = call <4 x half> @llvm.atan2.v4f16(<4 x half> %a, <4 x half> %b) - ret <4 x half> %elt.atan2 -} - -declare half @llvm.atan2.f16(half, half) -declare float @llvm.atan2.f32(float, float) -declare <4 x half> @llvm.atan2.v4f16(<4 x half>, <4 x half>) -declare <4 x float> @llvm.atan2.v4f32(<4 x float>, 
<4 x float>) +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-unknown %s -o - -filetype=obj | spirv-val %} + +; CHECK-DAG: %[[#op_ext_glsl:]] = OpExtInstImport "GLSL.std.450" +; CHECK-DAG: %[[#float_32:]] = OpTypeFloat 32 +; CHECK-DAG: %[[#float_16:]] = OpTypeFloat 16 +; CHECK-DAG: %[[#vec4_float_32:]] = OpTypeVector %[[#float_32]] 4 +; CHECK-DAG: %[[#vec4_float_16:]] = OpTypeVector %[[#float_16]] 4 + +define noundef float @atan2_float(float noundef %a, float noundef %b) { +entry: +; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#]] +; CHECK: %[[#arg1:]] = OpFunctionParameter %[[#]] +; CHECK: %[[#]] = OpExtInst %[[#float_32]] %[[#op_ext_glsl]] Atan2 %[[#arg0]] %[[#arg1]] + %elt.atan2 = call float @llvm.atan2.f32(float %a, float %b) + ret float %elt.atan2 +} + +define noundef half @atan2_half(half noundef %a, half noundef %b) { +entry: +; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#]] +; CHECK: %[[#arg1:]] = OpFunctionParameter %[[#]] +; CHECK: %[[#]] = OpExtInst %[[#float_16]] %[[#op_ext_glsl]] Atan2 %[[#arg0]] %[[#arg1]] + %elt.atan2 = call half @llvm.atan2.f16(half %a, half %b) + ret half %elt.atan2 +} + +define noundef <4 x float> @atan2_float4(<4 x float> noundef %a, <4 x float> noundef %b) { +entry: + ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#]] + ; CHECK: %[[#arg1:]] = OpFunctionParameter %[[#]] + ; CHECK: %[[#]] = OpExtInst %[[#vec4_float_32]] %[[#op_ext_glsl]] Atan2 %[[#arg0]] %[[#arg1]] + %elt.atan2 = call <4 x float> @llvm.atan2.v4f32(<4 x float> %a, <4 x float> %b) + ret <4 x float> %elt.atan2 +} + +define noundef <4 x half> @atan2_half4(<4 x half> noundef %a, <4 x half> noundef %b) { +entry: + ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#]] + ; CHECK: %[[#arg1:]] = OpFunctionParameter %[[#]] + ; CHECK: %[[#]] = OpExtInst %[[#vec4_float_16]] %[[#op_ext_glsl]] Atan2 %[[#arg0]] %[[#arg1]] + %elt.atan2 = call <4 x half> @llvm.atan2.v4f16(<4 x half> %a, <4 x half> %b) + ret <4 x half> %elt.atan2 +} + +declare half @llvm.atan2.f16(half, half) +declare float @llvm.atan2.f32(float, float) +declare <4 x half> @llvm.atan2.v4f16(<4 x half>, <4 x half>) +declare <4 x float> @llvm.atan2.v4f32(<4 x float>, <4 x float>) diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/cross.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/cross.ll index 2e0eb8c429ac27049e517e2afcdf8c2519ecbcfc..7c06c14bb968d1d62e0efbf6dae59ced107a9ec7 100644 --- a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/cross.ll +++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/cross.ll @@ -1,33 +1,33 @@ -; RUN: llc -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s -; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-unknown %s -o - -filetype=obj | spirv-val %} - -; Make sure SPIRV operation function calls for cross are lowered correctly. 
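; (Aside for readers: GLSL.std.450 Cross is the standard 3-D cross product,
; cross(a,b) = (a.y*b.z - a.z*b.y, a.z*b.x - a.x*b.z, a.x*b.y - a.y*b.x).
; A minimal plain-IR sketch of the same computation, with the hypothetical
; helper name @cross_ref and not part of this patch, would be:)
define <3 x float> @cross_ref(<3 x float> %a, <3 x float> %b) {
  %a.yzx = shufflevector <3 x float> %a, <3 x float> poison, <3 x i32> <i32 1, i32 2, i32 0>
  %b.yzx = shufflevector <3 x float> %b, <3 x float> poison, <3 x i32> <i32 1, i32 2, i32 0>
  %lhs = fmul <3 x float> %a, %b.yzx          ; (ax*by, ay*bz, az*bx)
  %rhs = fmul <3 x float> %a.yzx, %b          ; (ay*bx, az*by, ax*bz)
  %sub = fsub <3 x float> %lhs, %rhs          ; (cz, cx, cy)
  %res = shufflevector <3 x float> %sub, <3 x float> poison, <3 x i32> <i32 1, i32 2, i32 0>
  ret <3 x float> %res                        ; (cx, cy, cz)
}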
- -; CHECK-DAG: %[[#op_ext_glsl:]] = OpExtInstImport "GLSL.std.450" -; CHECK-DAG: %[[#float_32:]] = OpTypeFloat 32 -; CHECK-DAG: %[[#float_16:]] = OpTypeFloat 16 -; CHECK-DAG: %[[#vec3_float_16:]] = OpTypeVector %[[#float_16]] 3 -; CHECK-DAG: %[[#vec3_float_32:]] = OpTypeVector %[[#float_32]] 3 - -define noundef <3 x half> @cross_half4(<3 x half> noundef %a, <3 x half> noundef %b) { -entry: - ; CHECK: %[[#]] = OpFunction %[[#vec3_float_16]] None %[[#]] - ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#vec3_float_16]] - ; CHECK: %[[#arg1:]] = OpFunctionParameter %[[#vec3_float_16]] - ; CHECK: %[[#]] = OpExtInst %[[#vec3_float_16]] %[[#op_ext_glsl]] Cross %[[#arg0]] %[[#arg1]] - %hlsl.cross = call <3 x half> @llvm.spv.cross.v4f16(<3 x half> %a, <3 x half> %b) - ret <3 x half> %hlsl.cross -} - -define noundef <3 x float> @cross_float4(<3 x float> noundef %a, <3 x float> noundef %b) { -entry: - ; CHECK: %[[#]] = OpFunction %[[#vec3_float_32]] None %[[#]] - ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#vec3_float_32]] - ; CHECK: %[[#arg1:]] = OpFunctionParameter %[[#vec3_float_32]] - ; CHECK: %[[#]] = OpExtInst %[[#vec3_float_32]] %[[#op_ext_glsl]] Cross %[[#arg0]] %[[#arg1]] - %hlsl.cross = call <3 x float> @llvm.spv.cross.v4f32(<3 x float> %a, <3 x float> %b) - ret <3 x float> %hlsl.cross -} - -declare <3 x half> @llvm.spv.cross.v4f16(<3 x half>, <3 x half>) -declare <3 x float> @llvm.spv.cross.v4f32(<3 x float>, <3 x float>) +; RUN: llc -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-unknown %s -o - -filetype=obj | spirv-val %} + +; Make sure SPIRV operation function calls for cross are lowered correctly. + +; CHECK-DAG: %[[#op_ext_glsl:]] = OpExtInstImport "GLSL.std.450" +; CHECK-DAG: %[[#float_32:]] = OpTypeFloat 32 +; CHECK-DAG: %[[#float_16:]] = OpTypeFloat 16 +; CHECK-DAG: %[[#vec3_float_16:]] = OpTypeVector %[[#float_16]] 3 +; CHECK-DAG: %[[#vec3_float_32:]] = OpTypeVector %[[#float_32]] 3 + +define noundef <3 x half> @cross_half4(<3 x half> noundef %a, <3 x half> noundef %b) { +entry: + ; CHECK: %[[#]] = OpFunction %[[#vec3_float_16]] None %[[#]] + ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#vec3_float_16]] + ; CHECK: %[[#arg1:]] = OpFunctionParameter %[[#vec3_float_16]] + ; CHECK: %[[#]] = OpExtInst %[[#vec3_float_16]] %[[#op_ext_glsl]] Cross %[[#arg0]] %[[#arg1]] + %hlsl.cross = call <3 x half> @llvm.spv.cross.v4f16(<3 x half> %a, <3 x half> %b) + ret <3 x half> %hlsl.cross +} + +define noundef <3 x float> @cross_float4(<3 x float> noundef %a, <3 x float> noundef %b) { +entry: + ; CHECK: %[[#]] = OpFunction %[[#vec3_float_32]] None %[[#]] + ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#vec3_float_32]] + ; CHECK: %[[#arg1:]] = OpFunctionParameter %[[#vec3_float_32]] + ; CHECK: %[[#]] = OpExtInst %[[#vec3_float_32]] %[[#op_ext_glsl]] Cross %[[#arg0]] %[[#arg1]] + %hlsl.cross = call <3 x float> @llvm.spv.cross.v4f32(<3 x float> %a, <3 x float> %b) + ret <3 x float> %hlsl.cross +} + +declare <3 x half> @llvm.spv.cross.v4f16(<3 x half>, <3 x half>) +declare <3 x float> @llvm.spv.cross.v4f32(<3 x float>, <3 x float>) diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/length.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/length.ll index b4a9d8e0664b7e80fdd6b46f149ad6909f987dda..df1ef3a7287c3bd7880f3728926c64c4ee0250f7 100644 --- a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/length.ll +++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/length.ll @@ -1,29 +1,29 @@ -; RUN: llc -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck 
%s -; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-unknown %s -o - -filetype=obj | spirv-val %} - -; Make sure SPIRV operation function calls for length are lowered correctly. - -; CHECK-DAG: %[[#op_ext_glsl:]] = OpExtInstImport "GLSL.std.450" -; CHECK-DAG: %[[#float_32:]] = OpTypeFloat 32 -; CHECK-DAG: %[[#float_16:]] = OpTypeFloat 16 -; CHECK-DAG: %[[#vec4_float_16:]] = OpTypeVector %[[#float_16]] 4 -; CHECK-DAG: %[[#vec4_float_32:]] = OpTypeVector %[[#float_32]] 4 - -define noundef half @length_half4(<4 x half> noundef %a) { -entry: - ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#]] - ; CHECK: %[[#]] = OpExtInst %[[#float_16]] %[[#op_ext_glsl]] Length %[[#arg0]] - %hlsl.length = call half @llvm.spv.length.v4f16(<4 x half> %a) - ret half %hlsl.length -} - -define noundef float @length_float4(<4 x float> noundef %a) { -entry: - ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#]] - ; CHECK: %[[#]] = OpExtInst %[[#float_32]] %[[#op_ext_glsl]] Length %[[#arg0]] - %hlsl.length = call float @llvm.spv.length.v4f32(<4 x float> %a) - ret float %hlsl.length -} - -declare half @llvm.spv.length.v4f16(<4 x half>) -declare float @llvm.spv.length.v4f32(<4 x float>) +; RUN: llc -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-unknown %s -o - -filetype=obj | spirv-val %} + +; Make sure SPIRV operation function calls for length are lowered correctly. + +; CHECK-DAG: %[[#op_ext_glsl:]] = OpExtInstImport "GLSL.std.450" +; CHECK-DAG: %[[#float_32:]] = OpTypeFloat 32 +; CHECK-DAG: %[[#float_16:]] = OpTypeFloat 16 +; CHECK-DAG: %[[#vec4_float_16:]] = OpTypeVector %[[#float_16]] 4 +; CHECK-DAG: %[[#vec4_float_32:]] = OpTypeVector %[[#float_32]] 4 + +define noundef half @length_half4(<4 x half> noundef %a) { +entry: + ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#]] + ; CHECK: %[[#]] = OpExtInst %[[#float_16]] %[[#op_ext_glsl]] Length %[[#arg0]] + %hlsl.length = call half @llvm.spv.length.v4f16(<4 x half> %a) + ret half %hlsl.length +} + +define noundef float @length_float4(<4 x float> noundef %a) { +entry: + ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#]] + ; CHECK: %[[#]] = OpExtInst %[[#float_32]] %[[#op_ext_glsl]] Length %[[#arg0]] + %hlsl.length = call float @llvm.spv.length.v4f32(<4 x float> %a) + ret float %hlsl.length +} + +declare half @llvm.spv.length.v4f16(<4 x half>) +declare float @llvm.spv.length.v4f32(<4 x float>) diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/normalize.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/normalize.ll index fa73b9c2a4d3abfaca82f0aab595c513d0ea1020..4659b5146e43270b126187f90f3294aba52ed6df 100644 --- a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/normalize.ll +++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/normalize.ll @@ -1,31 +1,31 @@ -; RUN: llc -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s -; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-unknown %s -o - -filetype=obj | spirv-val %} - -; Make sure SPIRV operation function calls for normalize are lowered correctly. 
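; (Aside: GLSL.std.450 Normalize is v / length(v), with length(v) = sqrt(dot(v, v)).
; An illustrative plain-IR expansion of the 4-wide float case, with the
; hypothetical helper name @normalize_ref and not part of this patch, would be:)
define <4 x float> @normalize_ref(<4 x float> %v) {
  %sq  = fmul <4 x float> %v, %v                                                    ; elementwise v*v
  %dot = call float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %sq)
  %len = call float @llvm.sqrt.f32(float %dot)
  %ins = insertelement <4 x float> poison, float %len, i64 0
  %lv  = shufflevector <4 x float> %ins, <4 x float> poison, <4 x i32> zeroinitializer  ; splat the length
  %res = fdiv <4 x float> %v, %lv
  ret <4 x float> %res
}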
- -; CHECK-DAG: %[[#op_ext_glsl:]] = OpExtInstImport "GLSL.std.450" -; CHECK-DAG: %[[#float_32:]] = OpTypeFloat 32 -; CHECK-DAG: %[[#float_16:]] = OpTypeFloat 16 -; CHECK-DAG: %[[#vec4_float_16:]] = OpTypeVector %[[#float_16]] 4 -; CHECK-DAG: %[[#vec4_float_32:]] = OpTypeVector %[[#float_32]] 4 - -define noundef <4 x half> @normalize_half4(<4 x half> noundef %a) { -entry: - ; CHECK: %[[#]] = OpFunction %[[#vec4_float_16]] None %[[#]] - ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#vec4_float_16]] - ; CHECK: %[[#]] = OpExtInst %[[#vec4_float_16]] %[[#op_ext_glsl]] Normalize %[[#arg0]] - %hlsl.normalize = call <4 x half> @llvm.spv.normalize.v4f16(<4 x half> %a) - ret <4 x half> %hlsl.normalize -} - -define noundef <4 x float> @normalize_float4(<4 x float> noundef %a) { -entry: - ; CHECK: %[[#]] = OpFunction %[[#vec4_float_32]] None %[[#]] - ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#vec4_float_32]] - ; CHECK: %[[#]] = OpExtInst %[[#vec4_float_32]] %[[#op_ext_glsl]] Normalize %[[#arg0]] - %hlsl.normalize = call <4 x float> @llvm.spv.normalize.v4f32(<4 x float> %a) - ret <4 x float> %hlsl.normalize -} - -declare <4 x half> @llvm.spv.normalize.v4f16(<4 x half>) -declare <4 x float> @llvm.spv.normalize.v4f32(<4 x float>) +; RUN: llc -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-unknown %s -o - -filetype=obj | spirv-val %} + +; Make sure SPIRV operation function calls for normalize are lowered correctly. + +; CHECK-DAG: %[[#op_ext_glsl:]] = OpExtInstImport "GLSL.std.450" +; CHECK-DAG: %[[#float_32:]] = OpTypeFloat 32 +; CHECK-DAG: %[[#float_16:]] = OpTypeFloat 16 +; CHECK-DAG: %[[#vec4_float_16:]] = OpTypeVector %[[#float_16]] 4 +; CHECK-DAG: %[[#vec4_float_32:]] = OpTypeVector %[[#float_32]] 4 + +define noundef <4 x half> @normalize_half4(<4 x half> noundef %a) { +entry: + ; CHECK: %[[#]] = OpFunction %[[#vec4_float_16]] None %[[#]] + ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#vec4_float_16]] + ; CHECK: %[[#]] = OpExtInst %[[#vec4_float_16]] %[[#op_ext_glsl]] Normalize %[[#arg0]] + %hlsl.normalize = call <4 x half> @llvm.spv.normalize.v4f16(<4 x half> %a) + ret <4 x half> %hlsl.normalize +} + +define noundef <4 x float> @normalize_float4(<4 x float> noundef %a) { +entry: + ; CHECK: %[[#]] = OpFunction %[[#vec4_float_32]] None %[[#]] + ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#vec4_float_32]] + ; CHECK: %[[#]] = OpExtInst %[[#vec4_float_32]] %[[#op_ext_glsl]] Normalize %[[#arg0]] + %hlsl.normalize = call <4 x float> @llvm.spv.normalize.v4f32(<4 x float> %a) + ret <4 x float> %hlsl.normalize +} + +declare <4 x half> @llvm.spv.normalize.v4f16(<4 x half>) +declare <4 x float> @llvm.spv.normalize.v4f32(<4 x float>) diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/step.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/step.ll index bb50d8c790f8adc23609173699b0df7a597c109f..7c0ee9398d15fc1d26c7d8534624bd6379ac9517 100644 --- a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/step.ll +++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/step.ll @@ -1,33 +1,33 @@ -; RUN: llc -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s -; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-unknown %s -o - -filetype=obj | spirv-val %} - -; Make sure SPIRV operation function calls for step are lowered correctly. 
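; (Aside: GLSL.std.450 Step(edge, x) yields 0.0 where x < edge and 1.0 elsewhere,
; applied per component. A plain-IR sketch, with the hypothetical helper name
; @step_ref and not part of this patch, would be:)
define <4 x float> @step_ref(<4 x float> %edge, <4 x float> %x) {
  %cmp = fcmp olt <4 x float> %x, %edge    ; true lanes get 0.0
  %res = select <4 x i1> %cmp, <4 x float> zeroinitializer, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>
  ret <4 x float> %res
}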
- -; CHECK-DAG: %[[#op_ext_glsl:]] = OpExtInstImport "GLSL.std.450" -; CHECK-DAG: %[[#float_32:]] = OpTypeFloat 32 -; CHECK-DAG: %[[#float_16:]] = OpTypeFloat 16 -; CHECK-DAG: %[[#vec4_float_16:]] = OpTypeVector %[[#float_16]] 4 -; CHECK-DAG: %[[#vec4_float_32:]] = OpTypeVector %[[#float_32]] 4 - -define noundef <4 x half> @step_half4(<4 x half> noundef %a, <4 x half> noundef %b) { -entry: - ; CHECK: %[[#]] = OpFunction %[[#vec4_float_16]] None %[[#]] - ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#vec4_float_16]] - ; CHECK: %[[#arg1:]] = OpFunctionParameter %[[#vec4_float_16]] - ; CHECK: %[[#]] = OpExtInst %[[#vec4_float_16]] %[[#op_ext_glsl]] Step %[[#arg0]] %[[#arg1]] - %hlsl.step = call <4 x half> @llvm.spv.step.v4f16(<4 x half> %a, <4 x half> %b) - ret <4 x half> %hlsl.step -} - -define noundef <4 x float> @step_float4(<4 x float> noundef %a, <4 x float> noundef %b) { -entry: - ; CHECK: %[[#]] = OpFunction %[[#vec4_float_32]] None %[[#]] - ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#vec4_float_32]] - ; CHECK: %[[#arg1:]] = OpFunctionParameter %[[#vec4_float_32]] - ; CHECK: %[[#]] = OpExtInst %[[#vec4_float_32]] %[[#op_ext_glsl]] Step %[[#arg0]] %[[#arg1]] - %hlsl.step = call <4 x float> @llvm.spv.step.v4f32(<4 x float> %a, <4 x float> %b) - ret <4 x float> %hlsl.step -} - -declare <4 x half> @llvm.spv.step.v4f16(<4 x half>, <4 x half>) -declare <4 x float> @llvm.spv.step.v4f32(<4 x float>, <4 x float>) +; RUN: llc -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-unknown %s -o - -filetype=obj | spirv-val %} + +; Make sure SPIRV operation function calls for step are lowered correctly. + +; CHECK-DAG: %[[#op_ext_glsl:]] = OpExtInstImport "GLSL.std.450" +; CHECK-DAG: %[[#float_32:]] = OpTypeFloat 32 +; CHECK-DAG: %[[#float_16:]] = OpTypeFloat 16 +; CHECK-DAG: %[[#vec4_float_16:]] = OpTypeVector %[[#float_16]] 4 +; CHECK-DAG: %[[#vec4_float_32:]] = OpTypeVector %[[#float_32]] 4 + +define noundef <4 x half> @step_half4(<4 x half> noundef %a, <4 x half> noundef %b) { +entry: + ; CHECK: %[[#]] = OpFunction %[[#vec4_float_16]] None %[[#]] + ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#vec4_float_16]] + ; CHECK: %[[#arg1:]] = OpFunctionParameter %[[#vec4_float_16]] + ; CHECK: %[[#]] = OpExtInst %[[#vec4_float_16]] %[[#op_ext_glsl]] Step %[[#arg0]] %[[#arg1]] + %hlsl.step = call <4 x half> @llvm.spv.step.v4f16(<4 x half> %a, <4 x half> %b) + ret <4 x half> %hlsl.step +} + +define noundef <4 x float> @step_float4(<4 x float> noundef %a, <4 x float> noundef %b) { +entry: + ; CHECK: %[[#]] = OpFunction %[[#vec4_float_32]] None %[[#]] + ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#vec4_float_32]] + ; CHECK: %[[#arg1:]] = OpFunctionParameter %[[#vec4_float_32]] + ; CHECK: %[[#]] = OpExtInst %[[#vec4_float_32]] %[[#op_ext_glsl]] Step %[[#arg0]] %[[#arg1]] + %hlsl.step = call <4 x float> @llvm.spv.step.v4f32(<4 x float> %a, <4 x float> %b) + ret <4 x float> %hlsl.step +} + +declare <4 x half> @llvm.spv.step.v4f16(<4 x half>, <4 x half>) +declare <4 x float> @llvm.spv.step.v4f32(<4 x float>, <4 x float>) diff --git a/llvm/test/CodeGen/SystemZ/naked-fn-with-frame-pointer.ll b/llvm/test/CodeGen/SystemZ/naked-fn-with-frame-pointer.ll new file mode 100644 index 0000000000000000000000000000000000000000..3eb396e4044206794e44a97ea103fd48165903fb --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/naked-fn-with-frame-pointer.ll @@ -0,0 +1,28 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 
+; RUN: llc < %s -mtriple s390x | FileCheck %s -check-prefixes=CHECK + +declare dso_local void @main() + +define dso_local void @naked() naked "frame-pointer"="all" { +; CHECK-LABEL: naked: +; CHECK: # %bb.0: +; CHECK-NEXT: brasl %r14, main@PLT + call void @main() + unreachable +} + +define dso_local void @normal() "frame-pointer"="all" { +; CHECK-LABEL: normal: +; CHECK: # %bb.0: +; CHECK-NEXT: stmg %r11, %r15, 88(%r15) +; CHECK-NEXT: .cfi_offset %r11, -72 +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: aghi %r15, -160 +; CHECK-NEXT: .cfi_def_cfa_offset 320 +; CHECK-NEXT: lgr %r11, %r15 +; CHECK-NEXT: .cfi_def_cfa_register %r11 +; CHECK-NEXT: brasl %r14, main@PLT + call void @main() + unreachable +} diff --git a/llvm/test/CodeGen/Thumb/frame-chain.ll b/llvm/test/CodeGen/Thumb/frame-chain.ll index eb62ce09caf1be45c20ad0c2b23b9d221739a5d0..e68fc626be981921ba9c0311f75a903a8538b60e 100644 --- a/llvm/test/CodeGen/Thumb/frame-chain.ll +++ b/llvm/test/CodeGen/Thumb/frame-chain.ll @@ -8,12 +8,16 @@ define dso_local noundef i32 @leaf(i32 noundef %0) { ; LEAF-FP-LABEL: leaf: ; LEAF-FP: @ %bb.0: -; LEAF-FP-NEXT: .pad #4 -; LEAF-FP-NEXT: sub sp, #4 -; LEAF-FP-NEXT: str r0, [sp] -; LEAF-FP-NEXT: adds r0, r0, #4 -; LEAF-FP-NEXT: add sp, #4 -; LEAF-FP-NEXT: bx lr +; LEAF-FP-NEXT: .save {r7, lr} +; LEAF-FP-NEXT: push {r7, lr} +; LEAF-FP-NEXT: .setfp r7, sp +; LEAF-FP-NEXT: add r7, sp, #0 +; LEAF-FP-NEXT: .pad #4 +; LEAF-FP-NEXT: sub sp, #4 +; LEAF-FP-NEXT: str r0, [sp] +; LEAF-FP-NEXT: adds r0, r0, #4 +; LEAF-FP-NEXT: add sp, #4 +; LEAF-FP-NEXT: pop {r7, pc} ; ; LEAF-FP-AAPCS-LABEL: leaf: ; LEAF-FP-AAPCS: @ %bb.0: diff --git a/llvm/test/CodeGen/Thumb2/avoidmuls.mir b/llvm/test/CodeGen/Thumb2/avoidmuls.mir new file mode 100644 index 0000000000000000000000000000000000000000..865152068fdf7fc4fee4f9828836aa84e2b6ba6b --- /dev/null +++ b/llvm/test/CodeGen/Thumb2/avoidmuls.mir @@ -0,0 +1,20 @@ +# RUN: llc -mtriple=thumbv7m-none-eabi -mcpu=cortex-m33 -run-pass=thumb2-reduce-size %s -o - | FileCheck %s --check-prefix=MUL +# RUN: llc -mtriple=thumbv7m-none-eabi --run-pass=thumb2-reduce-size %s -o - | FileCheck %s --check-prefix=MULS + +--- +name: test +body: | + bb.0: + $r2 = tMOVr $r0, 14, _ + $r0 = t2MOVi 1, 14, _, _ + $r0 = t2MUL $r2, killed $r0, 14, _ + tBX_RET 14, _, implicit $r0 + +... +# MUL-LABEL: test +# MUL: t2MUL +# MUL-NOT: tMUL + +# MULS-LABEL: test +# MULS: tMUL +# MULS-NOT: t2MUL \ No newline at end of file diff --git a/llvm/test/CodeGen/Thumb2/frame-pointer.ll b/llvm/test/CodeGen/Thumb2/frame-pointer.ll index ae3c1c8a50e2b4ecb7343f3c20d4fb05c66bbf47..85c919a50d88c1850bd9236acce9bd21b8c3d667 100644 --- a/llvm/test/CodeGen/Thumb2/frame-pointer.ll +++ b/llvm/test/CodeGen/Thumb2/frame-pointer.ll @@ -14,7 +14,7 @@ define void @leaf() { ; Leaf function, frame pointer is requested but we don't need any stack frame, ; so don't create a frame pointer. 
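; (For illustration, with a hypothetical function that is not one of the tests
; here: under "frame-pointer"="none" a leaf like the following compiles with no
; push/pop at all, while "frame-pointer"="all" would now force the r7/lr save
; even in such a leaf, which is presumably why these tests flip the attribute
; rather than the CHECK lines.)
define i32 @leaf_add(i32 %x) "frame-pointer"="none" {
  %r = add i32 %x, 1    ; no stack traffic, so no frame is needed
  ret i32 %r
}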
-define void @leaf_nofpelim() "frame-pointer"="all" { +define void @leaf_nofpelim() "frame-pointer"="none" { ; CHECK-LABEL: leaf_nofpelim: ; CHECK-NOT: push ; CHECK-NOT: sp diff --git a/llvm/test/CodeGen/Thumb2/frameless.ll b/llvm/test/CodeGen/Thumb2/frameless.ll index 01e0414de37d93197e9a46f730b97917d0990b74..44914136b1f83983aa561b1d3059bb2d73a98dd3 100644 --- a/llvm/test/CodeGen/Thumb2/frameless.ll +++ b/llvm/test/CodeGen/Thumb2/frameless.ll @@ -1,5 +1,5 @@ -; RUN: llc < %s -mtriple=thumbv7-apple-darwin -frame-pointer=all | not grep mov -; RUN: llc < %s -mtriple=thumbv7-linux -frame-pointer=all | not grep mov +; RUN: llc < %s -mtriple=thumbv7-apple-darwin -frame-pointer=none | not grep mov +; RUN: llc < %s -mtriple=thumbv7-linux -frame-pointer=none | not grep mov define void @t() nounwind readnone { ret void diff --git a/llvm/test/CodeGen/Thumb2/frameless2.ll b/llvm/test/CodeGen/Thumb2/frameless2.ll index 4750527ae555cd4461c938104f62e8be74e1f591..4848deaf8a1e4c6677a661a7735464d62c350c11 100644 --- a/llvm/test/CodeGen/Thumb2/frameless2.ll +++ b/llvm/test/CodeGen/Thumb2/frameless2.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -mtriple=thumbv7-apple-darwin -frame-pointer=all | not grep r7 +; RUN: llc < %s -mtriple=thumbv7-apple-darwin -frame-pointer=none | not grep r7 %struct.noise3 = type { [3 x [17 x i32]] } %struct.noiseguard = type { i32, i32, i32 } diff --git a/llvm/test/CodeGen/Thumb2/machine-licm.ll b/llvm/test/CodeGen/Thumb2/machine-licm.ll index 5a2ec9280de770c422894a5602c7954c34aef324..a2f379f7b5438442ab857f29654b1b5c583161fc 100644 --- a/llvm/test/CodeGen/Thumb2/machine-licm.ll +++ b/llvm/test/CodeGen/Thumb2/machine-licm.ll @@ -1,5 +1,5 @@ -; RUN: llc < %s -mtriple=thumbv7-apple-darwin -mcpu=cortex-a8 -relocation-model=dynamic-no-pic -frame-pointer=all | FileCheck %s -; RUN: llc < %s -mtriple=thumbv7-apple-darwin -mcpu=cortex-a8 -relocation-model=pic -frame-pointer=all | FileCheck %s --check-prefix=PIC +; RUN: llc < %s -mtriple=thumbv7-apple-darwin -mcpu=cortex-a8 -relocation-model=dynamic-no-pic -frame-pointer=none | FileCheck %s +; RUN: llc < %s -mtriple=thumbv7-apple-darwin -mcpu=cortex-a8 -relocation-model=pic -frame-pointer=none | FileCheck %s --check-prefix=PIC ; rdar://7353541 ; rdar://7354376 diff --git a/llvm/test/CodeGen/Thumb2/pacbti-m-frame-chain.ll b/llvm/test/CodeGen/Thumb2/pacbti-m-frame-chain.ll new file mode 100644 index 0000000000000000000000000000000000000000..8bcf87130c540087b12c954c4a0fade236ac92bb --- /dev/null +++ b/llvm/test/CodeGen/Thumb2/pacbti-m-frame-chain.ll @@ -0,0 +1,150 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=thumbv8.1m.main-none-eabi < %s --force-dwarf-frame-section -frame-pointer=all -mattr=+aapcs-frame-chain | FileCheck %s + +; int test1() { +; return 0; +; } +define i32 @test1() "sign-return-address"="non-leaf" { +; CHECK-LABEL: test1: +; CHECK: .cfi_sections .debug_frame +; CHECK-NEXT: .cfi_startproc +; CHECK-NEXT: @ %bb.0: @ %entry +; CHECK-NEXT: pac r12, lr, sp +; CHECK-NEXT: .save {ra_auth_code} +; CHECK-NEXT: str r12, [sp, #-4]! 
+; CHECK-NEXT: .cfi_def_cfa_offset 4 +; CHECK-NEXT: .cfi_offset ra_auth_code, -4 +; CHECK-NEXT: .save {r11, lr} +; CHECK-NEXT: push.w {r11, lr} +; CHECK-NEXT: .cfi_def_cfa_offset 12 +; CHECK-NEXT: .cfi_offset lr, -8 +; CHECK-NEXT: .cfi_offset r11, -12 +; CHECK-NEXT: .setfp r11, sp +; CHECK-NEXT: mov r11, sp +; CHECK-NEXT: .cfi_def_cfa_register r11 +; CHECK-NEXT: movs r0, #0 +; CHECK-NEXT: pop.w {r11, lr} +; CHECK-NEXT: ldr r12, [sp], #4 +; CHECK-NEXT: aut r12, lr, sp +; CHECK-NEXT: bx lr +entry: + ret i32 0 +} + +; void foo(int n) { +; int a[n]; +; bar(a); +; } +define dso_local void @test2(i32 noundef %n) "sign-return-address"="non-leaf" { +; CHECK-LABEL: test2: +; CHECK: .cfi_startproc +; CHECK-NEXT: @ %bb.0: @ %entry +; CHECK-NEXT: pac r12, lr, sp +; CHECK-NEXT: .save {r4, r7, ra_auth_code} +; CHECK-NEXT: push.w {r4, r7, r12} +; CHECK-NEXT: .cfi_def_cfa_offset 12 +; CHECK-NEXT: .cfi_offset ra_auth_code, -4 +; CHECK-NEXT: .cfi_offset r7, -8 +; CHECK-NEXT: .cfi_offset r4, -12 +; CHECK-NEXT: .save {r11, lr} +; CHECK-NEXT: push.w {r11, lr} +; CHECK-NEXT: .cfi_def_cfa_offset 20 +; CHECK-NEXT: .cfi_offset lr, -16 +; CHECK-NEXT: .cfi_offset r11, -20 +; CHECK-NEXT: .setfp r11, sp +; CHECK-NEXT: mov r11, sp +; CHECK-NEXT: .cfi_def_cfa_register r11 +; CHECK-NEXT: .pad #4 +; CHECK-NEXT: sub sp, #4 +; CHECK-NEXT: movs r1, #7 +; CHECK-NEXT: add.w r0, r1, r0, lsl #2 +; CHECK-NEXT: bic r0, r0, #7 +; CHECK-NEXT: sub.w r0, sp, r0 +; CHECK-NEXT: mov sp, r0 +; CHECK-NEXT: bl take_ptr +; CHECK-NEXT: mov sp, r11 +; CHECK-NEXT: pop.w {r11, lr} +; CHECK-NEXT: pop.w {r4, r7, r12} +; CHECK-NEXT: aut r12, lr, sp +; CHECK-NEXT: bx lr +entry: + %vla = alloca i32, i32 %n, align 4 + call void @take_ptr(ptr noundef nonnull %vla) + ret void +} + +; void test3(int c, float e, int z) { +; if (c) +; knr(); +; take_ptr(alloca(z)); +; if (e) +; knr(); +; } +define void @test3(i32 noundef %c, float noundef %e, i32 noundef %z) "sign-return-address"="non-leaf" { +; CHECK-LABEL: test3: +; CHECK: .cfi_startproc +; CHECK-NEXT: @ %bb.0: @ %entry +; CHECK-NEXT: pac r12, lr, sp +; CHECK-NEXT: .save {r4, r5, r6, r7, ra_auth_code} +; CHECK-NEXT: push.w {r4, r5, r6, r7, r12} +; CHECK-NEXT: .cfi_def_cfa_offset 20 +; CHECK-NEXT: .cfi_offset ra_auth_code, -4 +; CHECK-NEXT: .cfi_offset r7, -8 +; CHECK-NEXT: .cfi_offset r6, -12 +; CHECK-NEXT: .cfi_offset r5, -16 +; CHECK-NEXT: .cfi_offset r4, -20 +; CHECK-NEXT: .save {r11, lr} +; CHECK-NEXT: push.w {r11, lr} +; CHECK-NEXT: .cfi_def_cfa_offset 28 +; CHECK-NEXT: .cfi_offset lr, -24 +; CHECK-NEXT: .cfi_offset r11, -28 +; CHECK-NEXT: .setfp r11, sp +; CHECK-NEXT: mov r11, sp +; CHECK-NEXT: .cfi_def_cfa_register r11 +; CHECK-NEXT: .pad #4 +; CHECK-NEXT: sub sp, #4 +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: mov r5, r2 +; CHECK-NEXT: mov r4, r1 +; CHECK-NEXT: it ne +; CHECK-NEXT: blne knr +; CHECK-NEXT: adds r0, r5, #7 +; CHECK-NEXT: bic r0, r0, #7 +; CHECK-NEXT: sub.w r0, sp, r0 +; CHECK-NEXT: mov sp, r0 +; CHECK-NEXT: bl take_ptr +; CHECK-NEXT: mov r0, r4 +; CHECK-NEXT: movs r1, #0 +; CHECK-NEXT: bl __aeabi_fcmpeq +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: it eq +; CHECK-NEXT: bleq knr +; CHECK-NEXT: mov sp, r11 +; CHECK-NEXT: pop.w {r11, lr} +; CHECK-NEXT: pop.w {r4, r5, r6, r7, r12} +; CHECK-NEXT: aut r12, lr, sp +; CHECK-NEXT: bx lr +entry: + %tobool.not = icmp eq i32 %c, 0 + br i1 %tobool.not, label %if.end, label %if.then + +if.then: ; preds = %entry + tail call void @knr() + br label %if.end + +if.end: ; preds = %if.then, %entry + %0 = alloca i8, i32 %z, align 8 + call void @take_ptr(ptr 
noundef nonnull %0) + %tobool1 = fcmp une float %e, 0.000000e+00 + br i1 %tobool1, label %if.then2, label %if.end3 + +if.then2: ; preds = %if.end + call void @knr() + br label %if.end3 + +if.end3: ; preds = %if.then2, %if.end + ret void +} + +declare void @knr(...) +declare void @take_ptr(ptr noundef) diff --git a/llvm/test/CodeGen/VE/naked-fn-with-frame-pointer.ll b/llvm/test/CodeGen/VE/naked-fn-with-frame-pointer.ll new file mode 100644 index 0000000000000000000000000000000000000000..3b88bea46c4dd07bc5c1a116be3c997d0ba6e58c --- /dev/null +++ b/llvm/test/CodeGen/VE/naked-fn-with-frame-pointer.ll @@ -0,0 +1,41 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple ve | FileCheck %s -check-prefixes=CHECK + +declare dso_local void @main() + +define dso_local void @naked() naked "frame-pointer"="all" { +; CHECK-LABEL: naked: +; CHECK: # %bb.0: +; CHECK-NEXT: lea %s0, main@lo +; CHECK-NEXT: and %s0, %s0, (32)0 +; CHECK-NEXT: lea.sl %s12, main@hi(, %s0) +; CHECK-NEXT: bsic %s10, (, %s12) + call void @main() + unreachable +} + +define dso_local void @normal() "frame-pointer"="all" { +; CHECK-LABEL: normal: +; CHECK: # %bb.0: +; CHECK-NEXT: st %s9, (, %s11) +; CHECK-NEXT: st %s10, 8(, %s11) +; CHECK-NEXT: or %s9, 0, %s11 +; CHECK-NEXT: lea %s11, -240(, %s11) +; CHECK-NEXT: brge.l.t %s11, %s8, .LBB1_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: ld %s61, 24(, %s14) +; CHECK-NEXT: or %s62, 0, %s0 +; CHECK-NEXT: lea %s63, 315 +; CHECK-NEXT: shm.l %s63, (%s61) +; CHECK-NEXT: shm.l %s8, 8(%s61) +; CHECK-NEXT: shm.l %s11, 16(%s61) +; CHECK-NEXT: monc +; CHECK-NEXT: or %s0, 0, %s62 +; CHECK-NEXT: .LBB1_2: +; CHECK-NEXT: lea %s0, main@lo +; CHECK-NEXT: and %s0, %s0, (32)0 +; CHECK-NEXT: lea.sl %s12, main@hi(, %s0) +; CHECK-NEXT: bsic %s10, (, %s12) + call void @main() + unreachable +} diff --git a/llvm/test/CodeGen/WebAssembly/libcalls-trig.ll b/llvm/test/CodeGen/WebAssembly/libcalls-trig.ll index 8bc9c043fcf8ae0cb212df1e72fef23143ad7c62..7850559b49b7d78a01d39b1a8e38a553aa491fb5 100644 --- a/llvm/test/CodeGen/WebAssembly/libcalls-trig.ll +++ b/llvm/test/CodeGen/WebAssembly/libcalls-trig.ll @@ -10,10 +10,11 @@ declare fp128 @llvm.cos.f128(fp128) declare fp128 @llvm.tan.f128(fp128) declare fp128 @llvm.asin.f128(fp128) declare fp128 @llvm.acos.f128(fp128) -declare fp128 @llvm.atan.f128.i32(fp128) +declare fp128 @llvm.atan.f128(fp128) declare fp128 @llvm.sinh.f128(fp128) declare fp128 @llvm.cosh.f128(fp128) declare fp128 @llvm.tanh.f128(fp128) +declare fp128 @llvm.atan2.f128(fp128, fp128) declare double @llvm.sin.f64(double) declare double @llvm.cos.f64(double) @@ -24,6 +25,7 @@ declare double @llvm.atan.f64(double) declare double @llvm.sinh.f64(double) declare double @llvm.cosh.f64(double) declare double @llvm.tanh.f64(double) +declare double @llvm.atan2.f64(double, double) declare float @llvm.sin.f32(float) declare float @llvm.cos.f32(float) @@ -34,6 +36,7 @@ declare float @llvm.atan.f32(float) declare float @llvm.sinh.f32(float) declare float @llvm.cosh.f32(float) declare float @llvm.tanh.f32(float) +declare float @llvm.atan2.f32(float, float) define fp128 @fp128libcalls(fp128 %x) { @@ -42,154 +45,171 @@ define fp128 @fp128libcalls(fp128 %x) { ; CHECK: .functype fp128libcalls (i32, i64, i64) -> () ; CHECK-NEXT: .local i32 ; CHECK-NEXT: # %bb.0: -; CHECK-NEXT: global.get $push28=, __stack_pointer -; CHECK-NEXT: i32.const $push29=, 144 -; CHECK-NEXT: i32.sub $push73=, $pop28, $pop29 -; CHECK-NEXT: local.tee $push72=, 3, $pop73 -; 
CHECK-NEXT: global.set __stack_pointer, $pop72 -; CHECK-NEXT: local.get $push74=, 3 -; CHECK-NEXT: i32.const $push62=, 128 -; CHECK-NEXT: i32.add $push63=, $pop74, $pop62 -; CHECK-NEXT: local.get $push76=, 1 -; CHECK-NEXT: local.get $push75=, 2 -; CHECK-NEXT: call sinl, $pop63, $pop76, $pop75 -; CHECK-NEXT: local.get $push77=, 3 -; CHECK-NEXT: i32.const $push58=, 112 -; CHECK-NEXT: i32.add $push59=, $pop77, $pop58 -; CHECK-NEXT: local.get $push78=, 3 -; CHECK-NEXT: i64.load $push3=, 128($pop78) -; CHECK-NEXT: local.get $push79=, 3 -; CHECK-NEXT: i32.const $push60=, 128 -; CHECK-NEXT: i32.add $push61=, $pop79, $pop60 -; CHECK-NEXT: i32.const $push0=, 8 -; CHECK-NEXT: i32.add $push1=, $pop61, $pop0 -; CHECK-NEXT: i64.load $push2=, 0($pop1) -; CHECK-NEXT: call cosl, $pop59, $pop3, $pop2 -; CHECK-NEXT: local.get $push80=, 3 -; CHECK-NEXT: i32.const $push54=, 96 -; CHECK-NEXT: i32.add $push55=, $pop80, $pop54 -; CHECK-NEXT: local.get $push81=, 3 -; CHECK-NEXT: i64.load $push6=, 112($pop81) -; CHECK-NEXT: local.get $push82=, 3 -; CHECK-NEXT: i32.const $push56=, 112 -; CHECK-NEXT: i32.add $push57=, $pop82, $pop56 -; CHECK-NEXT: i32.const $push71=, 8 -; CHECK-NEXT: i32.add $push4=, $pop57, $pop71 -; CHECK-NEXT: i64.load $push5=, 0($pop4) -; CHECK-NEXT: call tanl, $pop55, $pop6, $pop5 -; CHECK-NEXT: local.get $push83=, 3 -; CHECK-NEXT: i32.const $push50=, 80 -; CHECK-NEXT: i32.add $push51=, $pop83, $pop50 -; CHECK-NEXT: local.get $push84=, 3 -; CHECK-NEXT: i64.load $push9=, 96($pop84) -; CHECK-NEXT: local.get $push85=, 3 -; CHECK-NEXT: i32.const $push52=, 96 -; CHECK-NEXT: i32.add $push53=, $pop85, $pop52 -; CHECK-NEXT: i32.const $push70=, 8 -; CHECK-NEXT: i32.add $push7=, $pop53, $pop70 -; CHECK-NEXT: i64.load $push8=, 0($pop7) -; CHECK-NEXT: call asinl, $pop51, $pop9, $pop8 -; CHECK-NEXT: local.get $push86=, 3 -; CHECK-NEXT: i32.const $push46=, 64 -; CHECK-NEXT: i32.add $push47=, $pop86, $pop46 -; CHECK-NEXT: local.get $push87=, 3 -; CHECK-NEXT: i64.load $push12=, 80($pop87) -; CHECK-NEXT: local.get $push88=, 3 -; CHECK-NEXT: i32.const $push48=, 80 -; CHECK-NEXT: i32.add $push49=, $pop88, $pop48 -; CHECK-NEXT: i32.const $push69=, 8 -; CHECK-NEXT: i32.add $push10=, $pop49, $pop69 -; CHECK-NEXT: i64.load $push11=, 0($pop10) -; CHECK-NEXT: call acosl, $pop47, $pop12, $pop11 -; CHECK-NEXT: local.get $push89=, 3 -; CHECK-NEXT: i32.const $push42=, 48 -; CHECK-NEXT: i32.add $push43=, $pop89, $pop42 -; CHECK-NEXT: local.get $push90=, 3 -; CHECK-NEXT: i64.load $push15=, 64($pop90) -; CHECK-NEXT: local.get $push91=, 3 -; CHECK-NEXT: i32.const $push44=, 64 -; CHECK-NEXT: i32.add $push45=, $pop91, $pop44 -; CHECK-NEXT: i32.const $push68=, 8 -; CHECK-NEXT: i32.add $push13=, $pop45, $pop68 -; CHECK-NEXT: i64.load $push14=, 0($pop13) -; CHECK-NEXT: call atanl, $pop43, $pop15, $pop14 -; CHECK-NEXT: local.get $push92=, 3 -; CHECK-NEXT: i32.const $push38=, 32 -; CHECK-NEXT: i32.add $push39=, $pop92, $pop38 -; CHECK-NEXT: local.get $push93=, 3 -; CHECK-NEXT: i64.load $push18=, 48($pop93) -; CHECK-NEXT: local.get $push94=, 3 -; CHECK-NEXT: i32.const $push40=, 48 -; CHECK-NEXT: i32.add $push41=, $pop94, $pop40 -; CHECK-NEXT: i32.const $push67=, 8 -; CHECK-NEXT: i32.add $push16=, $pop41, $pop67 -; CHECK-NEXT: i64.load $push17=, 0($pop16) -; CHECK-NEXT: call sinhl, $pop39, $pop18, $pop17 -; CHECK-NEXT: local.get $push95=, 3 -; CHECK-NEXT: i32.const $push34=, 16 -; CHECK-NEXT: i32.add $push35=, $pop95, $pop34 -; CHECK-NEXT: local.get $push96=, 3 -; CHECK-NEXT: i64.load $push21=, 32($pop96) -; CHECK-NEXT: local.get 
$push97=, 3 -; CHECK-NEXT: i32.const $push36=, 32 -; CHECK-NEXT: i32.add $push37=, $pop97, $pop36 -; CHECK-NEXT: i32.const $push66=, 8 -; CHECK-NEXT: i32.add $push19=, $pop37, $pop66 -; CHECK-NEXT: i64.load $push20=, 0($pop19) -; CHECK-NEXT: call coshl, $pop35, $pop21, $pop20 -; CHECK-NEXT: local.get $push100=, 3 -; CHECK-NEXT: local.get $push98=, 3 -; CHECK-NEXT: i64.load $push24=, 16($pop98) -; CHECK-NEXT: local.get $push99=, 3 -; CHECK-NEXT: i32.const $push32=, 16 -; CHECK-NEXT: i32.add $push33=, $pop99, $pop32 -; CHECK-NEXT: i32.const $push65=, 8 -; CHECK-NEXT: i32.add $push22=, $pop33, $pop65 -; CHECK-NEXT: i64.load $push23=, 0($pop22) -; CHECK-NEXT: call tanhl, $pop100, $pop24, $pop23 -; CHECK-NEXT: local.get $push102=, 0 -; CHECK-NEXT: local.get $push101=, 3 -; CHECK-NEXT: i32.const $push64=, 8 -; CHECK-NEXT: i32.add $push25=, $pop101, $pop64 -; CHECK-NEXT: i64.load $push26=, 0($pop25) -; CHECK-NEXT: i64.store 8($pop102), $pop26 -; CHECK-NEXT: local.get $push104=, 0 -; CHECK-NEXT: local.get $push103=, 3 -; CHECK-NEXT: i64.load $push27=, 0($pop103) -; CHECK-NEXT: i64.store 0($pop104), $pop27 -; CHECK-NEXT: local.get $push105=, 3 -; CHECK-NEXT: i32.const $push30=, 144 -; CHECK-NEXT: i32.add $push31=, $pop105, $pop30 -; CHECK-NEXT: global.set __stack_pointer, $pop31 +; CHECK-NEXT: global.get $push31=, __stack_pointer +; CHECK-NEXT: i32.const $push32=, 160 +; CHECK-NEXT: i32.sub $push81=, $pop31, $pop32 +; CHECK-NEXT: local.tee $push80=, 3, $pop81 +; CHECK-NEXT: global.set __stack_pointer, $pop80 +; CHECK-NEXT: local.get $push82=, 3 +; CHECK-NEXT: i32.const $push69=, 144 +; CHECK-NEXT: i32.add $push70=, $pop82, $pop69 +; CHECK-NEXT: local.get $push84=, 1 +; CHECK-NEXT: local.get $push83=, 2 +; CHECK-NEXT: call sinl, $pop70, $pop84, $pop83 +; CHECK-NEXT: local.get $push85=, 3 +; CHECK-NEXT: i32.const $push65=, 128 +; CHECK-NEXT: i32.add $push66=, $pop85, $pop65 +; CHECK-NEXT: local.get $push86=, 3 +; CHECK-NEXT: i64.load $push3=, 144($pop86) +; CHECK-NEXT: local.get $push87=, 3 +; CHECK-NEXT: i32.const $push67=, 144 +; CHECK-NEXT: i32.add $push68=, $pop87, $pop67 +; CHECK-NEXT: i32.const $push0=, 8 +; CHECK-NEXT: i32.add $push1=, $pop68, $pop0 +; CHECK-NEXT: i64.load $push2=, 0($pop1) +; CHECK-NEXT: call cosl, $pop66, $pop3, $pop2 +; CHECK-NEXT: local.get $push88=, 3 +; CHECK-NEXT: i32.const $push61=, 112 +; CHECK-NEXT: i32.add $push62=, $pop88, $pop61 +; CHECK-NEXT: local.get $push89=, 3 +; CHECK-NEXT: i64.load $push6=, 128($pop89) +; CHECK-NEXT: local.get $push90=, 3 +; CHECK-NEXT: i32.const $push63=, 128 +; CHECK-NEXT: i32.add $push64=, $pop90, $pop63 +; CHECK-NEXT: i32.const $push79=, 8 +; CHECK-NEXT: i32.add $push4=, $pop64, $pop79 +; CHECK-NEXT: i64.load $push5=, 0($pop4) +; CHECK-NEXT: call tanl, $pop62, $pop6, $pop5 +; CHECK-NEXT: local.get $push91=, 3 +; CHECK-NEXT: i32.const $push57=, 96 +; CHECK-NEXT: i32.add $push58=, $pop91, $pop57 +; CHECK-NEXT: local.get $push92=, 3 +; CHECK-NEXT: i64.load $push9=, 112($pop92) +; CHECK-NEXT: local.get $push93=, 3 +; CHECK-NEXT: i32.const $push59=, 112 +; CHECK-NEXT: i32.add $push60=, $pop93, $pop59 +; CHECK-NEXT: i32.const $push78=, 8 +; CHECK-NEXT: i32.add $push7=, $pop60, $pop78 +; CHECK-NEXT: i64.load $push8=, 0($pop7) +; CHECK-NEXT: call asinl, $pop58, $pop9, $pop8 +; CHECK-NEXT: local.get $push94=, 3 +; CHECK-NEXT: i32.const $push53=, 80 +; CHECK-NEXT: i32.add $push54=, $pop94, $pop53 +; CHECK-NEXT: local.get $push95=, 3 +; CHECK-NEXT: i64.load $push12=, 96($pop95) +; CHECK-NEXT: local.get $push96=, 3 +; CHECK-NEXT: i32.const 
$push55=, 96 +; CHECK-NEXT: i32.add $push56=, $pop96, $pop55 +; CHECK-NEXT: i32.const $push77=, 8 +; CHECK-NEXT: i32.add $push10=, $pop56, $pop77 +; CHECK-NEXT: i64.load $push11=, 0($pop10) +; CHECK-NEXT: call acosl, $pop54, $pop12, $pop11 +; CHECK-NEXT: local.get $push97=, 3 +; CHECK-NEXT: i32.const $push49=, 64 +; CHECK-NEXT: i32.add $push50=, $pop97, $pop49 +; CHECK-NEXT: local.get $push98=, 3 +; CHECK-NEXT: i64.load $push15=, 80($pop98) +; CHECK-NEXT: local.get $push99=, 3 +; CHECK-NEXT: i32.const $push51=, 80 +; CHECK-NEXT: i32.add $push52=, $pop99, $pop51 +; CHECK-NEXT: i32.const $push76=, 8 +; CHECK-NEXT: i32.add $push13=, $pop52, $pop76 +; CHECK-NEXT: i64.load $push14=, 0($pop13) +; CHECK-NEXT: call atanl, $pop50, $pop15, $pop14 +; CHECK-NEXT: local.get $push100=, 3 +; CHECK-NEXT: i32.const $push45=, 48 +; CHECK-NEXT: i32.add $push46=, $pop100, $pop45 +; CHECK-NEXT: local.get $push101=, 3 +; CHECK-NEXT: i64.load $push18=, 64($pop101) +; CHECK-NEXT: local.get $push102=, 3 +; CHECK-NEXT: i32.const $push47=, 64 +; CHECK-NEXT: i32.add $push48=, $pop102, $pop47 +; CHECK-NEXT: i32.const $push75=, 8 +; CHECK-NEXT: i32.add $push16=, $pop48, $pop75 +; CHECK-NEXT: i64.load $push17=, 0($pop16) +; CHECK-NEXT: call sinhl, $pop46, $pop18, $pop17 +; CHECK-NEXT: local.get $push103=, 3 +; CHECK-NEXT: i32.const $push41=, 32 +; CHECK-NEXT: i32.add $push42=, $pop103, $pop41 +; CHECK-NEXT: local.get $push104=, 3 +; CHECK-NEXT: i64.load $push21=, 48($pop104) +; CHECK-NEXT: local.get $push105=, 3 +; CHECK-NEXT: i32.const $push43=, 48 +; CHECK-NEXT: i32.add $push44=, $pop105, $pop43 +; CHECK-NEXT: i32.const $push74=, 8 +; CHECK-NEXT: i32.add $push19=, $pop44, $pop74 +; CHECK-NEXT: i64.load $push20=, 0($pop19) +; CHECK-NEXT: call coshl, $pop42, $pop21, $pop20 +; CHECK-NEXT: local.get $push106=, 3 +; CHECK-NEXT: i32.const $push37=, 16 +; CHECK-NEXT: i32.add $push38=, $pop106, $pop37 +; CHECK-NEXT: local.get $push107=, 3 +; CHECK-NEXT: i64.load $push24=, 32($pop107) +; CHECK-NEXT: local.get $push108=, 3 +; CHECK-NEXT: i32.const $push39=, 32 +; CHECK-NEXT: i32.add $push40=, $pop108, $pop39 +; CHECK-NEXT: i32.const $push73=, 8 +; CHECK-NEXT: i32.add $push22=, $pop40, $pop73 +; CHECK-NEXT: i64.load $push23=, 0($pop22) +; CHECK-NEXT: call tanhl, $pop38, $pop24, $pop23 +; CHECK-NEXT: local.get $push113=, 3 +; CHECK-NEXT: local.get $push112=, 1 +; CHECK-NEXT: local.get $push111=, 2 +; CHECK-NEXT: local.get $push109=, 3 +; CHECK-NEXT: i64.load $push27=, 16($pop109) +; CHECK-NEXT: local.get $push110=, 3 +; CHECK-NEXT: i32.const $push35=, 16 +; CHECK-NEXT: i32.add $push36=, $pop110, $pop35 +; CHECK-NEXT: i32.const $push72=, 8 +; CHECK-NEXT: i32.add $push25=, $pop36, $pop72 +; CHECK-NEXT: i64.load $push26=, 0($pop25) +; CHECK-NEXT: call atan2l, $pop113, $pop112, $pop111, $pop27, $pop26 +; CHECK-NEXT: local.get $push115=, 0 +; CHECK-NEXT: local.get $push114=, 3 +; CHECK-NEXT: i32.const $push71=, 8 +; CHECK-NEXT: i32.add $push28=, $pop114, $pop71 +; CHECK-NEXT: i64.load $push29=, 0($pop28) +; CHECK-NEXT: i64.store 8($pop115), $pop29 +; CHECK-NEXT: local.get $push117=, 0 +; CHECK-NEXT: local.get $push116=, 3 +; CHECK-NEXT: i64.load $push30=, 0($pop116) +; CHECK-NEXT: i64.store 0($pop117), $pop30 +; CHECK-NEXT: local.get $push118=, 3 +; CHECK-NEXT: i32.const $push33=, 160 +; CHECK-NEXT: i32.add $push34=, $pop118, $pop33 +; CHECK-NEXT: global.set __stack_pointer, $pop34 ; CHECK-NEXT: return ; libm calls %d = call fp128 @llvm.sin.f128(fp128 %x) %e = call fp128 @llvm.cos.f128(fp128 %d) %f = call fp128 @llvm.tan.f128(fp128 
%e) - %g = call fp128 @llvm.asin.f128.i32(fp128 %f) + %g = call fp128 @llvm.asin.f128(fp128 %f) %h = call fp128 @llvm.acos.f128(fp128 %g) %i = call fp128 @llvm.atan.f128(fp128 %h) %a = call fp128 @llvm.sinh.f128(fp128 %i) %b = call fp128 @llvm.cosh.f128(fp128 %a) %c = call fp128 @llvm.tanh.f128(fp128 %b) - ret fp128 %c + %j = call fp128 @llvm.atan2.f128(fp128 %x, fp128 %c) + ret fp128 %j } define double @f64libcalls(double %x) { ; CHECK-LABEL: f64libcalls: ; CHECK: .functype f64libcalls (f64) -> (f64) ; CHECK-NEXT: # %bb.0: -; CHECK-NEXT: local.get $push9=, 0 -; CHECK-NEXT: call $push0=, sin, $pop9 -; CHECK-NEXT: call $push1=, cos, $pop0 -; CHECK-NEXT: call $push2=, tan, $pop1 -; CHECK-NEXT: call $push3=, asin, $pop2 -; CHECK-NEXT: call $push4=, acos, $pop3 -; CHECK-NEXT: call $push5=, atan, $pop4 -; CHECK-NEXT: call $push6=, sinh, $pop5 -; CHECK-NEXT: call $push7=, cosh, $pop6 -; CHECK-NEXT: call $push8=, tanh, $pop7 -; CHECK-NEXT: return $pop8 +; CHECK-NEXT: local.get $push11=, 0 +; CHECK-NEXT: local.get $push10=, 0 +; CHECK-NEXT: call $push0=, sin, $pop10 +; CHECK-NEXT: call $push1=, cos, $pop0 +; CHECK-NEXT: call $push2=, tan, $pop1 +; CHECK-NEXT: call $push3=, asin, $pop2 +; CHECK-NEXT: call $push4=, acos, $pop3 +; CHECK-NEXT: call $push5=, atan, $pop4 +; CHECK-NEXT: call $push6=, sinh, $pop5 +; CHECK-NEXT: call $push7=, cosh, $pop6 +; CHECK-NEXT: call $push8=, tanh, $pop7 +; CHECK-NEXT: call $push9=, atan2, $pop11, $pop8 +; CHECK-NEXT: return $pop9 %k = call double @llvm.sin.f64(double %x) @@ -201,24 +221,27 @@ define double @f64libcalls(double %x) { %f = call double @llvm.sinh.f64(double %e) %g = call double @llvm.cosh.f64(double %f) %h = call double @llvm.tanh.f64(double %g) - ret double %h + %i = call double @llvm.atan2.f64(double %x, double %h) + ret double %i } define float @f32libcalls(float %x) { ; CHECK-LABEL: f32libcalls: ; CHECK: .functype f32libcalls (f32) -> (f32) ; CHECK-NEXT: # %bb.0: -; CHECK-NEXT: local.get $push9=, 0 -; CHECK-NEXT: call $push0=, sinf, $pop9 -; CHECK-NEXT: call $push1=, cosf, $pop0 -; CHECK-NEXT: call $push2=, tanf, $pop1 -; CHECK-NEXT: call $push3=, asinf, $pop2 -; CHECK-NEXT: call $push4=, acosf, $pop3 -; CHECK-NEXT: call $push5=, atanf, $pop4 -; CHECK-NEXT: call $push6=, sinhf, $pop5 -; CHECK-NEXT: call $push7=, coshf, $pop6 -; CHECK-NEXT: call $push8=, tanhf, $pop7 -; CHECK-NEXT: return $pop8 +; CHECK-NEXT: local.get $push11=, 0 +; CHECK-NEXT: local.get $push10=, 0 +; CHECK-NEXT: call $push0=, sinf, $pop10 +; CHECK-NEXT: call $push1=, cosf, $pop0 +; CHECK-NEXT: call $push2=, tanf, $pop1 +; CHECK-NEXT: call $push3=, asinf, $pop2 +; CHECK-NEXT: call $push4=, acosf, $pop3 +; CHECK-NEXT: call $push5=, atanf, $pop4 +; CHECK-NEXT: call $push6=, sinhf, $pop5 +; CHECK-NEXT: call $push7=, coshf, $pop6 +; CHECK-NEXT: call $push8=, tanhf, $pop7 +; CHECK-NEXT: call $push9=, atan2f, $pop11, $pop8 +; CHECK-NEXT: return $pop9 %k = call float @llvm.sin.f32(float %x) @@ -230,5 +253,6 @@ define float @f32libcalls(float %x) { %f = call float @llvm.sinh.f32(float %e) %g = call float @llvm.cosh.f32(float %f) %h = call float @llvm.tanh.f32(float %g) - ret float %h + %i = call float @llvm.atan2.f32(float %x, float %h) + ret float %i } diff --git a/llvm/test/CodeGen/WebAssembly/naked-fn-with-frame-pointer.ll b/llvm/test/CodeGen/WebAssembly/naked-fn-with-frame-pointer.ll new file mode 100644 index 0000000000000000000000000000000000000000..fcd42e8cbfb9f5f2885b1b29ef6cf2b21eb7fe1f --- /dev/null +++ b/llvm/test/CodeGen/WebAssembly/naked-fn-with-frame-pointer.ll @@ -0,0 +1,37 
@@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple wasm32 | FileCheck %s -check-prefixes=CHECK-32 +; RUN: llc < %s -mtriple wasm64 | FileCheck %s -check-prefixes=CHECK-64 + +declare dso_local void @main() + +define dso_local void @naked() naked "frame-pointer"="all" { +; CHECK-32-LABEL: naked: +; CHECK-32: .functype naked () -> () +; CHECK-32-NEXT: # %bb.0: +; CHECK-32-NEXT: call main +; CHECK-32-NEXT: unreachable +; +; CHECK-64-LABEL: naked: +; CHECK-64: .functype naked () -> () +; CHECK-64-NEXT: # %bb.0: +; CHECK-64-NEXT: call main +; CHECK-64-NEXT: unreachable + call void @main() + unreachable +} + +define dso_local void @normal() "frame-pointer"="all" { +; CHECK-32-LABEL: normal: +; CHECK-32: .functype normal () -> () +; CHECK-32-NEXT: # %bb.0: +; CHECK-32-NEXT: call main +; CHECK-32-NEXT: unreachable +; +; CHECK-64-LABEL: normal: +; CHECK-64: .functype normal () -> () +; CHECK-64-NEXT: # %bb.0: +; CHECK-64-NEXT: call main +; CHECK-64-NEXT: unreachable + call void @main() + unreachable +} diff --git a/llvm/test/CodeGen/X86/andnot-patterns.ll b/llvm/test/CodeGen/X86/andnot-patterns.ll new file mode 100644 index 0000000000000000000000000000000000000000..46ebe6ba76567a7b687511d9e38f3545bd8afb7f --- /dev/null +++ b/llvm/test/CodeGen/X86/andnot-patterns.ll @@ -0,0 +1,626 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i686-- -mattr=+bmi | FileCheck %s --check-prefixes=X86 +; RUN: llc < %s -mtriple=x86_64-- -mattr=+bmi | FileCheck %s --check-prefixes=X64 + +; TODO - PR112425 - attempt to reconstruct andnot patterns through bitwise-agnostic operations + +declare void @use_i64(i64) + +; +; Fold (and X, (rotl (not Y), Z))) -> (and X, (not (rotl Y, Z))) +; + +define i64 @andnot_rotl_i64(i64 %a0, i64 %a1, i64 %a2) nounwind { +; X86-LABEL: andnot_rotl_i64: +; X86: # %bb.0: +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: notl %esi +; X86-NEXT: notl %edx +; X86-NEXT: testb $32, %cl +; X86-NEXT: jne .LBB0_1 +; X86-NEXT: # %bb.2: +; X86-NEXT: movl %edx, %eax +; X86-NEXT: jmp .LBB0_3 +; X86-NEXT: .LBB0_1: +; X86-NEXT: movl %esi, %eax +; X86-NEXT: movl %edx, %esi +; X86-NEXT: .LBB0_3: +; X86-NEXT: movl %esi, %edx +; X86-NEXT: shldl %cl, %eax, %edx +; X86-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NEXT: shldl %cl, %esi, %eax +; X86-NEXT: andl {{[0-9]+}}(%esp), %eax +; X86-NEXT: andl {{[0-9]+}}(%esp), %edx +; X86-NEXT: popl %esi +; X86-NEXT: retl +; +; X64-LABEL: andnot_rotl_i64: +; X64: # %bb.0: +; X64-NEXT: movq %rdx, %rcx +; X64-NEXT: movq %rsi, %rax +; X64-NEXT: notq %rax +; X64-NEXT: # kill: def $cl killed $cl killed $rcx +; X64-NEXT: rolq %cl, %rax +; X64-NEXT: andq %rdi, %rax +; X64-NEXT: retq + %not = xor i64 %a1, -1 + %rot = tail call i64 @llvm.fshl.i64(i64 %not, i64 %not, i64 %a2) + %and = and i64 %rot, %a0 + ret i64 %and +} + +define i32 @andnot_rotl_i32(i32 %a0, i32 %a1, i32 %a2) nounwind { +; X86-LABEL: andnot_rotl_i32: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: notl %eax +; X86-NEXT: roll %cl, %eax +; X86-NEXT: andl {{[0-9]+}}(%esp), %eax +; X86-NEXT: retl +; +; X64-LABEL: andnot_rotl_i32: +; X64: # %bb.0: +; X64-NEXT: movl %edx, %ecx +; X64-NEXT: movl %esi, %eax +; X64-NEXT: notl %eax +; X64-NEXT: # kill: def $cl killed $cl killed $ecx +; X64-NEXT: roll %cl, %eax +; 
+
+define i32 @andnot_rotl_i32(i32 %a0, i32 %a1, i32 %a2) nounwind {
+; X86-LABEL: andnot_rotl_i32:
+; X86: # %bb.0:
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: notl %eax
+; X86-NEXT: roll %cl, %eax
+; X86-NEXT: andl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: retl
+;
+; X64-LABEL: andnot_rotl_i32:
+; X64: # %bb.0:
+; X64-NEXT: movl %edx, %ecx
+; X64-NEXT: movl %esi, %eax
+; X64-NEXT: notl %eax
+; X64-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-NEXT: roll %cl, %eax
+; X64-NEXT: andl %edi, %eax
+; X64-NEXT: retq
+ %not = xor i32 %a1, -1
+ %rot = tail call i32 @llvm.fshl.i32(i32 %not, i32 %not, i32 %a2)
+ %and = and i32 %rot, %a0
+ ret i32 %and
+}
+
+define i16 @andnot_rotl_i16(i16 %a0, i16 %a1, i16 %a2) nounwind {
+; X86-LABEL: andnot_rotl_i16:
+; X86: # %bb.0:
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: notl %eax
+; X86-NEXT: rolw %cl, %ax
+; X86-NEXT: andw {{[0-9]+}}(%esp), %ax
+; X86-NEXT: # kill: def $ax killed $ax killed $eax
+; X86-NEXT: retl
+;
+; X64-LABEL: andnot_rotl_i16:
+; X64: # %bb.0:
+; X64-NEXT: movl %edx, %ecx
+; X64-NEXT: movl %esi, %eax
+; X64-NEXT: notl %eax
+; X64-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-NEXT: rolw %cl, %ax
+; X64-NEXT: andl %edi, %eax
+; X64-NEXT: # kill: def $ax killed $ax killed $eax
+; X64-NEXT: retq
+ %not = xor i16 %a1, -1
+ %rot = tail call i16 @llvm.fshl.i16(i16 %not, i16 %not, i16 %a2)
+ %and = and i16 %rot, %a0
+ ret i16 %and
+}
+
+define i8 @andnot_rotl_i8(i8 %a0, i8 %a1, i8 %a2) nounwind {
+; X86-LABEL: andnot_rotl_i8:
+; X86: # %bb.0:
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: notb %al
+; X86-NEXT: rolb %cl, %al
+; X86-NEXT: andb {{[0-9]+}}(%esp), %al
+; X86-NEXT: retl
+;
+; X64-LABEL: andnot_rotl_i8:
+; X64: # %bb.0:
+; X64-NEXT: movl %edx, %ecx
+; X64-NEXT: movl %esi, %eax
+; X64-NEXT: notb %al
+; X64-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-NEXT: rolb %cl, %al
+; X64-NEXT: andb %dil, %al
+; X64-NEXT: # kill: def $al killed $al killed $eax
+; X64-NEXT: retq
+ %not = xor i8 %a1, -1
+ %rot = tail call i8 @llvm.fshl.i8(i8 %not, i8 %not, i8 %a2)
+ %and = and i8 %rot, %a0
+ ret i8 %and
+}
+
+define i64 @andnot_rotl_i64_multiuse(i64 %a0, i64 %a1, i64 %a2) nounwind {
+; X86-LABEL: andnot_rotl_i64_multiuse:
+; X86: # %bb.0:
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: notl %edx
+; X86-NEXT: notl %esi
+; X86-NEXT: testb $32, %cl
+; X86-NEXT: jne .LBB4_1
+; X86-NEXT: # %bb.2:
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: jmp .LBB4_3
+; X86-NEXT: .LBB4_1:
+; X86-NEXT: movl %edx, %eax
+; X86-NEXT: movl %esi, %edx
+; X86-NEXT: .LBB4_3:
+; X86-NEXT: movl %edx, %ebx
+; X86-NEXT: shldl %cl, %eax, %ebx
+; X86-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: andl %eax, %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: andl %ebx, %edi
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %eax
+; X86-NEXT: calll use_i64@PLT
+; X86-NEXT: addl $8, %esp
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: movl %edi, %edx
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
+; X86-NEXT: retl
+;
+; X64-LABEL: andnot_rotl_i64_multiuse:
+; X64: # %bb.0:
+; X64-NEXT: pushq %rbx
+; X64-NEXT: movq %rdx, %rcx
+; X64-NEXT: movq %rdi, %rbx
+; X64-NEXT: notq %rsi
+; X64-NEXT: # kill: def $cl killed $cl killed $rcx
+; X64-NEXT: rolq %cl, %rsi
+; X64-NEXT: andq %rsi, %rbx
+; X64-NEXT: movq %rsi, %rdi
+; X64-NEXT: callq use_i64@PLT
+; X64-NEXT: movq %rbx, %rax
+; X64-NEXT: popq %rbx
+; X64-NEXT: retq
+ %not = xor i64 %a1, -1
+ %rot = tail call i64 @llvm.fshl.i64(i64 %not, i64 %not, i64 %a2)
+ %and = and i64 %rot, %a0
+ call void @use_i64(i64 %rot)
+ ret i64 %and
+}
+
+;
+; Fold (and X, (rotr (not Y), Z)) -> (and X, (not (rotr Y, Z)))
+;
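+
+; Illustration (hand-written sketch, not autogenerated): rotr is the
+; @llvm.fshr analogue of the rotl case above, and the same NOT-commuting
+; rewrite applies with the rotate direction flipped (%x, %y, %z are
+; placeholders):
+;   %rot2 = call i32 @llvm.fshr.i32(i32 %y, i32 %y, i32 %z)
+;   %not2 = xor i32 %rot2, -1
+;   %and2 = and i32 %not2, %x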
+
+define i64 @andnot_rotr_i64(i64 %a0, i64 %a1, i64 %a2) nounwind {
+; X86-LABEL: andnot_rotr_i64:
+; X86: # %bb.0:
+; X86-NEXT: pushl %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: notl %esi
+; X86-NEXT: notl %edx
+; X86-NEXT: testb $32, %cl
+; X86-NEXT: je .LBB5_1
+; X86-NEXT: # %bb.2:
+; X86-NEXT: movl %edx, %eax
+; X86-NEXT: jmp .LBB5_3
+; X86-NEXT: .LBB5_1:
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: .LBB5_3:
+; X86-NEXT: movl %esi, %edx
+; X86-NEXT: shrdl %cl, %eax, %edx
+; X86-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NEXT: shrdl %cl, %esi, %eax
+; X86-NEXT: andl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: andl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: popl %esi
+; X86-NEXT: retl
+;
+; X64-LABEL: andnot_rotr_i64:
+; X64: # %bb.0:
+; X64-NEXT: movq %rdx, %rcx
+; X64-NEXT: movq %rsi, %rax
+; X64-NEXT: notq %rax
+; X64-NEXT: # kill: def $cl killed $cl killed $rcx
+; X64-NEXT: rorq %cl, %rax
+; X64-NEXT: andq %rdi, %rax
+; X64-NEXT: retq
+ %not = xor i64 %a1, -1
+ %rot = tail call i64 @llvm.fshr.i64(i64 %not, i64 %not, i64 %a2)
+ %and = and i64 %rot, %a0
+ ret i64 %and
+}
+
+define i32 @andnot_rotr_i32(i32 %a0, i32 %a1, i32 %a2) nounwind {
+; X86-LABEL: andnot_rotr_i32:
+; X86: # %bb.0:
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: notl %eax
+; X86-NEXT: rorl %cl, %eax
+; X86-NEXT: andl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: retl
+;
+; X64-LABEL: andnot_rotr_i32:
+; X64: # %bb.0:
+; X64-NEXT: movl %edx, %ecx
+; X64-NEXT: movl %esi, %eax
+; X64-NEXT: notl %eax
+; X64-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-NEXT: rorl %cl, %eax
+; X64-NEXT: andl %edi, %eax
+; X64-NEXT: retq
+ %not = xor i32 %a1, -1
+ %rot = tail call i32 @llvm.fshr.i32(i32 %not, i32 %not, i32 %a2)
+ %and = and i32 %rot, %a0
+ ret i32 %and
+}
+
+define i16 @andnot_rotr_i16(i16 %a0, i16 %a1, i16 %a2) nounwind {
+; X86-LABEL: andnot_rotr_i16:
+; X86: # %bb.0:
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: notl %eax
+; X86-NEXT: rorw %cl, %ax
+; X86-NEXT: andw {{[0-9]+}}(%esp), %ax
+; X86-NEXT: # kill: def $ax killed $ax killed $eax
+; X86-NEXT: retl
+;
+; X64-LABEL: andnot_rotr_i16:
+; X64: # %bb.0:
+; X64-NEXT: movl %edx, %ecx
+; X64-NEXT: movl %esi, %eax
+; X64-NEXT: notl %eax
+; X64-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-NEXT: rorw %cl, %ax
+; X64-NEXT: andl %edi, %eax
+; X64-NEXT: # kill: def $ax killed $ax killed $eax
+; X64-NEXT: retq
+ %not = xor i16 %a1, -1
+ %rot = tail call i16 @llvm.fshr.i16(i16 %not, i16 %not, i16 %a2)
+ %and = and i16 %rot, %a0
+ ret i16 %and
+}
+
+define i8 @andnot_rotr_i8(i8 %a0, i8 %a1, i8 %a2) nounwind {
+; X86-LABEL: andnot_rotr_i8:
+; X86: # %bb.0:
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: notb %al
+; X86-NEXT: rorb %cl, %al
+; X86-NEXT: andb {{[0-9]+}}(%esp), %al
+; X86-NEXT: retl
+;
+; X64-LABEL: andnot_rotr_i8:
+; X64: # %bb.0:
+; X64-NEXT: movl %edx, %ecx
+; X64-NEXT: movl %esi, %eax
+; X64-NEXT: notb %al
+; X64-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-NEXT: rorb %cl, %al
+; X64-NEXT: andb %dil, %al
+; X64-NEXT: # kill: def $al killed $al killed $eax
+; X64-NEXT: retq
+ %not = xor i8 %a1, -1
+ %rot = tail call i8 @llvm.fshr.i8(i8 %not, i8 %not, i8 %a2)
+ %and = and i8 %rot, %a0
+ ret i8 %and
+}
+
+;
+; Fold (and X, (bswap (not Y))) -> (and X, (not (bswap Y)))
+;
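+
+; Illustration (hand-written sketch, not autogenerated): bswap only permutes
+; whole bytes, so it commutes with NOT: bswap(not %y) == not (bswap %y). The
+; same argument covers the bitreverse tests further below. Hypothetical
+; rewritten form (%x and %y are placeholders):
+;   %swap2 = call i64 @llvm.bswap.i64(i64 %y)
+;   %not2 = xor i64 %swap2, -1
+;   %and2 = and i64 %not2, %x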
+
+define i64 @andnot_bswap_i64(i64 %a0, i64 %a1) nounwind {
+; X86-LABEL: andnot_bswap_i64:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: notl %eax
+; X86-NEXT: notl %edx
+; X86-NEXT: bswapl %edx
+; X86-NEXT: bswapl %eax
+; X86-NEXT: andl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: andl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: retl
+;
+; X64-LABEL: andnot_bswap_i64:
+; X64: # %bb.0:
+; X64-NEXT: movq %rsi, %rax
+; X64-NEXT: notq %rax
+; X64-NEXT: bswapq %rax
+; X64-NEXT: andq %rdi, %rax
+; X64-NEXT: retq
+ %not = xor i64 %a1, -1
+ %bswap = tail call i64 @llvm.bswap.i64(i64 %not)
+ %and = and i64 %bswap, %a0
+ ret i64 %and
+}
+
+define i32 @andnot_bswap_i32(i32 %a0, i32 %a1) nounwind {
+; X86-LABEL: andnot_bswap_i32:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: notl %eax
+; X86-NEXT: bswapl %eax
+; X86-NEXT: andl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: retl
+;
+; X64-LABEL: andnot_bswap_i32:
+; X64: # %bb.0:
+; X64-NEXT: movl %esi, %eax
+; X64-NEXT: notl %eax
+; X64-NEXT: bswapl %eax
+; X64-NEXT: andl %edi, %eax
+; X64-NEXT: retq
+ %not = xor i32 %a1, -1
+ %bswap = tail call i32 @llvm.bswap.i32(i32 %not)
+ %and = and i32 %bswap, %a0
+ ret i32 %and
+}
+
+define i16 @andnot_bswap_i16(i16 %a0, i16 %a1) nounwind {
+; X86-LABEL: andnot_bswap_i16:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: notl %eax
+; X86-NEXT: rolw $8, %ax
+; X86-NEXT: andw {{[0-9]+}}(%esp), %ax
+; X86-NEXT: # kill: def $ax killed $ax killed $eax
+; X86-NEXT: retl
+;
+; X64-LABEL: andnot_bswap_i16:
+; X64: # %bb.0:
+; X64-NEXT: movl %esi, %eax
+; X64-NEXT: notl %eax
+; X64-NEXT: rolw $8, %ax
+; X64-NEXT: andl %edi, %eax
+; X64-NEXT: # kill: def $ax killed $ax killed $eax
+; X64-NEXT: retq
+ %not = xor i16 %a1, -1
+ %bswap = tail call i16 @llvm.bswap.i16(i16 %not)
+ %and = and i16 %bswap, %a0
+ ret i16 %and
+}
+
+;
+; Fold (and X, (bitreverse (not Y))) -> (and X, (not (bitreverse Y)))
+;
+
+define i64 @andnot_bitreverse_i64(i64 %a0, i64 %a1) nounwind {
+; X86-LABEL: andnot_bitreverse_i64:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: notl %eax
+; X86-NEXT: notl %ecx
+; X86-NEXT: bswapl %ecx
+; X86-NEXT: movl %ecx, %edx
+; X86-NEXT: andl $252645135, %edx # imm = 0xF0F0F0F
+; X86-NEXT: shll $4, %edx
+; X86-NEXT: shrl $4, %ecx
+; X86-NEXT: andl $252645135, %ecx # imm = 0xF0F0F0F
+; X86-NEXT: orl %edx, %ecx
+; X86-NEXT: movl %ecx, %edx
+; X86-NEXT: andl $858993459, %edx # imm = 0x33333333
+; X86-NEXT: shrl $2, %ecx
+; X86-NEXT: andl $858993459, %ecx # imm = 0x33333333
+; X86-NEXT: leal (%ecx,%edx,4), %ecx
+; X86-NEXT: movl %ecx, %edx
+; X86-NEXT: andl $1431655765, %edx # imm = 0x55555555
+; X86-NEXT: shrl %ecx
+; X86-NEXT: andl $1431655765, %ecx # imm = 0x55555555
+; X86-NEXT: leal (%ecx,%edx,2), %edx
+; X86-NEXT: bswapl %eax
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: andl $252645135, %ecx # imm = 0xF0F0F0F
+; X86-NEXT: shll $4, %ecx
+; X86-NEXT: shrl $4, %eax
+; X86-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F
+; X86-NEXT: orl %ecx, %eax
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: andl $858993459, %ecx # imm = 0x33333333
+; X86-NEXT: shrl $2, %eax
+; X86-NEXT: andl $858993459, %eax # imm = 0x33333333
+; X86-NEXT: leal (%eax,%ecx,4), %eax
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: andl $1431655765, %ecx # imm = 0x55555555
+; X86-NEXT: shrl %eax
+; X86-NEXT: andl $1431655765, %eax # imm = 0x55555555
+; X86-NEXT: leal (%eax,%ecx,2), %eax
+; X86-NEXT: andl {{[0-9]+}}(%esp), %eax
+;
X86-NEXT: andl {{[0-9]+}}(%esp), %edx +; X86-NEXT: retl +; +; X64-LABEL: andnot_bitreverse_i64: +; X64: # %bb.0: +; X64-NEXT: notq %rsi +; X64-NEXT: bswapq %rsi +; X64-NEXT: movq %rsi, %rax +; X64-NEXT: shrq $4, %rax +; X64-NEXT: movabsq $1085102592571150095, %rcx # imm = 0xF0F0F0F0F0F0F0F +; X64-NEXT: andq %rcx, %rax +; X64-NEXT: andq %rcx, %rsi +; X64-NEXT: shlq $4, %rsi +; X64-NEXT: orq %rax, %rsi +; X64-NEXT: movabsq $3689348814741910323, %rax # imm = 0x3333333333333333 +; X64-NEXT: movq %rsi, %rcx +; X64-NEXT: andq %rax, %rcx +; X64-NEXT: shrq $2, %rsi +; X64-NEXT: andq %rax, %rsi +; X64-NEXT: leaq (%rsi,%rcx,4), %rax +; X64-NEXT: movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555 +; X64-NEXT: movq %rax, %rdx +; X64-NEXT: andq %rcx, %rdx +; X64-NEXT: shrq %rax +; X64-NEXT: andq %rcx, %rax +; X64-NEXT: leaq (%rax,%rdx,2), %rax +; X64-NEXT: andq %rdi, %rax +; X64-NEXT: retq + %not = xor i64 %a1, -1 + %bitrev = tail call i64 @llvm.bitreverse.i64(i64 %not) + %and = and i64 %bitrev, %a0 + ret i64 %and +} + +define i32 @andnot_bitreverse_i32(i32 %a0, i32 %a1) nounwind { +; X86-LABEL: andnot_bitreverse_i32: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: notl %eax +; X86-NEXT: bswapl %eax +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: andl $252645135, %ecx # imm = 0xF0F0F0F +; X86-NEXT: shll $4, %ecx +; X86-NEXT: shrl $4, %eax +; X86-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F +; X86-NEXT: orl %ecx, %eax +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: andl $858993459, %ecx # imm = 0x33333333 +; X86-NEXT: shrl $2, %eax +; X86-NEXT: andl $858993459, %eax # imm = 0x33333333 +; X86-NEXT: leal (%eax,%ecx,4), %eax +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: andl $1431655765, %ecx # imm = 0x55555555 +; X86-NEXT: shrl %eax +; X86-NEXT: andl $1431655765, %eax # imm = 0x55555555 +; X86-NEXT: leal (%eax,%ecx,2), %eax +; X86-NEXT: andl {{[0-9]+}}(%esp), %eax +; X86-NEXT: retl +; +; X64-LABEL: andnot_bitreverse_i32: +; X64: # %bb.0: +; X64-NEXT: # kill: def $esi killed $esi def $rsi +; X64-NEXT: notl %esi +; X64-NEXT: bswapl %esi +; X64-NEXT: movl %esi, %eax +; X64-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F +; X64-NEXT: shll $4, %eax +; X64-NEXT: shrl $4, %esi +; X64-NEXT: andl $252645135, %esi # imm = 0xF0F0F0F +; X64-NEXT: orl %eax, %esi +; X64-NEXT: movl %esi, %eax +; X64-NEXT: andl $858993459, %eax # imm = 0x33333333 +; X64-NEXT: shrl $2, %esi +; X64-NEXT: andl $858993459, %esi # imm = 0x33333333 +; X64-NEXT: leal (%rsi,%rax,4), %eax +; X64-NEXT: movl %eax, %ecx +; X64-NEXT: andl $1431655765, %ecx # imm = 0x55555555 +; X64-NEXT: shrl %eax +; X64-NEXT: andl $1431655765, %eax # imm = 0x55555555 +; X64-NEXT: leal (%rax,%rcx,2), %eax +; X64-NEXT: andl %edi, %eax +; X64-NEXT: retq + %not = xor i32 %a1, -1 + %bitrev = tail call i32 @llvm.bitreverse.i32(i32 %not) + %and = and i32 %bitrev, %a0 + ret i32 %and +} + +define i16 @andnot_bitreverse_i16(i16 %a0, i16 %a1) nounwind { +; X86-LABEL: andnot_bitreverse_i16: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: notl %eax +; X86-NEXT: rolw $8, %ax +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: andl $3855, %ecx # imm = 0xF0F +; X86-NEXT: shll $4, %ecx +; X86-NEXT: shrl $4, %eax +; X86-NEXT: andl $3855, %eax # imm = 0xF0F +; X86-NEXT: orl %ecx, %eax +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: andl $13107, %ecx # imm = 0x3333 +; X86-NEXT: shrl $2, %eax +; X86-NEXT: andl $13107, %eax # imm = 0x3333 +; X86-NEXT: leal (%eax,%ecx,4), %eax +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: andl $21845, %ecx # imm = 0x5555 +; X86-NEXT: 
shrl %eax +; X86-NEXT: andl $21845, %eax # imm = 0x5555 +; X86-NEXT: leal (%eax,%ecx,2), %eax +; X86-NEXT: andw {{[0-9]+}}(%esp), %ax +; X86-NEXT: # kill: def $ax killed $ax killed $eax +; X86-NEXT: retl +; +; X64-LABEL: andnot_bitreverse_i16: +; X64: # %bb.0: +; X64-NEXT: # kill: def $esi killed $esi def $rsi +; X64-NEXT: notl %esi +; X64-NEXT: rolw $8, %si +; X64-NEXT: movl %esi, %eax +; X64-NEXT: andl $3855, %eax # imm = 0xF0F +; X64-NEXT: shll $4, %eax +; X64-NEXT: shrl $4, %esi +; X64-NEXT: andl $3855, %esi # imm = 0xF0F +; X64-NEXT: orl %eax, %esi +; X64-NEXT: movl %esi, %eax +; X64-NEXT: andl $13107, %eax # imm = 0x3333 +; X64-NEXT: shrl $2, %esi +; X64-NEXT: andl $13107, %esi # imm = 0x3333 +; X64-NEXT: leal (%rsi,%rax,4), %eax +; X64-NEXT: movl %eax, %ecx +; X64-NEXT: andl $21845, %ecx # imm = 0x5555 +; X64-NEXT: shrl %eax +; X64-NEXT: andl $21845, %eax # imm = 0x5555 +; X64-NEXT: leal (%rax,%rcx,2), %eax +; X64-NEXT: andl %edi, %eax +; X64-NEXT: # kill: def $ax killed $ax killed $eax +; X64-NEXT: retq + %not = xor i16 %a1, -1 + %bitrev = tail call i16 @llvm.bitreverse.i16(i16 %not) + %and = and i16 %bitrev, %a0 + ret i16 %and +} + +define i8 @andnot_bitreverse_i8(i8 %a0, i8 %a1) nounwind { +; X86-LABEL: andnot_bitreverse_i8: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-NEXT: notb %al +; X86-NEXT: rolb $4, %al +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: andb $51, %cl +; X86-NEXT: shlb $2, %cl +; X86-NEXT: shrb $2, %al +; X86-NEXT: andb $51, %al +; X86-NEXT: orb %cl, %al +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: andb $85, %cl +; X86-NEXT: addb %cl, %cl +; X86-NEXT: shrb %al +; X86-NEXT: andb $85, %al +; X86-NEXT: orb %cl, %al +; X86-NEXT: andb {{[0-9]+}}(%esp), %al +; X86-NEXT: retl +; +; X64-LABEL: andnot_bitreverse_i8: +; X64: # %bb.0: +; X64-NEXT: notb %sil +; X64-NEXT: rolb $4, %sil +; X64-NEXT: movl %esi, %eax +; X64-NEXT: andb $51, %al +; X64-NEXT: shlb $2, %al +; X64-NEXT: shrb $2, %sil +; X64-NEXT: andb $51, %sil +; X64-NEXT: orb %sil, %al +; X64-NEXT: movl %eax, %ecx +; X64-NEXT: andb $85, %cl +; X64-NEXT: addb %cl, %cl +; X64-NEXT: shrb %al +; X64-NEXT: andb $85, %al +; X64-NEXT: orb %cl, %al +; X64-NEXT: andb %dil, %al +; X64-NEXT: retq + %not = xor i8 %a1, -1 + %bitrev = tail call i8 @llvm.bitreverse.i8(i8 %not) + %and = and i8 %bitrev, %a0 + ret i8 %and +} diff --git a/llvm/test/CodeGen/X86/combine-or-shuffle.ll b/llvm/test/CodeGen/X86/combine-or-shuffle.ll index 175d21a4f7067c8615d1dd171ea5d656be229976..55b1cdeddb853e62d61eb618961869327bb20b58 100644 --- a/llvm/test/CodeGen/X86/combine-or-shuffle.ll +++ b/llvm/test/CodeGen/X86/combine-or-shuffle.ll @@ -808,7 +808,7 @@ define <2 x i64> @or_and_v2i64(<2 x i64> %a0) { ; AVX512-LABEL: or_and_v2i64: ; AVX512: # %bb.0: ; AVX512-NEXT: vpbroadcastq {{.*#+}} xmm1 = [7,7] -; AVX512-NEXT: vpternlogq $200, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm1, %xmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = xmm1 & (xmm0 | mem) ; AVX512-NEXT: retq %1 = and <2 x i64> %a0, %2 = or <2 x i64> %1, @@ -837,7 +837,7 @@ define <4 x i32> @or_and_v4i32(<4 x i32> %a0) { ; AVX512-LABEL: or_and_v4i32: ; AVX512: # %bb.0: ; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm1 = [3,3,15,7] -; AVX512-NEXT: vpternlogd $200, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0 +; AVX512-NEXT: vpternlogd {{.*#+}} xmm0 = xmm1 & (xmm0 | mem) ; AVX512-NEXT: retq %1 = and <4 x i32> %a0, %2 = or <4 x i32> %1, diff --git a/llvm/test/CodeGen/X86/combine-sdiv.ll b/llvm/test/CodeGen/X86/combine-sdiv.ll index 
8e424664363bbfdf9007c05a45c1d06d1b2d7e5f..2b392e69297f07ca522240db26d4222121039b3c 100644 --- a/llvm/test/CodeGen/X86/combine-sdiv.ll +++ b/llvm/test/CodeGen/X86/combine-sdiv.ll @@ -974,7 +974,7 @@ define <32 x i16> @combine_vec_sdiv_by_pow2b_v32i16(<32 x i16> %x) { ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 ; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [0,65535,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,65535] ; AVX512F-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpternlogq $216, %zmm2, %zmm1, %zmm0 +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm2 & (zmm0 ^ zmm1)) ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: combine_vec_sdiv_by_pow2b_v32i16: diff --git a/llvm/test/CodeGen/X86/fp-intrinsics.ll b/llvm/test/CodeGen/X86/fp-intrinsics.ll index bb87252e0b9b085c5d059211769ece0058d08be2..3577f252f50dac6a35b753c8b590682497ab62a8 100644 --- a/llvm/test/CodeGen/X86/fp-intrinsics.ll +++ b/llvm/test/CodeGen/X86/fp-intrinsics.ll @@ -2962,6 +2962,64 @@ entry: ret double %result } +; Verify that atan2(42.1, 3.0) isn't simplified when the rounding mode is unknown. +define double @fatan2() #0 { +; X87-LABEL: fatan2: +; X87: # %bb.0: # %entry +; X87-NEXT: subl $28, %esp +; X87-NEXT: .cfi_def_cfa_offset 32 +; X87-NEXT: flds {{\.?LCPI[0-9]+_[0-9]+}} +; X87-NEXT: fstpl {{[0-9]+}}(%esp) +; X87-NEXT: fldl {{\.?LCPI[0-9]+_[0-9]+}} +; X87-NEXT: fstpl (%esp) +; X87-NEXT: wait +; X87-NEXT: calll atan2 +; X87-NEXT: addl $28, %esp +; X87-NEXT: .cfi_def_cfa_offset 4 +; X87-NEXT: retl +; +; X86-SSE-LABEL: fatan2: +; X86-SSE: # %bb.0: # %entry +; X86-SSE-NEXT: subl $28, %esp +; X86-SSE-NEXT: .cfi_def_cfa_offset 32 +; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = [3.0E+0,0.0E+0] +; X86-SSE-NEXT: movsd %xmm0, {{[0-9]+}}(%esp) +; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = [4.2100000000000001E+1,0.0E+0] +; X86-SSE-NEXT: movsd %xmm0, (%esp) +; X86-SSE-NEXT: calll atan2 +; X86-SSE-NEXT: addl $28, %esp +; X86-SSE-NEXT: .cfi_def_cfa_offset 4 +; X86-SSE-NEXT: retl +; +; SSE-LABEL: fatan2: +; SSE: # %bb.0: # %entry +; SSE-NEXT: pushq %rax +; SSE-NEXT: .cfi_def_cfa_offset 16 +; SSE-NEXT: movsd {{.*#+}} xmm0 = [4.2100000000000001E+1,0.0E+0] +; SSE-NEXT: movsd {{.*#+}} xmm1 = [3.0E+0,0.0E+0] +; SSE-NEXT: callq atan2@PLT +; SSE-NEXT: popq %rax +; SSE-NEXT: .cfi_def_cfa_offset 8 +; SSE-NEXT: retq +; +; AVX-LABEL: fatan2: +; AVX: # %bb.0: # %entry +; AVX-NEXT: pushq %rax +; AVX-NEXT: .cfi_def_cfa_offset 16 +; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.2100000000000001E+1,0.0E+0] +; AVX-NEXT: vmovsd {{.*#+}} xmm1 = [3.0E+0,0.0E+0] +; AVX-NEXT: callq atan2@PLT +; AVX-NEXT: popq %rax +; AVX-NEXT: .cfi_def_cfa_offset 8 +; AVX-NEXT: retq +entry: + %result = call double @llvm.experimental.constrained.atan2.f64(double 42.1, + double 3.0, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret double %result +} + ; Verify that cosh(42.0) isn't simplified when the rounding mode is unknown. 
define double @fcosh() #0 { ; X87-LABEL: fcosh: @@ -3132,6 +3190,7 @@ declare double @llvm.experimental.constrained.tan.f64(double, metadata, metadata declare double @llvm.experimental.constrained.asin.f64(double, metadata, metadata) declare double @llvm.experimental.constrained.acos.f64(double, metadata, metadata) declare double @llvm.experimental.constrained.atan.f64(double, metadata, metadata) +declare double @llvm.experimental.constrained.atan2.f64(double, double, metadata, metadata) declare double @llvm.experimental.constrained.sinh.f64(double, metadata, metadata) declare double @llvm.experimental.constrained.cosh.f64(double, metadata, metadata) declare double @llvm.experimental.constrained.tanh.f64(double, metadata, metadata) diff --git a/llvm/test/CodeGen/X86/fp-strict-libcalls-msvc32.ll b/llvm/test/CodeGen/X86/fp-strict-libcalls-msvc32.ll index 5d4e86afc8aceba1cd03fbe084e0d2d436fbf4a4..74291fbb75e81af4be29473c77ff7ab59ef40f15 100644 --- a/llvm/test/CodeGen/X86/fp-strict-libcalls-msvc32.ll +++ b/llvm/test/CodeGen/X86/fp-strict-libcalls-msvc32.ll @@ -228,6 +228,26 @@ define float @atan(float %x) #0 { ret float %result } +define float @atan2(float %x, float %y) #0 { +; CHECK-LABEL: atan2: +; CHECK: # %bb.0: +; CHECK-NEXT: subl $20, %esp +; CHECK-NEXT: flds {{[0-9]+}}(%esp) +; CHECK-NEXT: flds {{[0-9]+}}(%esp) +; CHECK-NEXT: fxch %st(1) +; CHECK-NEXT: fstpl {{[0-9]+}}(%esp) +; CHECK-NEXT: fstpl (%esp) +; CHECK-NEXT: wait +; CHECK-NEXT: calll _atan2 +; CHECK-NEXT: fstps {{[0-9]+}}(%esp) +; CHECK-NEXT: flds {{[0-9]+}}(%esp) +; CHECK-NEXT: wait +; CHECK-NEXT: addl $20, %esp +; CHECK-NEXT: retl + %result = call float @llvm.experimental.constrained.atan2.f32(float %x, float %y, metadata !"round.dynamic", metadata !"fpexcept.strict") #0 + ret float %result +} + define float @cosh(float %x) #0 { ; CHECK-LABEL: cosh: ; CHECK: # %bb.0: @@ -263,6 +283,7 @@ define float @sinh(float %x) #0 { } define float @tanh(float %x) #0 { +; CHECK-LABEL: tanh: ; CHECK: # %bb.0: ; CHECK-NEXT: subl $12, %esp ; CHECK-NEXT: flds {{[0-9]+}}(%esp) @@ -293,6 +314,7 @@ declare float @llvm.experimental.constrained.tan.f32(float, metadata, metadata) declare float @llvm.experimental.constrained.acos.f32(float, metadata, metadata) declare float @llvm.experimental.constrained.asin.f32(float, metadata, metadata) declare float @llvm.experimental.constrained.atan.f32(float, metadata, metadata) +declare float @llvm.experimental.constrained.atan2.f32(float, float, metadata, metadata) declare float @llvm.experimental.constrained.cosh.f32(float, metadata, metadata) declare float @llvm.experimental.constrained.sinh.f32(float, metadata, metadata) declare float @llvm.experimental.constrained.tanh.f32(float, metadata, metadata) diff --git a/llvm/test/CodeGen/X86/fp128-libcalls-strict.ll b/llvm/test/CodeGen/X86/fp128-libcalls-strict.ll index 84574e3691848b5376153096f382cb405fc4d888..ffaa9f6297ed8c49a551b2768280fe66d0c9d0f9 100644 --- a/llvm/test/CodeGen/X86/fp128-libcalls-strict.ll +++ b/llvm/test/CodeGen/X86/fp128-libcalls-strict.ll @@ -1247,6 +1247,50 @@ entry: ret fp128 %atan } +define fp128 @atan2(fp128 %x, fp128 %y) nounwind strictfp { +; ANDROID-LABEL: atan2: +; ANDROID: # %bb.0: # %entry +; ANDROID-NEXT: pushq %rax +; ANDROID-NEXT: callq atan2l@PLT +; ANDROID-NEXT: popq %rax +; ANDROID-NEXT: retq +; +; GNU-LABEL: atan2: +; GNU: # %bb.0: # %entry +; GNU-NEXT: pushq %rax +; GNU-NEXT: callq atan2f128@PLT +; GNU-NEXT: popq %rax +; GNU-NEXT: retq +; +; X86-LABEL: atan2: +; X86: # %bb.0: # %entry +; X86-NEXT: pushl %esi +; X86-NEXT: 
subl $24, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: subl $12, %esp +; X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl %eax +; X86-NEXT: calll atan2l +; X86-NEXT: addl $44, %esp +; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: movaps %xmm0, (%esi) +; X86-NEXT: movl %esi, %eax +; X86-NEXT: addl $24, %esp +; X86-NEXT: popl %esi +; X86-NEXT: retl $4 +entry: + %atan2 = call fp128 @llvm.experimental.constrained.atan2.f128(fp128 %x, fp128 %y, metadata !"round.dynamic", metadata !"fpexcept.strict") #0 + ret fp128 %atan2 +} + define fp128 @tan(fp128 %x) nounwind strictfp { ; ANDROID-LABEL: tan: ; ANDROID: # %bb.0: # %entry @@ -1948,6 +1992,7 @@ declare fp128 @llvm.experimental.constrained.sin.f128(fp128, metadata, metadata) declare fp128 @llvm.experimental.constrained.sinh.f128(fp128, metadata, metadata) declare fp128 @llvm.experimental.constrained.sqrt.f128(fp128, metadata, metadata) declare fp128 @llvm.experimental.constrained.atan.f128(fp128, metadata, metadata) +declare fp128 @llvm.experimental.constrained.atan2.f128(fp128, fp128, metadata, metadata) declare fp128 @llvm.experimental.constrained.tan.f128(fp128, metadata, metadata) declare fp128 @llvm.experimental.constrained.tanh.f128(fp128, metadata, metadata) declare fp128 @llvm.experimental.constrained.trunc.f128(fp128, metadata) diff --git a/llvm/test/CodeGen/X86/fp80-strict-libcalls.ll b/llvm/test/CodeGen/X86/fp80-strict-libcalls.ll index 293133b08e761a1eb384253a2c5d03f23a5c00e7..8bbc6247dbafd6145d5ae7b3b9c6c8696645cb20 100644 --- a/llvm/test/CodeGen/X86/fp80-strict-libcalls.ll +++ b/llvm/test/CodeGen/X86/fp80-strict-libcalls.ll @@ -629,6 +629,35 @@ entry: ret x86_fp80 %atan } +define x86_fp80 @atan2(x86_fp80 %x, x86_fp80 %y) nounwind strictfp { +; X86-LABEL: atan2: +; X86: # %bb.0: # %entry +; X86-NEXT: subl $24, %esp +; X86-NEXT: fldt {{[0-9]+}}(%esp) +; X86-NEXT: fldt {{[0-9]+}}(%esp) +; X86-NEXT: fstpt {{[0-9]+}}(%esp) +; X86-NEXT: fstpt (%esp) +; X86-NEXT: wait +; X86-NEXT: calll atan2l +; X86-NEXT: addl $24, %esp +; X86-NEXT: retl +; +; X64-LABEL: atan2: +; X64: # %bb.0: # %entry +; X64-NEXT: subq $40, %rsp +; X64-NEXT: fldt {{[0-9]+}}(%rsp) +; X64-NEXT: fldt {{[0-9]+}}(%rsp) +; X64-NEXT: fstpt {{[0-9]+}}(%rsp) +; X64-NEXT: fstpt (%rsp) +; X64-NEXT: wait +; X64-NEXT: callq atan2l@PLT +; X64-NEXT: addq $40, %rsp +; X64-NEXT: retq +entry: + %atan2 = call x86_fp80 @llvm.experimental.constrained.atan2.f80(x86_fp80 %x, x86_fp80 %y, metadata !"round.dynamic", metadata !"fpexcept.strict") #0 + ret x86_fp80 %atan2 +} + define x86_fp80 @tan(x86_fp80 %x) nounwind strictfp { ; X86-LABEL: tan: ; X86: # %bb.0: # %entry @@ -830,6 +859,7 @@ declare x86_fp80 @llvm.experimental.constrained.asin.f80(x86_fp80, metadata, met declare x86_fp80 @llvm.experimental.constrained.sin.f80(x86_fp80, metadata, metadata) declare x86_fp80 @llvm.experimental.constrained.sinh.f80(x86_fp80, metadata, metadata) declare x86_fp80 @llvm.experimental.constrained.atan.f80(x86_fp80, metadata, metadata) +declare x86_fp80 @llvm.experimental.constrained.atan2.f80(x86_fp80, x86_fp80, metadata, metadata) declare x86_fp80 @llvm.experimental.constrained.tan.f80(x86_fp80, metadata, metadata) declare x86_fp80 @llvm.experimental.constrained.tanh.f80(x86_fp80, metadata, metadata) 
declare x86_fp80 @llvm.experimental.constrained.trunc.f80(x86_fp80, metadata) diff --git a/llvm/test/CodeGen/X86/llvm.atan2.ll b/llvm/test/CodeGen/X86/llvm.atan2.ll new file mode 100644 index 0000000000000000000000000000000000000000..ef2e4be36203bef6ac4f9bdb0bf271452edfcbfa --- /dev/null +++ b/llvm/test/CodeGen/X86/llvm.atan2.ll @@ -0,0 +1,80 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu | FileCheck %s + +define half @use_atan2f16(half %a, half %b) nounwind { +; CHECK-LABEL: use_atan2f16: +; CHECK: # %bb.0: +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movaps %xmm1, %xmm0 +; CHECK-NEXT: callq __extendhfsf2@PLT +; CHECK-NEXT: movss %xmm0, (%rsp) # 4-byte Spill +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: callq __extendhfsf2@PLT +; CHECK-NEXT: movss (%rsp), %xmm1 # 4-byte Reload +; CHECK-NEXT: # xmm1 = mem[0],zero,zero,zero +; CHECK-NEXT: callq atan2f@PLT +; CHECK-NEXT: callq __truncsfhf2@PLT +; CHECK-NEXT: popq %rax +; CHECK-NEXT: retq + %x = call half @llvm.atan2.f16(half %a, half %b) + ret half %x +} + +define float @use_atan2f32(float %a, float %b) nounwind { +; CHECK-LABEL: use_atan2f32: +; CHECK: # %bb.0: +; CHECK-NEXT: jmp atan2f@PLT # TAILCALL + %x = call float @llvm.atan2.f32(float %a, float %b) + ret float %x +} + +define double @use_atan2f64(double %a, double %b) nounwind { +; CHECK-LABEL: use_atan2f64: +; CHECK: # %bb.0: +; CHECK-NEXT: jmp atan2@PLT # TAILCALL + %x = call double @llvm.atan2.f64(double %a, double %b) + ret double %x +} + +define x86_fp80 @use_atan2f80(x86_fp80 %a, x86_fp80 %b) nounwind { +; CHECK-LABEL: use_atan2f80: +; CHECK: # %bb.0: +; CHECK-NEXT: subq $40, %rsp +; CHECK-NEXT: fldt {{[0-9]+}}(%rsp) +; CHECK-NEXT: fldt {{[0-9]+}}(%rsp) +; CHECK-NEXT: fstpt {{[0-9]+}}(%rsp) +; CHECK-NEXT: fstpt (%rsp) +; CHECK-NEXT: callq atan2l@PLT +; CHECK-NEXT: addq $40, %rsp +; CHECK-NEXT: retq + %x = call x86_fp80 @llvm.atan2.f80(x86_fp80 %a, x86_fp80 %b) + ret x86_fp80 %x +} + +define fp128 @use_atan2fp128(fp128 %a, fp128 %b) nounwind { +; CHECK-LABEL: use_atan2fp128: +; CHECK: # %bb.0: +; CHECK-NEXT: jmp atan2f128@PLT # TAILCALL + %x = call fp128 @llvm.atan2.f128(fp128 %a, fp128 %b) + ret fp128 %x +} + +define ppc_fp128 @use_atan2ppc_fp128(ppc_fp128 %a, ppc_fp128 %b) nounwind { +; CHECK-LABEL: use_atan2ppc_fp128: +; CHECK: # %bb.0: +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: callq atan2l@PLT +; CHECK-NEXT: popq %rax +; CHECK-NEXT: retq + %x = call ppc_fp128 @llvm.atan2.ppcf128(ppc_fp128 %a, ppc_fp128 %b) + ret ppc_fp128 %x +} + +declare half @llvm.atan2.f16(half, half) +declare float @llvm.atan2.f32(float, float) +declare double @llvm.atan2.f64(double, double) +declare x86_fp80 @llvm.atan2.f80(x86_fp80, x86_fp80) +declare fp128 @llvm.atan2.f128(fp128, fp128) +declare ppc_fp128 @llvm.atan2.ppcf128(ppc_fp128, ppc_fp128) diff --git a/llvm/test/CodeGen/X86/masked_store_trunc.ll b/llvm/test/CodeGen/X86/masked_store_trunc.ll index f4a0207dafde7cd3a020ceb37111fcc64e9b9f95..1e56f346030ca18764bcfbd1b28101568468addb 100644 --- a/llvm/test/CodeGen/X86/masked_store_trunc.ll +++ b/llvm/test/CodeGen/X86/masked_store_trunc.ll @@ -6403,7 +6403,7 @@ define void @truncstore_v8i16_v8i8(<8 x i16> %x, ptr %p, <8 x i16> %mask) { ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512F-NEXT: vpcmpeqw %xmm2, %xmm1, %xmm1 -; 
AVX512F-NEXT: vpternlogq $15, %zmm1, %zmm1, %zmm1 +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm1 = ~zmm1 ; AVX512F-NEXT: vpmovsxwq %xmm1, %zmm1 ; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k0 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] diff --git a/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll b/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll index 487f7298f442c2d55a862e68b9f27c2cca264251..da4432bd88e7db1d49f03952d81bc4bedd65e3db 100644 --- a/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll +++ b/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll @@ -7298,7 +7298,7 @@ define void @truncstore_v8i16_v8i8(<8 x i16> %x, ptr %p, <8 x i16> %mask) { ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512F-NEXT: vpcmpeqw %xmm2, %xmm1, %xmm1 -; AVX512F-NEXT: vpternlogq $15, %zmm1, %zmm1, %zmm1 +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm1 = ~zmm1 ; AVX512F-NEXT: vpmovsxwq %xmm1, %zmm1 ; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k0 ; AVX512F-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll b/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll index 498f250f11c690fbc6ca66f2ffbf0649945c8567..1597e13f0271973f1d4d95d4f8c7896576afb0e5 100644 --- a/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll +++ b/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll @@ -7148,7 +7148,7 @@ define void @truncstore_v8i16_v8i8(<8 x i16> %x, ptr %p, <8 x i16> %mask) { ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512F-NEXT: vpcmpeqw %xmm2, %xmm1, %xmm1 -; AVX512F-NEXT: vpternlogq $15, %zmm1, %zmm1, %zmm1 +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm1 = ~zmm1 ; AVX512F-NEXT: vpmovsxwq %xmm1, %zmm1 ; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k0 ; AVX512F-NEXT: vpminuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll b/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll index acaa9be3f89a23f526a19036dc37200eca449d70..6fd3db3464decba2a4a7f8fb561ab1933eb2145b 100644 --- a/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll +++ b/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll @@ -1928,7 +1928,7 @@ define <8 x i16> @vec128_i16_unsigned_reg_reg(<8 x i16> %a1, <8 x i16> %a2) noun ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpminuw %xmm1, %xmm0, %xmm2 ; AVX512F-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm3 -; AVX512F-NEXT: vpternlogq $15, %zmm3, %zmm3, %zmm3 +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm3 = ~zmm3 ; AVX512F-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 ; AVX512F-NEXT: vpmaxuw %xmm1, %xmm0, %xmm1 ; AVX512F-NEXT: vpsubw %xmm2, %xmm1, %xmm1 @@ -1945,7 +1945,7 @@ define <8 x i16> @vec128_i16_unsigned_reg_reg(<8 x i16> %a1, <8 x i16> %a2) noun ; AVX512VL-FALLBACK-NEXT: vpsubw %xmm2, %xmm1, %xmm1 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %xmm1, %xmm1 ; AVX512VL-FALLBACK-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm2 -; AVX512VL-FALLBACK-NEXT: vpternlogq $15, %xmm2, %xmm2, %xmm2 +; AVX512VL-FALLBACK-NEXT: vpternlogq {{.*#+}} xmm2 = ~xmm2 ; AVX512VL-FALLBACK-NEXT: vpxor %xmm2, %xmm1, %xmm1 ; AVX512VL-FALLBACK-NEXT: vpsubw %xmm2, %xmm1, %xmm1 ; AVX512VL-FALLBACK-NEXT: vpaddw %xmm0, %xmm1, %xmm0 @@ -2500,7 +2500,7 @@ define <16 x i8> @vec128_i8_signed_reg_reg(<16 x i8> %a1, <16 x i8> %a2) nounwin ; AVX512VL-FALLBACK-NEXT: vpsubb %xmm2, %xmm3, %xmm2 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %xmm2, %xmm2 ; AVX512VL-FALLBACK-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm1 -; AVX512VL-FALLBACK-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm2 +; AVX512VL-FALLBACK-NEXT: vpternlogd {{.*#+}} xmm2 = xmm1 ^ (xmm2 & mem) ; 
AVX512VL-FALLBACK-NEXT: vpsubb %xmm1, %xmm2, %xmm1 ; AVX512VL-FALLBACK-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX512VL-FALLBACK-NEXT: retq @@ -2706,7 +2706,7 @@ define <16 x i8> @vec128_i8_unsigned_reg_reg(<16 x i8> %a1, <16 x i8> %a2) nounw ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpminub %xmm1, %xmm0, %xmm2 ; AVX512F-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm3 -; AVX512F-NEXT: vpternlogq $15, %zmm3, %zmm3, %zmm3 +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm3 = ~zmm3 ; AVX512F-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 ; AVX512F-NEXT: vpmaxub %xmm1, %xmm0, %xmm1 ; AVX512F-NEXT: vpsubb %xmm2, %xmm1, %xmm1 @@ -2728,8 +2728,8 @@ define <16 x i8> @vec128_i8_unsigned_reg_reg(<16 x i8> %a1, <16 x i8> %a2) nounw ; AVX512VL-FALLBACK-NEXT: vpsubb %xmm2, %xmm1, %xmm1 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %xmm1, %xmm1 ; AVX512VL-FALLBACK-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm2 -; AVX512VL-FALLBACK-NEXT: vpternlogq $15, %xmm2, %xmm2, %xmm2 -; AVX512VL-FALLBACK-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm2, %xmm1 +; AVX512VL-FALLBACK-NEXT: vpternlogq {{.*#+}} xmm2 = ~xmm2 +; AVX512VL-FALLBACK-NEXT: vpternlogd {{.*#+}} xmm1 = xmm2 ^ (xmm1 & mem) ; AVX512VL-FALLBACK-NEXT: vpsubb %xmm2, %xmm1, %xmm1 ; AVX512VL-FALLBACK-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX512VL-FALLBACK-NEXT: retq @@ -2961,7 +2961,7 @@ define <16 x i8> @vec128_i8_signed_mem_reg(ptr %a1_addr, <16 x i8> %a2) nounwind ; AVX512VL-FALLBACK-NEXT: vpsubb %xmm2, %xmm3, %xmm2 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %xmm2, %xmm2 ; AVX512VL-FALLBACK-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 -; AVX512VL-FALLBACK-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm2 +; AVX512VL-FALLBACK-NEXT: vpternlogd {{.*#+}} xmm2 = xmm0 ^ (xmm2 & mem) ; AVX512VL-FALLBACK-NEXT: vpsubb %xmm0, %xmm2, %xmm0 ; AVX512VL-FALLBACK-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ; AVX512VL-FALLBACK-NEXT: retq @@ -3192,7 +3192,7 @@ define <16 x i8> @vec128_i8_signed_reg_mem(<16 x i8> %a1, ptr %a2_addr) nounwind ; AVX512VL-FALLBACK-NEXT: vpsubb %xmm2, %xmm3, %xmm2 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %xmm2, %xmm2 ; AVX512VL-FALLBACK-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm1 -; AVX512VL-FALLBACK-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm2 +; AVX512VL-FALLBACK-NEXT: vpternlogd {{.*#+}} xmm2 = xmm1 ^ (xmm2 & mem) ; AVX512VL-FALLBACK-NEXT: vpsubb %xmm1, %xmm2, %xmm1 ; AVX512VL-FALLBACK-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX512VL-FALLBACK-NEXT: retq @@ -3432,7 +3432,7 @@ define <16 x i8> @vec128_i8_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind ; AVX512VL-FALLBACK-NEXT: vpsubb %xmm2, %xmm3, %xmm2 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %xmm2, %xmm2 ; AVX512VL-FALLBACK-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm1 -; AVX512VL-FALLBACK-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm2 +; AVX512VL-FALLBACK-NEXT: vpternlogd {{.*#+}} xmm2 = xmm1 ^ (xmm2 & mem) ; AVX512VL-FALLBACK-NEXT: vpsubb %xmm1, %xmm2, %xmm1 ; AVX512VL-FALLBACK-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX512VL-FALLBACK-NEXT: retq diff --git a/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll b/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll index 7d882b772a64d19174f184de0a897fb7f61b6ad0..5a1c4c8a52c82934a8f85e0ce6b6e36b29903de1 100644 --- a/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll +++ b/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll @@ -1434,7 +1434,7 @@ define <16 x i16> @vec256_i16_unsigned_reg_reg(<16 x i16> %a1, <16 x i16> %a2) n ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpminuw %ymm1, %ymm0, %ymm2 ; AVX512F-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm3 -; AVX512F-NEXT: vpternlogq $15, %zmm3, 
%zmm3, %zmm3 +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm3 = ~zmm3 ; AVX512F-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 ; AVX512F-NEXT: vpmaxuw %ymm1, %ymm0, %ymm1 ; AVX512F-NEXT: vpsubw %ymm2, %ymm1, %ymm1 @@ -1450,7 +1450,7 @@ define <16 x i16> @vec256_i16_unsigned_reg_reg(<16 x i16> %a1, <16 x i16> %a2) n ; AVX512VL-FALLBACK-NEXT: vpsubw %ymm2, %ymm1, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpternlogq $15, %ymm2, %ymm2, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpternlogq {{.*#+}} ymm2 = ~ymm2 ; AVX512VL-FALLBACK-NEXT: vpxor %ymm2, %ymm1, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpsubw %ymm2, %ymm1, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 @@ -2016,7 +2016,7 @@ define <32 x i8> @vec256_i8_signed_reg_reg(<32 x i8> %a1, <32 x i8> %a2) nounwin ; AVX512VL-FALLBACK-NEXT: vpsubb %ymm2, %ymm3, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpternlogd {{.*#+}} ymm2 = ymm1 ^ (ymm2 & mem) ; AVX512VL-FALLBACK-NEXT: vpsubb %ymm1, %ymm2, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX512VL-FALLBACK-NEXT: retq @@ -2169,7 +2169,7 @@ define <32 x i8> @vec256_i8_unsigned_reg_reg(<32 x i8> %a1, <32 x i8> %a2) nounw ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpminub %ymm1, %ymm0, %ymm2 ; AVX512F-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm3 -; AVX512F-NEXT: vpternlogq $15, %zmm3, %zmm3, %zmm3 +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm3 = ~zmm3 ; AVX512F-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 ; AVX512F-NEXT: vpmaxub %ymm1, %ymm0, %ymm1 ; AVX512F-NEXT: vpsubb %ymm2, %ymm1, %ymm1 @@ -2193,8 +2193,8 @@ define <32 x i8> @vec256_i8_unsigned_reg_reg(<32 x i8> %a1, <32 x i8> %a2) nounw ; AVX512VL-FALLBACK-NEXT: vpsubb %ymm2, %ymm1, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpternlogq $15, %ymm2, %ymm2, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpternlogq {{.*#+}} ymm2 = ~ymm2 +; AVX512VL-FALLBACK-NEXT: vpternlogd {{.*#+}} ymm1 = ymm2 ^ (ymm1 & mem) ; AVX512VL-FALLBACK-NEXT: vpsubb %ymm2, %ymm1, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX512VL-FALLBACK-NEXT: retq @@ -2372,7 +2372,7 @@ define <32 x i8> @vec256_i8_signed_mem_reg(ptr %a1_addr, <32 x i8> %a2) nounwind ; AVX512VL-FALLBACK-NEXT: vpsubb %ymm2, %ymm3, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0 -; AVX512VL-FALLBACK-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpternlogd {{.*#+}} ymm2 = ymm0 ^ (ymm2 & mem) ; AVX512VL-FALLBACK-NEXT: vpsubb %ymm0, %ymm2, %ymm0 ; AVX512VL-FALLBACK-NEXT: vpaddb %ymm1, %ymm0, %ymm0 ; AVX512VL-FALLBACK-NEXT: retq @@ -2550,7 +2550,7 @@ define <32 x i8> @vec256_i8_signed_reg_mem(<32 x i8> %a1, ptr %a2_addr) nounwind ; AVX512VL-FALLBACK-NEXT: vpsubb %ymm2, %ymm3, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpternlogd {{.*#+}} ymm2 = ymm1 ^ (ymm2 & mem) ; AVX512VL-FALLBACK-NEXT: vpsubb %ymm1, %ymm2, %ymm1 ; 
AVX512VL-FALLBACK-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX512VL-FALLBACK-NEXT: retq @@ -2733,7 +2733,7 @@ define <32 x i8> @vec256_i8_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind ; AVX512VL-FALLBACK-NEXT: vpsubb %ymm2, %ymm3, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpternlogd {{.*#+}} ymm2 = ymm1 ^ (ymm2 & mem) ; AVX512VL-FALLBACK-NEXT: vpsubb %ymm1, %ymm2, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX512VL-FALLBACK-NEXT: retq diff --git a/llvm/test/CodeGen/X86/midpoint-int-vec-512.ll b/llvm/test/CodeGen/X86/midpoint-int-vec-512.ll index 366dad1612b437ad96f6e08b012df87d2b6f6b81..5f6337e29d6852b9d51d335468cafc74e21eaaaf 100644 --- a/llvm/test/CodeGen/X86/midpoint-int-vec-512.ll +++ b/llvm/test/CodeGen/X86/midpoint-int-vec-512.ll @@ -288,7 +288,7 @@ define <32 x i16> @vec512_i16_signed_reg_reg(<32 x i16> %a1, <32 x i16> %a2) nou ; AVX512F-NEXT: vpsubw %ymm2, %ymm6, %ymm2 ; AVX512F-NEXT: vpsubw %ymm1, %ymm6, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512F-NEXT: vpternlogq $226, %zmm5, %zmm4, %zmm1 +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm1 = zmm5 ^ (zmm4 & (zmm1 ^ zmm5)) ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 ; AVX512F-NEXT: vpaddw %ymm3, %ymm2, %ymm2 ; AVX512F-NEXT: vpaddw %ymm0, %ymm1, %ymm0 @@ -315,7 +315,7 @@ define <32 x i16> @vec512_i16_signed_reg_reg(<32 x i16> %a1, <32 x i16> %a2) nou ; AVX512VL-FALLBACK-NEXT: vpsubw %ymm2, %ymm6, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpsubw %ymm1, %ymm6, %ymm1 ; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512VL-FALLBACK-NEXT: vpternlogq $226, %zmm5, %zmm4, %zmm1 +; AVX512VL-FALLBACK-NEXT: vpternlogq {{.*#+}} zmm1 = zmm5 ^ (zmm4 & (zmm1 ^ zmm5)) ; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpaddw %ymm3, %ymm2, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 @@ -365,7 +365,7 @@ define <32 x i16> @vec512_i16_unsigned_reg_reg(<32 x i16> %a1, <32 x i16> %a2) n ; AVX512F-NEXT: vpsubw %ymm2, %ymm6, %ymm2 ; AVX512F-NEXT: vpsubw %ymm1, %ymm6, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512F-NEXT: vpternlogq $184, %zmm4, %zmm5, %zmm1 +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm5 & (zmm1 ^ zmm4)) ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 ; AVX512F-NEXT: vpaddw %ymm3, %ymm2, %ymm2 ; AVX512F-NEXT: vpaddw %ymm0, %ymm1, %ymm0 @@ -392,7 +392,7 @@ define <32 x i16> @vec512_i16_unsigned_reg_reg(<32 x i16> %a1, <32 x i16> %a2) n ; AVX512VL-FALLBACK-NEXT: vpsubw %ymm2, %ymm6, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpsubw %ymm1, %ymm6, %ymm1 ; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512VL-FALLBACK-NEXT: vpternlogq $184, %zmm4, %zmm5, %zmm1 +; AVX512VL-FALLBACK-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm5 & (zmm1 ^ zmm4)) ; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpaddw %ymm3, %ymm2, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 @@ -445,7 +445,7 @@ define <32 x i16> @vec512_i16_signed_mem_reg(ptr %a1_addr, <32 x i16> %a2) nounw ; AVX512F-NEXT: vpsubw %ymm1, %ymm6, %ymm1 ; AVX512F-NEXT: vpsubw %ymm0, %ymm6, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpternlogq $226, %zmm5, %zmm4, %zmm0 +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 = zmm5 ^ (zmm4 & (zmm0 ^ zmm5)) ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: 
vpaddw %ymm3, %ymm1, %ymm1 ; AVX512F-NEXT: vpaddw %ymm2, %ymm0, %ymm0 @@ -473,7 +473,7 @@ define <32 x i16> @vec512_i16_signed_mem_reg(ptr %a1_addr, <32 x i16> %a2) nounw ; AVX512VL-FALLBACK-NEXT: vpsubw %ymm1, %ymm6, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpsubw %ymm0, %ymm6, %ymm0 ; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512VL-FALLBACK-NEXT: vpternlogq $226, %zmm5, %zmm4, %zmm0 +; AVX512VL-FALLBACK-NEXT: vpternlogq {{.*#+}} zmm0 = zmm5 ^ (zmm4 & (zmm0 ^ zmm5)) ; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpaddw %ymm3, %ymm1, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpaddw %ymm2, %ymm0, %ymm0 @@ -526,7 +526,7 @@ define <32 x i16> @vec512_i16_signed_reg_mem(<32 x i16> %a1, ptr %a2_addr) nounw ; AVX512F-NEXT: vpsubw %ymm2, %ymm6, %ymm2 ; AVX512F-NEXT: vpsubw %ymm1, %ymm6, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512F-NEXT: vpternlogq $226, %zmm5, %zmm4, %zmm1 +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm1 = zmm5 ^ (zmm4 & (zmm1 ^ zmm5)) ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 ; AVX512F-NEXT: vpaddw %ymm3, %ymm2, %ymm2 ; AVX512F-NEXT: vpaddw %ymm0, %ymm1, %ymm0 @@ -554,7 +554,7 @@ define <32 x i16> @vec512_i16_signed_reg_mem(<32 x i16> %a1, ptr %a2_addr) nounw ; AVX512VL-FALLBACK-NEXT: vpsubw %ymm2, %ymm6, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpsubw %ymm1, %ymm6, %ymm1 ; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512VL-FALLBACK-NEXT: vpternlogq $226, %zmm5, %zmm4, %zmm1 +; AVX512VL-FALLBACK-NEXT: vpternlogq {{.*#+}} zmm1 = zmm5 ^ (zmm4 & (zmm1 ^ zmm5)) ; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpaddw %ymm3, %ymm2, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 @@ -608,7 +608,7 @@ define <32 x i16> @vec512_i16_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwin ; AVX512F-NEXT: vpsubw %ymm1, %ymm6, %ymm1 ; AVX512F-NEXT: vpsubw %ymm0, %ymm6, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpternlogq $226, %zmm5, %zmm4, %zmm0 +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 = zmm5 ^ (zmm4 & (zmm0 ^ zmm5)) ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpaddw %ymm3, %ymm1, %ymm1 ; AVX512F-NEXT: vpaddw %ymm2, %ymm0, %ymm0 @@ -637,7 +637,7 @@ define <32 x i16> @vec512_i16_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwin ; AVX512VL-FALLBACK-NEXT: vpsubw %ymm1, %ymm6, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpsubw %ymm0, %ymm6, %ymm0 ; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512VL-FALLBACK-NEXT: vpternlogq $226, %zmm5, %zmm4, %zmm0 +; AVX512VL-FALLBACK-NEXT: vpternlogq {{.*#+}} zmm0 = zmm5 ^ (zmm4 & (zmm0 ^ zmm5)) ; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpaddw %ymm3, %ymm1, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpaddw %ymm2, %ymm0, %ymm0 @@ -700,7 +700,7 @@ define <64 x i8> @vec512_i8_signed_reg_reg(<64 x i8> %a1, <64 x i8> %a2) nounwin ; AVX512F-NEXT: vpsubb %ymm2, %ymm6, %ymm2 ; AVX512F-NEXT: vpsubb %ymm1, %ymm6, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512F-NEXT: vpternlogq $226, %zmm5, %zmm4, %zmm1 +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm1 = zmm5 ^ (zmm4 & (zmm1 ^ zmm5)) ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 ; AVX512F-NEXT: vpaddb %ymm3, %ymm2, %ymm2 ; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0 @@ -730,7 +730,7 @@ define <64 x i8> @vec512_i8_signed_reg_reg(<64 x i8> %a1, <64 x i8> %a2) nounwin ; AVX512VL-FALLBACK-NEXT: vpsubb %ymm2, %ymm6, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpsubb %ymm1, %ymm6, %ymm1 ; AVX512VL-FALLBACK-NEXT: 
vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512VL-FALLBACK-NEXT: vpternlogq $226, %zmm5, %zmm4, %zmm1 +; AVX512VL-FALLBACK-NEXT: vpternlogq {{.*#+}} zmm1 = zmm5 ^ (zmm4 & (zmm1 ^ zmm5)) ; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpaddb %ymm3, %ymm2, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpaddb %ymm0, %ymm1, %ymm0 @@ -784,7 +784,7 @@ define <64 x i8> @vec512_i8_unsigned_reg_reg(<64 x i8> %a1, <64 x i8> %a2) nounw ; AVX512F-NEXT: vpsubb %ymm2, %ymm6, %ymm2 ; AVX512F-NEXT: vpsubb %ymm1, %ymm6, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512F-NEXT: vpternlogq $184, %zmm4, %zmm5, %zmm1 +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm5 & (zmm1 ^ zmm4)) ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 ; AVX512F-NEXT: vpaddb %ymm3, %ymm2, %ymm2 ; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0 @@ -814,7 +814,7 @@ define <64 x i8> @vec512_i8_unsigned_reg_reg(<64 x i8> %a1, <64 x i8> %a2) nounw ; AVX512VL-FALLBACK-NEXT: vpsubb %ymm2, %ymm6, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpsubb %ymm1, %ymm6, %ymm1 ; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512VL-FALLBACK-NEXT: vpternlogq $184, %zmm4, %zmm5, %zmm1 +; AVX512VL-FALLBACK-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm5 & (zmm1 ^ zmm4)) ; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpaddb %ymm3, %ymm2, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpaddb %ymm0, %ymm1, %ymm0 @@ -872,7 +872,7 @@ define <64 x i8> @vec512_i8_signed_mem_reg(ptr %a1_addr, <64 x i8> %a2) nounwind ; AVX512F-NEXT: vpand %ymm6, %ymm0, %ymm0 ; AVX512F-NEXT: vpsubb %ymm0, %ymm7, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpternlogq $226, %zmm5, %zmm4, %zmm0 +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 = zmm5 ^ (zmm4 & (zmm0 ^ zmm5)) ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpaddb %ymm3, %ymm1, %ymm1 ; AVX512F-NEXT: vpaddb %ymm2, %ymm0, %ymm0 @@ -904,7 +904,7 @@ define <64 x i8> @vec512_i8_signed_mem_reg(ptr %a1_addr, <64 x i8> %a2) nounwind ; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm0, %ymm0 ; AVX512VL-FALLBACK-NEXT: vpsubb %ymm0, %ymm7, %ymm0 ; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512VL-FALLBACK-NEXT: vpternlogq $226, %zmm5, %zmm4, %zmm0 +; AVX512VL-FALLBACK-NEXT: vpternlogq {{.*#+}} zmm0 = zmm5 ^ (zmm4 & (zmm0 ^ zmm5)) ; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpaddb %ymm3, %ymm1, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpaddb %ymm2, %ymm0, %ymm0 @@ -962,7 +962,7 @@ define <64 x i8> @vec512_i8_signed_reg_mem(<64 x i8> %a1, ptr %a2_addr) nounwind ; AVX512F-NEXT: vpand %ymm6, %ymm1, %ymm1 ; AVX512F-NEXT: vpsubb %ymm1, %ymm7, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512F-NEXT: vpternlogq $226, %zmm5, %zmm4, %zmm1 +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm1 = zmm5 ^ (zmm4 & (zmm1 ^ zmm5)) ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 ; AVX512F-NEXT: vpaddb %ymm3, %ymm2, %ymm2 ; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0 @@ -994,7 +994,7 @@ define <64 x i8> @vec512_i8_signed_reg_mem(<64 x i8> %a1, ptr %a2_addr) nounwind ; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm1, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpsubb %ymm1, %ymm7, %ymm1 ; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512VL-FALLBACK-NEXT: vpternlogq $226, %zmm5, %zmm4, %zmm1 +; AVX512VL-FALLBACK-NEXT: vpternlogq {{.*#+}} zmm1 = zmm5 ^ (zmm4 & (zmm1 ^ zmm5)) ; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpaddb %ymm3, %ymm2, %ymm2 ; 
AVX512VL-FALLBACK-NEXT: vpaddb %ymm0, %ymm1, %ymm0 @@ -1053,7 +1053,7 @@ define <64 x i8> @vec512_i8_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind ; AVX512F-NEXT: vpand %ymm6, %ymm0, %ymm0 ; AVX512F-NEXT: vpsubb %ymm0, %ymm7, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpternlogq $226, %zmm5, %zmm4, %zmm0 +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 = zmm5 ^ (zmm4 & (zmm0 ^ zmm5)) ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpaddb %ymm3, %ymm1, %ymm1 ; AVX512F-NEXT: vpaddb %ymm2, %ymm0, %ymm0 @@ -1086,7 +1086,7 @@ define <64 x i8> @vec512_i8_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind ; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm0, %ymm0 ; AVX512VL-FALLBACK-NEXT: vpsubb %ymm0, %ymm7, %ymm0 ; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512VL-FALLBACK-NEXT: vpternlogq $226, %zmm5, %zmm4, %zmm0 +; AVX512VL-FALLBACK-NEXT: vpternlogq {{.*#+}} zmm0 = zmm5 ^ (zmm4 & (zmm0 ^ zmm5)) ; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpaddb %ymm3, %ymm1, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpaddb %ymm2, %ymm0, %ymm0 diff --git a/llvm/test/CodeGen/X86/min-legal-vector-width.ll b/llvm/test/CodeGen/X86/min-legal-vector-width.ll index e3d2ac659d43865411c735499db46bff403d7552..8289e885618f7b07c587c2ef7c03c1e41456f002 100644 --- a/llvm/test/CodeGen/X86/min-legal-vector-width.ll +++ b/llvm/test/CodeGen/X86/min-legal-vector-width.ll @@ -918,13 +918,13 @@ define dso_local void @mul256(ptr %a, ptr %b, ptr %c) "min-legal-vector-width"=" ; CHECK-AVX512-NEXT: vpandn %ymm3, %ymm4, %ymm3 ; CHECK-AVX512-NEXT: vpmaddubsw %ymm3, %ymm1, %ymm1 ; CHECK-AVX512-NEXT: vpsllw $8, %ymm1, %ymm1 -; CHECK-AVX512-NEXT: vpternlogq $248, %ymm4, %ymm5, %ymm1 +; CHECK-AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 | (ymm5 & ymm4) ; CHECK-AVX512-NEXT: vpand %ymm2, %ymm4, %ymm3 ; CHECK-AVX512-NEXT: vpmaddubsw %ymm3, %ymm0, %ymm3 ; CHECK-AVX512-NEXT: vpandn %ymm2, %ymm4, %ymm2 ; CHECK-AVX512-NEXT: vpmaddubsw %ymm2, %ymm0, %ymm0 ; CHECK-AVX512-NEXT: vpsllw $8, %ymm0, %ymm0 -; CHECK-AVX512-NEXT: vpternlogq $248, %ymm4, %ymm3, %ymm0 +; CHECK-AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 | (ymm3 & ymm4) ; CHECK-AVX512-NEXT: vmovdqa %ymm0, (%rdx) ; CHECK-AVX512-NEXT: vmovdqa %ymm1, 32(%rdx) ; CHECK-AVX512-NEXT: vzeroupper @@ -985,7 +985,7 @@ define dso_local void @mul512(ptr %a, ptr %b, ptr %c) "min-legal-vector-width"=" ; CHECK-AVX512-NEXT: vpandnq %zmm1, %zmm2, %zmm1 ; CHECK-AVX512-NEXT: vpmaddubsw %zmm1, %zmm0, %zmm0 ; CHECK-AVX512-NEXT: vpsllw $8, %zmm0, %zmm0 -; CHECK-AVX512-NEXT: vpternlogq $248, %zmm2, %zmm3, %zmm0 +; CHECK-AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 | (zmm3 & zmm2) ; CHECK-AVX512-NEXT: vmovdqa64 %zmm0, (%rdx) ; CHECK-AVX512-NEXT: vzeroupper ; CHECK-AVX512-NEXT: retq @@ -1993,21 +1993,21 @@ define <32 x i8> @splatconstant_rotate_v32i8(<32 x i8> %a) nounwind "min-legal-v ; CHECK-SKX: # %bb.0: ; CHECK-SKX-NEXT: vpsllw $4, %ymm0, %ymm1 ; CHECK-SKX-NEXT: vpsrlw $4, %ymm0, %ymm0 -; CHECK-SKX-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm0 +; CHECK-SKX-NEXT: vpternlogd {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm1)) ; CHECK-SKX-NEXT: retq ; ; CHECK-AVX512-LABEL: splatconstant_rotate_v32i8: ; CHECK-AVX512: # %bb.0: ; CHECK-AVX512-NEXT: vpsllw $4, %ymm0, %ymm1 ; CHECK-AVX512-NEXT: vpsrlw $4, %ymm0, %ymm0 -; CHECK-AVX512-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm0 +; CHECK-AVX512-NEXT: vpternlogd {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm1)) ; CHECK-AVX512-NEXT: retq 
; ; CHECK-VBMI1-LABEL: splatconstant_rotate_v32i8: ; CHECK-VBMI1: # %bb.0: ; CHECK-VBMI1-NEXT: vpsllw $4, %ymm0, %ymm1 ; CHECK-VBMI1-NEXT: vpsrlw $4, %ymm0, %ymm0 -; CHECK-VBMI1-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm0 +; CHECK-VBMI1-NEXT: vpternlogd {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm1)) ; CHECK-VBMI1-NEXT: retq ; ; CHECK-GFNI-LABEL: splatconstant_rotate_v32i8: @@ -2025,7 +2025,7 @@ define <32 x i8> @splatconstant_rotate_mask_v32i8(<32 x i8> %a) nounwind "min-le ; CHECK-SKX: # %bb.0: ; CHECK-SKX-NEXT: vpsllw $4, %ymm0, %ymm1 ; CHECK-SKX-NEXT: vpsrlw $4, %ymm0, %ymm0 -; CHECK-SKX-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm0 +; CHECK-SKX-NEXT: vpternlogd {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm1)) ; CHECK-SKX-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0 ; CHECK-SKX-NEXT: retq ; @@ -2033,7 +2033,7 @@ define <32 x i8> @splatconstant_rotate_mask_v32i8(<32 x i8> %a) nounwind "min-le ; CHECK-AVX512: # %bb.0: ; CHECK-AVX512-NEXT: vpsllw $4, %ymm0, %ymm1 ; CHECK-AVX512-NEXT: vpsrlw $4, %ymm0, %ymm0 -; CHECK-AVX512-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm0 +; CHECK-AVX512-NEXT: vpternlogd {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm1)) ; CHECK-AVX512-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0 ; CHECK-AVX512-NEXT: retq ; @@ -2041,7 +2041,7 @@ define <32 x i8> @splatconstant_rotate_mask_v32i8(<32 x i8> %a) nounwind "min-le ; CHECK-VBMI1: # %bb.0: ; CHECK-VBMI1-NEXT: vpsllw $4, %ymm0, %ymm1 ; CHECK-VBMI1-NEXT: vpsrlw $4, %ymm0, %ymm0 -; CHECK-VBMI1-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm0 +; CHECK-VBMI1-NEXT: vpternlogd {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm1)) ; CHECK-VBMI1-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0 ; CHECK-VBMI1-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/naked-fn-with-frame-pointer.ll b/llvm/test/CodeGen/X86/naked-fn-with-frame-pointer.ll new file mode 100644 index 0000000000000000000000000000000000000000..37756009fa7d8689c15f602d398ef0c395baa65a --- /dev/null +++ b/llvm/test/CodeGen/X86/naked-fn-with-frame-pointer.ll @@ -0,0 +1,39 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple i386 | FileCheck %s -check-prefixes=CHECK-32 +; RUN: llc < %s -mtriple x86_64 | FileCheck %s -check-prefixes=CHECK-64 + +declare dso_local void @main() + +define dso_local void @naked() naked "frame-pointer"="all" { +; CHECK-32-LABEL: naked: +; CHECK-32: # %bb.0: +; CHECK-32-NEXT: calll main +; +; CHECK-64-LABEL: naked: +; CHECK-64: # %bb.0: +; CHECK-64-NEXT: callq main + call void @main() + unreachable +} + +define dso_local void @normal() "frame-pointer"="all" { +; CHECK-32-LABEL: normal: +; CHECK-32: # %bb.0: +; CHECK-32-NEXT: pushl %ebp +; CHECK-32-NEXT: .cfi_def_cfa_offset 8 +; CHECK-32-NEXT: .cfi_offset %ebp, -8 +; CHECK-32-NEXT: movl %esp, %ebp +; CHECK-32-NEXT: .cfi_def_cfa_register %ebp +; CHECK-32-NEXT: calll main +; +; CHECK-64-LABEL: normal: +; CHECK-64: # %bb.0: +; CHECK-64-NEXT: pushq %rbp +; CHECK-64-NEXT: .cfi_def_cfa_offset 16 +; CHECK-64-NEXT: .cfi_offset %rbp, -16 +; CHECK-64-NEXT: movq %rsp, %rbp +; CHECK-64-NEXT: .cfi_def_cfa_register %rbp +; CHECK-64-NEXT: callq main + call void @main() + unreachable +} diff --git a/llvm/test/CodeGen/X86/pmul.ll b/llvm/test/CodeGen/X86/pmul.ll index 2b475644a38cf6846c0cd75667226a187521d59b..6c3d04863118cf249ed27adfccd4ad9ee08fbe81 100644 --- a/llvm/test/CodeGen/X86/pmul.ll +++ 
b/llvm/test/CodeGen/X86/pmul.ll @@ -833,7 +833,7 @@ define <64 x i8> @mul_v64i8c(<64 x i8> %i) nounwind { ; AVX512F-NEXT: vpmaddubsw %ymm3, %ymm1, %ymm1 ; AVX512F-NEXT: vpsllw $8, %ymm1, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpternlogd $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm0 +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 = zmm0 | (zmm2 & mem) ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: mul_v64i8c: @@ -841,7 +841,7 @@ define <64 x i8> @mul_v64i8c(<64 x i8> %i) nounwind { ; AVX512BW-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 # [117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0] ; AVX512BW-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 # [0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117] ; AVX512BW-NEXT: vpsllw $8, %zmm0, %zmm0 -; AVX512BW-NEXT: vpternlogd $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0 +; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 = zmm0 | (zmm1 & mem) ; AVX512BW-NEXT: retq entry: %A = mul <64 x i8> %i, < i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117 > @@ -978,7 +978,7 @@ define <64 x i8> @mul_v64i8(<64 x i8> %i, <64 x i8> %j) nounwind { ; AVX512F-NEXT: vpmaddubsw %ymm1, %ymm5, %ymm1 ; AVX512F-NEXT: vpsllw $8, %ymm1, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpternlogq $248, %zmm3, %zmm4, %zmm0 +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 | (zmm4 & zmm3) ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: mul_v64i8: @@ -989,7 +989,7 @@ define <64 x i8> @mul_v64i8(<64 x i8> %i, <64 x i8> %j) nounwind { ; AVX512BW-NEXT: vpandnq %zmm1, %zmm2, %zmm1 ; AVX512BW-NEXT: vpmaddubsw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpsllw $8, %zmm0, %zmm0 -; AVX512BW-NEXT: vpternlogq $248, %zmm2, %zmm3, %zmm0 +; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 | (zmm3 & zmm2) ; AVX512BW-NEXT: retq entry: %A = mul <64 x i8> %i, %j diff --git a/llvm/test/CodeGen/X86/pr108731.ll b/llvm/test/CodeGen/X86/pr108731.ll index 87dce0314d08e2a1638fa588edce7896a40ff851..473b4f7f4da2e34a4380ea6a324f4e767508c982 100644 --- a/llvm/test/CodeGen/X86/pr108731.ll +++ b/llvm/test/CodeGen/X86/pr108731.ll @@ -2,8 +2,8 @@ ; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=CHECK,NOBMI ; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=CHECK,BMI -define i64 @foo(i64 %w, i64 %x, i64 %y, i64 %z) { -; NOBMI-LABEL: foo: +define i64 @test_i64(i64 %w, i64 %x, i64 %y, i64 %z) { +; NOBMI-LABEL: test_i64: ; NOBMI: # %bb.0: # %Entry ; NOBMI-NEXT: movq %rcx, %rax ; NOBMI-NEXT: andq %rdx, %rsi @@ -14,7 +14,7 @@ define i64 @foo(i64 %w, i64 %x, i64 %y, i64 %z) { ; NOBMI-NEXT: andq %rsi, %rax ; NOBMI-NEXT: retq ; -; BMI-LABEL: foo: +; BMI-LABEL: test_i64: ; BMI: # %bb.0: # %Entry ; BMI-NEXT: andq %rdx, %rsi ; BMI-NEXT: andnq 
%rdi, %rsi, %rax @@ -31,8 +31,91 @@ Entry: ret i64 %and3 } -define <16 x i8> @fooVec(<16 x i8> %w, <16 x i8> %x, <16 x i8> %y, <16 x i8> %z) { -; NOBMI-LABEL: fooVec: +define i32 @test_i32(i32 %w, i32 %x, i32 %y, i32 %z) { +; NOBMI-LABEL: test_i32: +; NOBMI: # %bb.0: # %Entry +; NOBMI-NEXT: movl %ecx, %eax +; NOBMI-NEXT: andl %edx, %esi +; NOBMI-NEXT: notl %esi +; NOBMI-NEXT: andl %edi, %esi +; NOBMI-NEXT: notl %eax +; NOBMI-NEXT: orl %edx, %eax +; NOBMI-NEXT: andl %esi, %eax +; NOBMI-NEXT: retq +; +; BMI-LABEL: test_i32: +; BMI: # %bb.0: # %Entry +; BMI-NEXT: andl %edx, %esi +; BMI-NEXT: andnl %edi, %esi, %eax +; BMI-NEXT: andnl %ecx, %edx, %ecx +; BMI-NEXT: andnl %eax, %ecx, %eax +; BMI-NEXT: retq +Entry: + %and1 = and i32 %y, %x + %xor1 = xor i32 %and1, -1 + %and2 = and i32 %xor1, %w + %.not = xor i32 %z, -1 + %or1 = or i32 %.not, %y + %and3 = and i32 %and2, %or1 + ret i32 %and3 +} + +define i16 @test_i16(i16 %w, i16 %x, i16 %y, i16 %z) { +; NOBMI-LABEL: test_i16: +; NOBMI: # %bb.0: # %Entry +; NOBMI-NEXT: movl %ecx, %eax +; NOBMI-NEXT: andl %edx, %esi +; NOBMI-NEXT: notl %esi +; NOBMI-NEXT: andl %edi, %esi +; NOBMI-NEXT: notl %eax +; NOBMI-NEXT: orl %edx, %eax +; NOBMI-NEXT: andl %esi, %eax +; NOBMI-NEXT: # kill: def $ax killed $ax killed $eax +; NOBMI-NEXT: retq +; +; BMI-LABEL: test_i16: +; BMI: # %bb.0: # %Entry +; BMI-NEXT: andl %edx, %esi +; BMI-NEXT: andnl %edi, %esi, %eax +; BMI-NEXT: notl %ecx +; BMI-NEXT: orl %edx, %ecx +; BMI-NEXT: andl %ecx, %eax +; BMI-NEXT: # kill: def $ax killed $ax killed $eax +; BMI-NEXT: retq +Entry: + %and1 = and i16 %y, %x + %xor1 = xor i16 %and1, -1 + %and2 = and i16 %xor1, %w + %.not = xor i16 %z, -1 + %or1 = or i16 %.not, %y + %and3 = and i16 %and2, %or1 + ret i16 %and3 +} + +define i8 @test_i8(i8 %w, i8 %x, i8 %y, i8 %z) { +; CHECK-LABEL: test_i8: +; CHECK: # %bb.0: # %Entry +; CHECK-NEXT: movl %edx, %eax +; CHECK-NEXT: andl %edx, %esi +; CHECK-NEXT: notb %sil +; CHECK-NEXT: andb %dil, %sil +; CHECK-NEXT: notb %cl +; CHECK-NEXT: orb %cl, %al +; CHECK-NEXT: andb %sil, %al +; CHECK-NEXT: # kill: def $al killed $al killed $eax +; CHECK-NEXT: retq +Entry: + %and1 = and i8 %y, %x + %xor1 = xor i8 %and1, -1 + %and2 = and i8 %xor1, %w + %.not = xor i8 %z, -1 + %or1 = or i8 %.not, %y + %and3 = and i8 %and2, %or1 + ret i8 %and3 +} + +define <16 x i8> @test_v16i8(<16 x i8> %w, <16 x i8> %x, <16 x i8> %y, <16 x i8> %z) { +; NOBMI-LABEL: test_v16i8: ; NOBMI: # %bb.0: # %Entry ; NOBMI-NEXT: andps %xmm2, %xmm1 ; NOBMI-NEXT: andnps %xmm0, %xmm1 @@ -41,7 +124,7 @@ define <16 x i8> @fooVec(<16 x i8> %w, <16 x i8> %x, <16 x i8> %y, <16 x i8> %z) ; NOBMI-NEXT: movaps %xmm2, %xmm0 ; NOBMI-NEXT: retq ; -; BMI-LABEL: fooVec: +; BMI-LABEL: test_v16i8: ; BMI: # %bb.0: # %Entry ; BMI-NEXT: vandps %xmm1, %xmm2, %xmm1 ; BMI-NEXT: vandnps %xmm0, %xmm1, %xmm0 @@ -58,6 +141,38 @@ Entry: ret <16 x i8> %and3 } +define <32 x i8> @test_v32i8(<32 x i8> %w, <32 x i8> %x, <32 x i8> %y, <32 x i8> %z) { +; NOBMI-LABEL: test_v32i8: +; NOBMI: # %bb.0: # %Entry +; NOBMI-NEXT: andps %xmm4, %xmm2 +; NOBMI-NEXT: andps %xmm5, %xmm3 +; NOBMI-NEXT: andnps %xmm1, %xmm3 +; NOBMI-NEXT: andnps %xmm0, %xmm2 +; NOBMI-NEXT: andnps %xmm6, %xmm4 +; NOBMI-NEXT: andnps %xmm2, %xmm4 +; NOBMI-NEXT: andnps %xmm7, %xmm5 +; NOBMI-NEXT: andnps %xmm3, %xmm5 +; NOBMI-NEXT: movaps %xmm4, %xmm0 +; NOBMI-NEXT: movaps %xmm5, %xmm1 +; NOBMI-NEXT: retq +; +; BMI-LABEL: test_v32i8: +; BMI: # %bb.0: # %Entry +; BMI-NEXT: vandps %ymm1, %ymm2, %ymm1 +; BMI-NEXT: vandnps %ymm0, %ymm1, %ymm0 +; BMI-NEXT: vandnps %ymm3, 
%ymm2, %ymm1 +; BMI-NEXT: vandnps %ymm0, %ymm1, %ymm0 +; BMI-NEXT: retq +Entry: + %and1 = and <32 x i8> %y, %x + %xor1 = xor <32 x i8> %and1, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1> + %and2 = and <32 x i8> %xor1, %w + %.not = xor <32 x i8> %z, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1> + %or1 = or <32 x i8> %.not, %y + %and3 = and <32 x i8> %and2, %or1 + ret <32 x i8> %and3 +} + ; PR112347 - don't fold if we'd be inverting a constant, as demorgan normalisation will invert it back again. define void @PR112347(ptr %p0, ptr %p1, ptr %p2) { ; CHECK-LABEL: PR112347: diff --git a/llvm/test/CodeGen/X86/prefer-avx256-mask-extend.ll b/llvm/test/CodeGen/X86/prefer-avx256-mask-extend.ll index cca9d270fd498f810f4e3e96264c53261fc6fdbf..ad08eaffab383b67b491c5f0c9f03e8b32f4d115 100644 --- a/llvm/test/CodeGen/X86/prefer-avx256-mask-extend.ll +++ b/llvm/test/CodeGen/X86/prefer-avx256-mask-extend.ll @@ -62,7 +62,7 @@ define <16 x i8> @testv16i1_sext_v16i8(ptr %p, ptr %q) { ; AVX512VL-NEXT: vmovdqa (%rsi), %ymm0 ; AVX512VL-NEXT: vptestnmd %ymm0, %ymm0, %k1 ; AVX512VL-NEXT: kunpckbw %k0, %k1, %k1 -; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VL-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq @@ -74,7 +74,7 @@ define <16 x i8> @testv16i1_sext_v16i8(ptr %p, ptr %q) { ; AVX512F-NEXT: vmovdqa (%rsi), %ymm0 ; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k1 ; AVX512F-NEXT: kunpckbw %k0, %k1, %k1 -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -109,7 +109,7 @@ define <16 x i16> @testv16i1_sext_v16i16(ptr %p, ptr %q) { ; AVX512VL-NEXT: vmovdqa (%rsi), %ymm0 ; AVX512VL-NEXT: vptestnmd %ymm0, %ymm0, %k1 ; AVX512VL-NEXT: kunpckbw %k0, %k1, %k1 -; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VL-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VL-NEXT: vpmovdw %zmm0, %ymm0 ; AVX512VL-NEXT: retq ; @@ -120,7 +120,7 @@ define <16 x i16> @testv16i1_sext_v16i16(ptr %p, ptr %q) { ; AVX512F-NEXT: vmovdqa (%rsi), %ymm0 ; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k1 ; AVX512F-NEXT: kunpckbw %k0, %k1, %k1 -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 ; AVX512F-NEXT: retq %in = load <8 x i32>, ptr %p @@ -242,7 +242,7 @@ define <16 x i16> @testv16i1_zext_v16i16(ptr %p, ptr %q) { ; AVX512VL-NEXT: vmovdqa (%rsi), %ymm0 ; AVX512VL-NEXT: vptestnmd %ymm0, %ymm0, %k1 ; AVX512VL-NEXT: kunpckbw %k0, %k1, %k1 -; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VL-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VL-NEXT: vpmovdw %zmm0, %ymm0 ; AVX512VL-NEXT: vpsrlw $15, %ymm0, %ymm0 ; AVX512VL-NEXT: retq @@ -254,7 +254,7 @@ define <16 x i16> @testv16i1_zext_v16i16(ptr %p, ptr %q) { ; AVX512F-NEXT: vmovdqa (%rsi), %ymm0 ; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k1 ; AVX512F-NEXT: kunpckbw %k0, %k1, %k1 -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 ; AVX512F-NEXT: vpsrlw $15, %ymm0, %ymm0 ; AVX512F-NEXT: retq diff --git a/llvm/test/CodeGen/X86/prefer-avx256-mask-shuffle.ll b/llvm/test/CodeGen/X86/prefer-avx256-mask-shuffle.ll index ffc83620d3dad278ae9559444bff1048b95a2e8c..3699c7f75c86192846c372d2d79fb62315f7dd31 100644 --- a/llvm/test/CodeGen/X86/prefer-avx256-mask-shuffle.ll +++ 
b/llvm/test/CodeGen/X86/prefer-avx256-mask-shuffle.ll @@ -47,12 +47,12 @@ define <16 x i1> @shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0(ptr %a, ptr %b) { ; AVX512VL-NEXT: vmovdqa (%rsi), %ymm1 ; AVX512VL-NEXT: vptestnmd %ymm0, %ymm0, %k1 ; AVX512VL-NEXT: vptestnmd %ymm1, %ymm1, %k2 -; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z} -; AVX512VL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; AVX512VL-NEXT: vpternlogd {{.*#+}} zmm0 {%k2} {z} = -1 +; AVX512VL-NEXT: vpternlogd {{.*#+}} zmm1 {%k1} {z} = -1 ; AVX512VL-NEXT: vpmovsxbd {{.*#+}} zmm2 = [3,6,18,20,3,7,7,0,3,6,1,21,3,19,7,0] ; AVX512VL-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 ; AVX512VL-NEXT: vptestmd %zmm2, %zmm2, %k1 -; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VL-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq @@ -78,8 +78,8 @@ define <16 x i1> @shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0(ptr %a, ptr %b) { ; AVX512VLBW-NEXT: vmovdqa (%rsi), %ymm1 ; AVX512VLBW-NEXT: vptestnmd %ymm0, %ymm0, %k1 ; AVX512VLBW-NEXT: vptestnmd %ymm1, %ymm1, %k2 -; AVX512VLBW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z} -; AVX512VLBW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; AVX512VLBW-NEXT: vpternlogd {{.*#+}} zmm0 {%k2} {z} = -1 +; AVX512VLBW-NEXT: vpternlogd {{.*#+}} zmm1 {%k1} {z} = -1 ; AVX512VLBW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [3,6,18,20,3,7,7,0,3,6,1,21,3,19,7,0] ; AVX512VLBW-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 ; AVX512VLBW-NEXT: vptestmd %zmm2, %zmm2, %k0 @@ -93,12 +93,12 @@ define <16 x i1> @shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0(ptr %a, ptr %b) { ; AVX512F-NEXT: vmovdqa (%rsi), %ymm1 ; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k1 ; AVX512F-NEXT: vptestnmd %zmm1, %zmm1, %k2 -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z} -; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 {%k2} {z} = -1 +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm1 {%k1} {z} = -1 ; AVX512F-NEXT: vpmovsxbd {{.*#+}} zmm2 = [3,6,18,20,3,7,7,0,3,6,1,21,3,19,7,0] ; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 ; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k1 -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -109,8 +109,8 @@ define <16 x i1> @shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0(ptr %a, ptr %b) { ; AVX512BW-NEXT: vmovdqa (%rsi), %ymm1 ; AVX512BW-NEXT: vptestnmd %zmm0, %zmm0, %k1 ; AVX512BW-NEXT: vptestnmd %zmm1, %zmm1, %k2 -; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z} -; AVX512BW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 {%k2} {z} = -1 +; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm1 {%k1} {z} = -1 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [3,6,18,20,3,7,7,0,3,6,1,21,3,19,7,0] ; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 ; AVX512BW-NEXT: vptestmd %zmm2, %zmm2, %k0 @@ -152,7 +152,7 @@ define <32 x i1> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0 ; AVX256VL-NEXT: vmovdqa32 %ymm0, %ymm2 {%k1} {z} ; AVX256VL-NEXT: vpmovdw %ymm2, %xmm2 ; AVX256VL-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,1,1,1] -; AVX256VL-NEXT: vpternlogq $220, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm2 +; AVX256VL-NEXT: vpternlogq {{.*#+}} ymm2 = (ymm2 & ~mem) | ymm1 ; AVX256VL-NEXT: vpmovsxwd %xmm2, %ymm1 ; AVX256VL-NEXT: vpslld $31, %ymm1, %ymm1 ; AVX256VL-NEXT: 
vptestmd %ymm1, %ymm1, %k1 @@ -179,12 +179,12 @@ define <32 x i1> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0 ; AVX512NOBW-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512NOBW-NEXT: vpmovsxbd %xmm0, %zmm0 ; AVX512NOBW-NEXT: vptestmd %zmm0, %zmm0, %k2 -; AVX512NOBW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z} -; AVX512NOBW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; AVX512NOBW-NEXT: vpternlogd {{.*#+}} zmm0 {%k2} {z} = -1 +; AVX512NOBW-NEXT: vpternlogd {{.*#+}} zmm1 {%k1} {z} = -1 ; AVX512NOBW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0] ; AVX512NOBW-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 ; AVX512NOBW-NEXT: vptestmd %zmm2, %zmm2, %k1 -; AVX512NOBW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512NOBW-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512NOBW-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512NOBW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512NOBW-NEXT: retq diff --git a/llvm/test/CodeGen/X86/prefer-avx256-mulo.ll b/llvm/test/CodeGen/X86/prefer-avx256-mulo.ll index e3a608abfda4332c18214e9c74e646ff8acf74af..155ef0faadad84fc024abfcbd7b70183eadbddac 100644 --- a/llvm/test/CodeGen/X86/prefer-avx256-mulo.ll +++ b/llvm/test/CodeGen/X86/prefer-avx256-mulo.ll @@ -17,7 +17,7 @@ define <16 x i1> @smulo_v16i8(<16 x i8> %a0, <16 x i8> %a1, ptr %p2) nounwind { ; AVX256-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX256-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm2 ; AVX256-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm1 -; AVX256-NEXT: vpternlogq $15, %xmm1, %xmm1, %xmm1 +; AVX256-NEXT: vpternlogq {{.*#+}} xmm1 = ~xmm1 ; AVX256-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] ; AVX256-NEXT: vpmovsxbd %xmm2, %ymm2 ; AVX256-NEXT: vptestmd %ymm2, %ymm2, %k1 @@ -46,7 +46,7 @@ define <16 x i1> @smulo_v16i8(<16 x i8> %a0, <16 x i8> %a1, ptr %p2) nounwind { ; AVX512-NEXT: vpcmpneqd %zmm1, %zmm2, %k1 ; AVX512-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512-NEXT: vpmovdb %zmm0, (%rdi) -; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -69,7 +69,7 @@ define <16 x i1> @umulo_v16i8(<16 x i8> %a0, <16 x i8> %a1, ptr %p2) nounwind { ; AVX256-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 ; AVX256-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX256-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm1 -; AVX256-NEXT: vpternlogq $15, %xmm1, %xmm1, %xmm1 +; AVX256-NEXT: vpternlogq {{.*#+}} xmm1 = ~xmm1 ; AVX256-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] ; AVX256-NEXT: vpmovsxbd %xmm2, %ymm2 ; AVX256-NEXT: vptestmd %ymm2, %ymm2, %k1 @@ -98,7 +98,7 @@ define <16 x i1> @umulo_v16i8(<16 x i8> %a0, <16 x i8> %a1, ptr %p2) nounwind { ; AVX512-NEXT: vptestmd %zmm1, %zmm1, %k1 ; AVX512-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512-NEXT: vpmovdb %zmm0, (%rdi) -; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq diff --git a/llvm/test/CodeGen/X86/prefer-avx256-wide-mul.ll b/llvm/test/CodeGen/X86/prefer-avx256-wide-mul.ll index 
a950a13b0d8ca006d14251fc02b1bfce1f41be53..c9bb3de92dcda66962ddd59492a9c3de6abaaefd 100644 --- a/llvm/test/CodeGen/X86/prefer-avx256-wide-mul.ll +++ b/llvm/test/CodeGen/X86/prefer-avx256-wide-mul.ll @@ -64,7 +64,7 @@ define <32 x i8> @test_mul_32i8(<32 x i8> %a, <32 x i8> %b) { ; AVX256BW-NEXT: vpandn %ymm1, %ymm2, %ymm1 ; AVX256BW-NEXT: vpmaddubsw %ymm1, %ymm0, %ymm0 ; AVX256BW-NEXT: vpsllw $8, %ymm0, %ymm0 -; AVX256BW-NEXT: vpternlogq $248, %ymm2, %ymm3, %ymm0 +; AVX256BW-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 | (ymm3 & ymm2) ; AVX256BW-NEXT: retq ; ; AVX512BWVL-LABEL: test_mul_32i8: diff --git a/llvm/test/CodeGen/X86/psubus.ll b/llvm/test/CodeGen/X86/psubus.ll index 4d220c43dc478f720c3cafa858735d6959fd6d1b..be8adf697d5c1b0a9b3dd712f841ee2b216372b4 100644 --- a/llvm/test/CodeGen/X86/psubus.ll +++ b/llvm/test/CodeGen/X86/psubus.ll @@ -146,7 +146,7 @@ define <4 x i32> @ashr_xor_and_custom(<4 x i32> %x) nounwind { ; AVX512-LABEL: ashr_xor_and_custom: ; AVX512: # %bb.0: ; AVX512-NEXT: vpsrad $31, %xmm0, %xmm1 -; AVX512-NEXT: vpternlogd $72, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm0 +; AVX512-NEXT: vpternlogd {{.*#+}} xmm0 = xmm1 & (xmm0 ^ mem) ; AVX512-NEXT: retq %signsplat = ashr <4 x i32> %x, %flipsign = xor <4 x i32> %x, @@ -187,7 +187,7 @@ define <4 x i32> @ashr_add_and_custom(<4 x i32> %x) nounwind { ; AVX512-LABEL: ashr_add_and_custom: ; AVX512: # %bb.0: ; AVX512-NEXT: vpsrad $31, %xmm0, %xmm1 -; AVX512-NEXT: vpternlogd $72, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm0 +; AVX512-NEXT: vpternlogd {{.*#+}} xmm0 = xmm1 & (xmm0 ^ mem) ; AVX512-NEXT: retq %signsplat = ashr <4 x i32> %x, %flipsign = add <4 x i32> %x, @@ -230,7 +230,7 @@ define <4 x i32> @usubsat_custom(<4 x i32> %x) nounwind { ; AVX512-LABEL: usubsat_custom: ; AVX512: # %bb.0: ; AVX512-NEXT: vpsrad $31, %xmm0, %xmm1 -; AVX512-NEXT: vpternlogd $72, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm0 +; AVX512-NEXT: vpternlogd {{.*#+}} xmm0 = xmm1 & (xmm0 ^ mem) ; AVX512-NEXT: retq %res = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> %x, <4 x i32> ) ret <4 x i32> %res diff --git a/llvm/test/CodeGen/X86/sat-add.ll b/llvm/test/CodeGen/X86/sat-add.ll index f78b57d895ee1898e1c22893329f2b1e92517f68..949902a5ebc47cad1da25fa672b292c412d872b3 100644 --- a/llvm/test/CodeGen/X86/sat-add.ll +++ b/llvm/test/CodeGen/X86/sat-add.ll @@ -820,7 +820,7 @@ define <16 x i8> @unsigned_sat_variable_v16i8_using_min(<16 x i8> %x, <16 x i8> ; AVX512-LABEL: unsigned_sat_variable_v16i8_using_min: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovdqa %xmm1, %xmm2 -; AVX512-NEXT: vpternlogq $15, %xmm1, %xmm1, %xmm2 +; AVX512-NEXT: vpternlogq {{.*#+}} xmm2 = ~xmm2 ; AVX512-NEXT: vpminub %xmm2, %xmm0, %xmm0 ; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq @@ -875,10 +875,10 @@ define <16 x i8> @unsigned_sat_variable_v16i8_using_cmp_notval(<16 x i8> %x, <16 ; AVX512: # %bb.0: ; AVX512-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 ; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm3 -; AVX512-NEXT: vpternlogq $15, %xmm1, %xmm1, %xmm1 +; AVX512-NEXT: vpternlogq {{.*#+}} xmm1 = ~xmm1 ; AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm1 ; AVX512-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpternlogq $222, %xmm2, %xmm3, %xmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = xmm3 | (xmm0 ^ xmm2) ; AVX512-NEXT: retq %noty = xor <16 x i8> %y, %a = add <16 x i8> %x, %y @@ -917,7 +917,7 @@ define <8 x i16> @unsigned_sat_variable_v8i16_using_min(<8 x i16> %x, <8 x i16> ; AVX512-LABEL: unsigned_sat_variable_v8i16_using_min: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovdqa %xmm1, %xmm2 -; 
AVX512-NEXT: vpternlogq $15, %xmm1, %xmm1, %xmm2 +; AVX512-NEXT: vpternlogq {{.*#+}} xmm2 = ~xmm2 ; AVX512-NEXT: vpminuw %xmm2, %xmm0, %xmm0 ; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq @@ -982,10 +982,10 @@ define <8 x i16> @unsigned_sat_variable_v8i16_using_cmp_notval(<8 x i16> %x, <8 ; AVX512: # %bb.0: ; AVX512-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 ; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm3 -; AVX512-NEXT: vpternlogq $15, %xmm1, %xmm1, %xmm1 +; AVX512-NEXT: vpternlogq {{.*#+}} xmm1 = ~xmm1 ; AVX512-NEXT: vpminuw %xmm1, %xmm0, %xmm1 ; AVX512-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpternlogq $222, %xmm2, %xmm3, %xmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = xmm3 | (xmm0 ^ xmm2) ; AVX512-NEXT: retq %noty = xor <8 x i16> %y, %a = add <8 x i16> %x, %y @@ -1029,7 +1029,7 @@ define <4 x i32> @unsigned_sat_variable_v4i32_using_min(<4 x i32> %x, <4 x i32> ; AVX512-LABEL: unsigned_sat_variable_v4i32_using_min: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovdqa %xmm1, %xmm2 -; AVX512-NEXT: vpternlogq $15, %xmm1, %xmm1, %xmm2 +; AVX512-NEXT: vpternlogq {{.*#+}} xmm2 = ~xmm2 ; AVX512-NEXT: vpminud %xmm2, %xmm0, %xmm0 ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq @@ -1070,7 +1070,7 @@ define <4 x i32> @unsigned_sat_variable_v4i32_using_cmp_sum(<4 x i32> %x, <4 x i ; AVX512-LABEL: unsigned_sat_variable_v4i32_using_cmp_sum: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovdqa %xmm1, %xmm2 -; AVX512-NEXT: vpternlogq $15, %xmm1, %xmm1, %xmm2 +; AVX512-NEXT: vpternlogq {{.*#+}} xmm2 = ~xmm2 ; AVX512-NEXT: vpminud %xmm2, %xmm0, %xmm0 ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq @@ -1117,7 +1117,7 @@ define <4 x i32> @unsigned_sat_variable_v4i32_using_cmp_notval(<4 x i32> %x, <4 ; AVX512-LABEL: unsigned_sat_variable_v4i32_using_cmp_notval: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovdqa %xmm1, %xmm3 -; AVX512-NEXT: vpternlogq $15, %xmm1, %xmm1, %xmm3 +; AVX512-NEXT: vpternlogq {{.*#+}} xmm3 = ~xmm3 ; AVX512-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 ; AVX512-NEXT: vpcmpleud %xmm3, %xmm0, %k1 ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm2 {%k1} @@ -1202,7 +1202,7 @@ define <2 x i64> @unsigned_sat_variable_v2i64_using_min(<2 x i64> %x, <2 x i64> ; AVX512-LABEL: unsigned_sat_variable_v2i64_using_min: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovdqa %xmm1, %xmm2 -; AVX512-NEXT: vpternlogq $15, %xmm1, %xmm1, %xmm2 +; AVX512-NEXT: vpternlogq {{.*#+}} xmm2 = ~xmm2 ; AVX512-NEXT: vpminuq %xmm2, %xmm0, %xmm0 ; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq @@ -1272,7 +1272,7 @@ define <2 x i64> @unsigned_sat_variable_v2i64_using_cmp_sum(<2 x i64> %x, <2 x i ; AVX512-LABEL: unsigned_sat_variable_v2i64_using_cmp_sum: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovdqa %xmm1, %xmm2 -; AVX512-NEXT: vpternlogq $15, %xmm1, %xmm1, %xmm2 +; AVX512-NEXT: vpternlogq {{.*#+}} xmm2 = ~xmm2 ; AVX512-NEXT: vpminuq %xmm2, %xmm0, %xmm0 ; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq @@ -1339,7 +1339,7 @@ define <2 x i64> @unsigned_sat_variable_v2i64_using_cmp_notval(<2 x i64> %x, <2 ; AVX512-LABEL: unsigned_sat_variable_v2i64_using_cmp_notval: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovdqa %xmm1, %xmm3 -; AVX512-NEXT: vpternlogq $15, %xmm1, %xmm1, %xmm3 +; AVX512-NEXT: vpternlogq {{.*#+}} xmm3 = ~xmm3 ; AVX512-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 ; AVX512-NEXT: vpcmpleuq %xmm3, %xmm0, %k1 ; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm2 {%k1} diff --git a/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll b/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll index 
220c2e5012ea49acdbc72e075a315ce85bf88df1..a2bcadd104a7b2838a751942f8df3bd9d020d5ec 100644 --- a/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll +++ b/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll @@ -2465,7 +2465,7 @@ define <32 x i1> @pr51133(<32 x i8> %x, <32 x i8> %y) { ; CHECK-AVX512VL-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2 # [197,0,27,0,1,0,1,0,223,0,205,0,161,0,171,0,171,0,183,0,61,0,127,0,9,0,41,0,1,0,161,0] ; CHECK-AVX512VL-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm3 # [0,205,0,241,0,1,0,163,0,223,0,183,0,1,0,239,0,103,0,171,0,1,0,183,0,0,0,183,0,1,0,221] ; CHECK-AVX512VL-NEXT: vpsllw $8, %ymm3, %ymm3 -; CHECK-AVX512VL-NEXT: vpternlogd $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm3 +; CHECK-AVX512VL-NEXT: vpternlogd {{.*#+}} ymm3 = ymm3 | (ymm2 & mem) ; CHECK-AVX512VL-NEXT: vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm2 ; CHECK-AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; CHECK-AVX512VL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [128,128,1,1,1,128,1,64,128,1,128,1,128,32,1,1] @@ -2483,7 +2483,7 @@ define <32 x i1> @pr51133(<32 x i8> %x, <32 x i8> %y) { ; CHECK-AVX512VL-NEXT: vpcmpgtb %ymm4, %ymm0, %ymm0 ; CHECK-AVX512VL-NEXT: vpandn %ymm0, %ymm3, %ymm3 ; CHECK-AVX512VL-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm0 -; CHECK-AVX512VL-NEXT: vpternlogq $14, %ymm3, %ymm2, %ymm0 +; CHECK-AVX512VL-NEXT: vpternlogq {{.*#+}} ymm0 = ~ymm0 & (ymm2 | ymm3) ; CHECK-AVX512VL-NEXT: retq %rem = srem <32 x i8> %x, %cmp = icmp ne <32 x i8> %rem, zeroinitializer diff --git a/llvm/test/CodeGen/X86/tailcall-caller-nocsr.ll b/llvm/test/CodeGen/X86/tailcall-caller-nocsr.ll new file mode 100644 index 0000000000000000000000000000000000000000..0385017a1ced733c2a9bf059426d8bd6979dc063 --- /dev/null +++ b/llvm/test/CodeGen/X86/tailcall-caller-nocsr.ll @@ -0,0 +1,34 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=x86_64-linux-gnu -mattr=-sse,-avx | FileCheck %s + +@.str = private unnamed_addr constant [6 x i8] c"%d %d\00", align 1 + +define void @caller(i32 %0, i32 %1) #0 { +; CHECK-LABEL: caller: +; CHECK: # %bb.0: +; CHECK-NEXT: pushq %r11 +; CHECK-NEXT: pushq %r10 +; CHECK-NEXT: pushq %r9 +; CHECK-NEXT: pushq %r8 +; CHECK-NEXT: pushq %rdx +; CHECK-NEXT: pushq %rcx +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: movl %esi, %edx +; CHECK-NEXT: movl %edi, %esi +; CHECK-NEXT: movl $.L.str, %edi +; CHECK-NEXT: callq printf@PLT +; CHECK-NEXT: popq %rax +; CHECK-NEXT: popq %rcx +; CHECK-NEXT: popq %rdx +; CHECK-NEXT: popq %r8 +; CHECK-NEXT: popq %r9 +; CHECK-NEXT: popq %r10 +; CHECK-NEXT: popq %r11 +; CHECK-NEXT: retq + %3 = tail call i32 @printf(ptr @.str, i32 %0, i32 %1) + ret void +} + +declare i32 @printf(ptr, ...) 
nounwind + +attributes #0 = { mustprogress nounwind "no_caller_saved_registers" } diff --git a/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll b/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll index b486014678466ea76879010dda4e9e97179d5848..21dfdc3c2abe49d1ba51976334ec92684ac307f2 100644 --- a/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll +++ b/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll @@ -8672,6 +8672,263 @@ entry: ret <4 x double> %atan } +define <1 x float> @constrained_vector_atan2_v1f32() #0 { +; CHECK-LABEL: constrained_vector_atan2_v1f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: movss {{.*#+}} xmm0 = [4.2E+1,0.0E+0,0.0E+0,0.0E+0] +; CHECK-NEXT: movss {{.*#+}} xmm1 = [2.3E+1,0.0E+0,0.0E+0,0.0E+0] +; CHECK-NEXT: callq atan2f@PLT +; CHECK-NEXT: popq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq +; +; AVX-LABEL: constrained_vector_atan2_v1f32: +; AVX: # %bb.0: # %entry +; AVX-NEXT: pushq %rax +; AVX-NEXT: .cfi_def_cfa_offset 16 +; AVX-NEXT: vmovss {{.*#+}} xmm0 = [4.2E+1,0.0E+0,0.0E+0,0.0E+0] +; AVX-NEXT: vmovss {{.*#+}} xmm1 = [2.3E+1,0.0E+0,0.0E+0,0.0E+0] +; AVX-NEXT: callq atan2f@PLT +; AVX-NEXT: popq %rax +; AVX-NEXT: .cfi_def_cfa_offset 8 +; AVX-NEXT: retq +entry: + %atan2 = call <1 x float> @llvm.experimental.constrained.atan2.v1f32( + <1 x float> <float 42.0>, + <1 x float> <float 23.0>, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <1 x float> %atan2 +} + +define <2 x double> @constrained_vector_atan2_v2f64() #0 { +; CHECK-LABEL: constrained_vector_atan2_v2f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: subq $24, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.2100000000000001E+1,0.0E+0] +; CHECK-NEXT: movsd {{.*#+}} xmm1 = [2.3100000000000001E+1,0.0E+0] +; CHECK-NEXT: callq atan2@PLT +; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.2E+1,0.0E+0] +; CHECK-NEXT: movsd {{.*#+}} xmm1 = [2.3E+1,0.0E+0] +; CHECK-NEXT: callq atan2@PLT +; CHECK-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] +; CHECK-NEXT: addq $24, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq +; +; AVX-LABEL: constrained_vector_atan2_v2f64: +; AVX: # %bb.0: # %entry +; AVX-NEXT: subq $24, %rsp +; AVX-NEXT: .cfi_def_cfa_offset 32 +; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.2100000000000001E+1,0.0E+0] +; AVX-NEXT: vmovsd {{.*#+}} xmm1 = [2.3100000000000001E+1,0.0E+0] +; AVX-NEXT: callq atan2@PLT +; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.2E+1,0.0E+0] +; AVX-NEXT: vmovsd {{.*#+}} xmm1 = [2.3E+1,0.0E+0] +; AVX-NEXT: callq atan2@PLT +; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX-NEXT: addq $24, %rsp +; AVX-NEXT: .cfi_def_cfa_offset 8 +; AVX-NEXT: retq +entry: + %atan2 = call <2 x double> @llvm.experimental.constrained.atan2.v2f64( + <2 x double> <double 42.0, double 42.1>, + <2 x double> <double 23.0, double 23.1>, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <2 x double> %atan2 +} + +define <3 x float> @constrained_vector_atan2_v3f32() #0 { +; CHECK-LABEL: constrained_vector_atan2_v3f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: subq $40, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 48 +; CHECK-NEXT: movss {{.*#+}} xmm0 = [4.4E+1,0.0E+0,0.0E+0,0.0E+0] +; CHECK-NEXT: movss {{.*#+}} xmm1 = [2.5E+1,0.0E+0,0.0E+0,0.0E+0] +; CHECK-NEXT: callq atan2f@PLT +; CHECK-NEXT: movaps %xmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: movss {{.*#+}} xmm0 = [4.2E+1,0.0E+0,0.0E+0,0.0E+0] +; CHECK-NEXT: movss {{.*#+}} xmm1 = [2.3E+1,0.0E+0,0.0E+0,0.0E+0] +; CHECK-NEXT: callq atan2f@PLT +; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: movss {{.*#+}} xmm0 = [4.3E+1,0.0E+0,0.0E+0,0.0E+0] +; CHECK-NEXT: movss {{.*#+}} xmm1 = [2.4E+1,0.0E+0,0.0E+0,0.0E+0] +; CHECK-NEXT: callq atan2f@PLT +; CHECK-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload +; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; CHECK-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; CHECK-NEXT: # xmm1 = xmm1[0],mem[0] +; CHECK-NEXT: movaps %xmm1, %xmm0 +; CHECK-NEXT: addq $40, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq +; +; AVX-LABEL: constrained_vector_atan2_v3f32: +; AVX: # %bb.0: # %entry +; AVX-NEXT: subq $40, %rsp +; AVX-NEXT: .cfi_def_cfa_offset 48 +; AVX-NEXT: vmovss {{.*#+}} xmm0 = [4.4E+1,0.0E+0,0.0E+0,0.0E+0] +; AVX-NEXT: vmovss {{.*#+}} xmm1 = [2.5E+1,0.0E+0,0.0E+0,0.0E+0] +; AVX-NEXT: callq atan2f@PLT +; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovss {{.*#+}} xmm0 = [4.2E+1,0.0E+0,0.0E+0,0.0E+0] +; AVX-NEXT: vmovss {{.*#+}} xmm1 = [2.3E+1,0.0E+0,0.0E+0,0.0E+0] +; AVX-NEXT: callq atan2f@PLT +; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX-NEXT: vmovss {{.*#+}} xmm0 = [4.3E+1,0.0E+0,0.0E+0,0.0E+0] +; AVX-NEXT: vmovss {{.*#+}} xmm1 = [2.4E+1,0.0E+0,0.0E+0,0.0E+0] +; AVX-NEXT: callq atan2f@PLT +; AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] +; AVX-NEXT: vinsertps $32, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX-NEXT: # xmm0 = xmm0[0,1],mem[0],xmm0[3] +; AVX-NEXT: addq $40, %rsp +; AVX-NEXT: .cfi_def_cfa_offset 8 +; AVX-NEXT: retq +entry: + %atan2 = call <3 x float> @llvm.experimental.constrained.atan2.v3f32( + <3 x float> <float 42.0, float 43.0, float 44.0>, + <3 x float> <float 23.0, float 24.0, float 25.0>, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <3 x float> %atan2 +} + +define <3 x double> @constrained_vector_atan2_v3f64() #0 { +; CHECK-LABEL: constrained_vector_atan2_v3f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: subq $24, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.2100000000000001E+1,0.0E+0] +; CHECK-NEXT: movsd {{.*#+}} xmm1 = [2.3100000000000001E+1,0.0E+0] +; CHECK-NEXT: callq atan2@PLT +; CHECK-NEXT: movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.2E+1,0.0E+0] +; CHECK-NEXT: movsd {{.*#+}} xmm1 = [2.3E+1,0.0E+0] +; CHECK-NEXT: callq atan2@PLT +; CHECK-NEXT: movsd %xmm0, (%rsp) # 8-byte Spill +; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.2200000000000003E+1,0.0E+0] +; CHECK-NEXT: movsd {{.*#+}} xmm1 = [2.3199999999999999E+1,0.0E+0] +; CHECK-NEXT: callq atan2@PLT +; CHECK-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp) +; CHECK-NEXT: fldl {{[0-9]+}}(%rsp) +; CHECK-NEXT: wait +; CHECK-NEXT: movsd (%rsp), %xmm0 # 8-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero +; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 8-byte Reload +; CHECK-NEXT: # xmm1 = mem[0],zero +; CHECK-NEXT: addq $24, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq +; +; AVX-LABEL: constrained_vector_atan2_v3f64: +; AVX: # %bb.0: # %entry +; AVX-NEXT: subq $40, %rsp +; AVX-NEXT: .cfi_def_cfa_offset 48 +; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.2100000000000001E+1,0.0E+0] +; AVX-NEXT: vmovsd {{.*#+}} xmm1 = [2.3100000000000001E+1,0.0E+0] +; AVX-NEXT: callq atan2@PLT 
+; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.2E+1,0.0E+0] +; AVX-NEXT: vmovsd {{.*#+}} xmm1 = [2.3E+1,0.0E+0] +; AVX-NEXT: callq atan2@PLT +; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.2200000000000003E+1,0.0E+0] +; AVX-NEXT: vmovsd {{.*#+}} xmm1 = [2.3199999999999999E+1,0.0E+0] +; AVX-NEXT: vzeroupper +; AVX-NEXT: callq atan2@PLT +; AVX-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload +; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX-NEXT: addq $40, %rsp +; AVX-NEXT: .cfi_def_cfa_offset 8 +; AVX-NEXT: retq +entry: + %atan2 = call <3 x double> @llvm.experimental.constrained.atan2.v3f64( + <3 x double> <double 42.0, double 42.1, double 42.2>, + <3 x double> <double 23.0, double 23.1, double 23.2>, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <3 x double> %atan2 +} + +define <4 x double> @constrained_vector_atan2_v4f64() #0 { +; CHECK-LABEL: constrained_vector_atan2_v4f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: subq $40, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 48 +; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.2100000000000001E+1,0.0E+0] +; CHECK-NEXT: movsd {{.*#+}} xmm1 = [2.3100000000000001E+1,0.0E+0] +; CHECK-NEXT: callq atan2@PLT +; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.2E+1,0.0E+0] +; CHECK-NEXT: movsd {{.*#+}} xmm1 = [2.3E+1,0.0E+0] +; CHECK-NEXT: callq atan2@PLT +; CHECK-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] +; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.2299999999999997E+1,0.0E+0] +; CHECK-NEXT: movsd {{.*#+}} xmm1 = [2.3300000000000001E+1,0.0E+0] +; CHECK-NEXT: callq atan2@PLT +; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.2200000000000003E+1,0.0E+0] +; CHECK-NEXT: movsd {{.*#+}} xmm1 = [2.3199999999999999E+1,0.0E+0] +; CHECK-NEXT: callq atan2@PLT +; CHECK-NEXT: movaps %xmm0, %xmm1 +; CHECK-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; CHECK-NEXT: # xmm1 = xmm1[0],mem[0] +; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: addq $40, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq +; +; AVX-LABEL: constrained_vector_atan2_v4f64: +; AVX: # %bb.0: # %entry +; AVX-NEXT: subq $40, %rsp +; AVX-NEXT: .cfi_def_cfa_offset 48 +; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.2299999999999997E+1,0.0E+0] +; AVX-NEXT: vmovsd {{.*#+}} xmm1 = [2.3300000000000001E+1,0.0E+0] +; AVX-NEXT: callq atan2@PLT +; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.2200000000000003E+1,0.0E+0] +; AVX-NEXT: vmovsd {{.*#+}} xmm1 = [2.3199999999999999E+1,0.0E+0] +; AVX-NEXT: callq atan2@PLT +; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.2100000000000001E+1,0.0E+0] +; AVX-NEXT: vmovsd {{.*#+}} xmm1 = [2.3100000000000001E+1,0.0E+0] +; AVX-NEXT: callq atan2@PLT +; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.2E+1,0.0E+0] +; AVX-NEXT: vmovsd {{.*#+}} xmm1 = [2.3E+1,0.0E+0] +; AVX-NEXT: callq atan2@PLT +; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX-NEXT: vinsertf128 $1, (%rsp), %ymm0, %ymm0 # 16-byte 
Folded Reload +; AVX-NEXT: addq $40, %rsp +; AVX-NEXT: .cfi_def_cfa_offset 8 +; AVX-NEXT: retq +entry: + %atan2 = call <4 x double> @llvm.experimental.constrained.atan2.v4f64( + <4 x double> <double 42.0, double 42.1, double 42.2, double 42.3>, + <4 x double> <double 23.0, double 23.1, double 23.2, double 23.3>, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <4 x double> %atan2 +} + define <1 x float> @constrained_vector_cosh_v1f32() #0 { ; CHECK-LABEL: constrained_vector_cosh_v1f32: ; CHECK: # %bb.0: # %entry @@ -9546,6 +9803,7 @@ declare <4 x double> @llvm.experimental.constrained.tan.v4f64(<4 x double>, meta declare <4 x double> @llvm.experimental.constrained.asin.v4f64(<4 x double>, metadata, metadata) declare <4 x double> @llvm.experimental.constrained.acos.v4f64(<4 x double>, metadata, metadata) declare <4 x double> @llvm.experimental.constrained.atan.v4f64(<4 x double>, metadata, metadata) +declare <4 x double> @llvm.experimental.constrained.atan2.v4f64(<4 x double>, <4 x double>, metadata, metadata) declare <4 x double> @llvm.experimental.constrained.sinh.v4f64(<4 x double>, metadata, metadata) declare <4 x double> @llvm.experimental.constrained.cosh.v4f64(<4 x double>, metadata, metadata) declare <4 x double> @llvm.experimental.constrained.tanh.v4f64(<4 x double>, metadata, metadata) diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll index bbc87eda82a5da22d7cc6c179ad655d0c431fff3..b4375cfb343b7bea4becbaa9aeff1db931419ae6 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll @@ -55,10 +55,10 @@ define <64 x i8> @combine_pshufb_identity_mask(<64 x i8> %x0, i64 %m) { ; X86-LABEL: combine_pshufb_identity_mask: ; X86: # %bb.0: ; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 -; X86-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; X86-NEXT: vpternlogd {{.*#+}} zmm1 = -1 ; X86-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] ; X86-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; X86-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 +; X86-NEXT: vpternlogd {{.*#+}} zmm3 = -1 ; X86-NEXT: vpshufb %zmm2, %zmm0, %zmm3 {%k1} ; X86-NEXT: vpshufb %zmm2, %zmm3, %zmm1 {%k1} ; X86-NEXT: vmovdqa64 %zmm1, %zmm0 @@ -66,11 +66,11 @@ define <64 x i8> @combine_pshufb_identity_mask(<64 x i8> %x0, i64 %m) { ; ; X64-LABEL: combine_pshufb_identity_mask: ; X64: # %bb.0: -; X64-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; X64-NEXT: vpternlogd {{.*#+}} zmm1 = -1 ; X64-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] ; X64-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; X64-NEXT: kmovq %rdi, %k1 -; X64-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 +; X64-NEXT: vpternlogd {{.*#+}} zmm3 = -1 ; X64-NEXT: vpshufb %zmm2, %zmm0, %zmm3 {%k1} ; X64-NEXT: vpshufb %zmm2, %zmm3, %zmm1 {%k1} ; X64-NEXT: vmovdqa64 %zmm1, %zmm0 diff --git a/llvm/test/CodeGen/XCore/naked-fn-with-frame-pointer.ll b/llvm/test/CodeGen/XCore/naked-fn-with-frame-pointer.ll new file mode 100644 index 0000000000000000000000000000000000000000..429a78108a7bacc088dbda1a8d23992f7335bd8f --- /dev/null +++ b/llvm/test/CodeGen/XCore/naked-fn-with-frame-pointer.ll @@ -0,0 +1,31 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -march 
xcore | FileCheck %s -check-prefixes=CHECK + +declare dso_local void @main() + +define dso_local void @naked() naked "frame-pointer"="all" { +; CHECK-LABEL: naked: +; CHECK: # %bb.0: +; CHECK-NEXT: bl main +; CHECK-NEXT: .cc_bottom naked.function + call void @main() + unreachable +} + +define dso_local void @normal() "frame-pointer"="all" { +; CHECK-LABEL: normal: +; CHECK: # %bb.0: +; CHECK-NEXT: entsp 2 +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: .cfi_offset 15, 0 +; CHECK-NEXT: stw r10, sp[1] # 4-byte Folded Spill +; CHECK-NEXT: .cfi_offset 10, -4 +; CHECK-NEXT: ldaw r10, sp[0] +; CHECK-NEXT: .cfi_def_cfa_register 10 +; CHECK-NEXT: extsp 1 +; CHECK-NEXT: bl main +; CHECK-NEXT: ldaw sp, sp[1] +; CHECK-NEXT: .cc_bottom normal.function + call void @main() + unreachable +} diff --git a/llvm/test/CodeGen/Xtensa/naked-fn-with-frame-pointer.ll b/llvm/test/CodeGen/Xtensa/naked-fn-with-frame-pointer.ll new file mode 100644 index 0000000000000000000000000000000000000000..020fcc4f6dae6db547a78408546a22d0d8aada36 --- /dev/null +++ b/llvm/test/CodeGen/Xtensa/naked-fn-with-frame-pointer.ll @@ -0,0 +1,31 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -march xtensa | FileCheck %s -check-prefixes=CHECK + +declare dso_local void @main() + +define dso_local void @naked() naked "frame-pointer"="all" { +; CHECK-LABEL: naked: +; CHECK: # %bb.0: +; CHECK-NEXT: l32r a8, {{\.?LCPI[0-9]+_[0-9]+}} +; CHECK-NEXT: callx0 a8 + call void @main() + unreachable +} + +define dso_local void @normal() "frame-pointer"="all" { +; CHECK-LABEL: normal: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a8, a1, -16 +; CHECK-NEXT: or a1, a8, a8 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: s32i a0, a1, 4 # 4-byte Folded Spill +; CHECK-NEXT: s32i a15, a1, 0 # 4-byte Folded Spill +; CHECK-NEXT: .cfi_offset a0, -4 +; CHECK-NEXT: .cfi_offset a15, -8 +; CHECK-NEXT: or a15, a1, a1 +; CHECK-NEXT: .cfi_def_cfa_register a15 +; CHECK-NEXT: l32r a8, {{\.?LCPI[0-9]+_[0-9]+}} +; CHECK-NEXT: callx0 a8 + call void @main() + unreachable +} diff --git a/llvm/test/Demangle/ms-placeholder-return-type.test b/llvm/test/Demangle/ms-placeholder-return-type.test index 18038e636c8d5a46302453b089185f50e2fda545..a656400fe140fb577d95a5eb025e231f212cb290 100644 --- a/llvm/test/Demangle/ms-placeholder-return-type.test +++ b/llvm/test/Demangle/ms-placeholder-return-type.test @@ -1,18 +1,18 @@ -; RUN: llvm-undname < %s | FileCheck %s - -; CHECK-NOT: Invalid mangled name - -?TestNonTemplateAuto@@YA@XZ -; CHECK: __cdecl TestNonTemplateAuto(void) - -??$AutoT@X@@YA?A_PXZ -; CHECK: auto __cdecl AutoT(void) - -??$AutoT@X@@YA?B_PXZ -; CHECK: auto const __cdecl AutoT(void) - -??$AutoT@X@@YA?A_TXZ -; CHECK: decltype(auto) __cdecl AutoT(void) - -??$AutoT@X@@YA?B_TXZ -; CHECK: decltype(auto) const __cdecl AutoT(void) +; RUN: llvm-undname < %s | FileCheck %s + +; CHECK-NOT: Invalid mangled name + +?TestNonTemplateAuto@@YA@XZ +; CHECK: __cdecl TestNonTemplateAuto(void) + +??$AutoT@X@@YA?A_PXZ +; CHECK: auto __cdecl AutoT(void) + +??$AutoT@X@@YA?B_PXZ +; CHECK: auto const __cdecl AutoT(void) + +??$AutoT@X@@YA?A_TXZ +; CHECK: decltype(auto) __cdecl AutoT(void) + +??$AutoT@X@@YA?B_TXZ +; CHECK: decltype(auto) const __cdecl AutoT(void) diff --git a/llvm/test/ExecutionEngine/JITLink/i386/ELF_i386_absolute_relocations_16.s b/llvm/test/ExecutionEngine/JITLink/i386/ELF_i386_absolute_relocations_16.s index 47142c4be3c09ee09b7c6e575565c7b971ef7649..092f7d753c7eae66ef0b7d6f2287b1a65deb9600 100644 --- 
a/llvm/test/ExecutionEngine/JITLink/i386/ELF_i386_absolute_relocations_16.s +++ b/llvm/test/ExecutionEngine/JITLink/i386/ELF_i386_absolute_relocations_16.s @@ -22,4 +22,12 @@ main: .type bar,@function bar: retw $external_data - .size bar, .-bar \ No newline at end of file + .size bar, .-bar + +# jitlink-check: decode_operand(baz, 0) = external_data + 23 + .globl baz + .align 2, 0x90 + .type baz,@function +baz: + retw $external_data+23 + .size baz, .-baz diff --git a/llvm/test/ExecutionEngine/JITLink/i386/ELF_i386_absolute_relocations_32.s b/llvm/test/ExecutionEngine/JITLink/i386/ELF_i386_absolute_relocations_32.s index e4b02a794bbc4a01fa9da4be0434be49b25c5bb8..a66ad8e7cda6773044d5999431e7ff82f3782e9c 100644 --- a/llvm/test/ExecutionEngine/JITLink/i386/ELF_i386_absolute_relocations_32.s +++ b/llvm/test/ExecutionEngine/JITLink/i386/ELF_i386_absolute_relocations_32.s @@ -7,17 +7,25 @@ # Test ELF 32 bit absolute relocations .text - .globl main + .globl main .p2align 4, 0x90 .type main,@function -main: +main: retl .size main, .-main # jitlink-check: decode_operand(foo, 0) = external_data - .globl foo + .globl foo .p2align 4, 0x90 .type foo,@function foo: movl external_data, %eax - .size foo, .-foo \ No newline at end of file + .size foo, .-foo + +# jitlink-check: decode_operand(bar, 0) = external_data + 4000 + .globl bar + .p2align 4, 0x90 + .type bar,@function +bar: + movl external_data + 4000, %eax + .size bar, .-bar diff --git a/llvm/test/ExecutionEngine/JITLink/i386/ELF_i386_pc_relative_relocations_32.s b/llvm/test/ExecutionEngine/JITLink/i386/ELF_i386_pc_relative_relocations_32.s index df74c7bb39324a858edbef607a66c938303246fb..0717c8f434d537100d78c3748abfe4d87e8a248e 100644 --- a/llvm/test/ExecutionEngine/JITLink/i386/ELF_i386_pc_relative_relocations_32.s +++ b/llvm/test/ExecutionEngine/JITLink/i386/ELF_i386_pc_relative_relocations_32.s @@ -33,11 +33,12 @@ foo: # Tests PC relative relocation for negative offset from PC -# jitlink-check: decode_operand(baz, 0) = fooz - next_pc(baz) +# jitlink-check: decode_operand(baz, 0) = fooz - next_pc(baz) + 1 .globl fooz .p2align 4 .type fooz,@function fooz: + nop retl .size fooz, .-fooz @@ -45,5 +46,5 @@ fooz: .p2align 4 .type baz,@function baz: - calll fooz - .size baz, .-baz \ No newline at end of file + calll fooz+1 + .size baz, .-baz diff --git a/llvm/test/ExecutionEngine/JITLink/i386/ELF_i386_small_pic_relocations_got.s b/llvm/test/ExecutionEngine/JITLink/i386/ELF_i386_small_pic_relocations_got.s index 91049a8a87a551845e02a4dc6051cc11df77d2b7..080341ac3bfede54fa5a2267f4609120cd38b82a 100644 --- a/llvm/test/ExecutionEngine/JITLink/i386/ELF_i386_small_pic_relocations_got.s +++ b/llvm/test/ExecutionEngine/JITLink/i386/ELF_i386_small_pic_relocations_got.s @@ -19,29 +19,29 @@ main: # Test GOT32 handling. # # We want to check both the offset to the GOT entry and its contents. 
-# jitlink-check: decode_operand(test_got, 4) = got_addr(elf_sm_pic_reloc_got.o, named_data1) - _GLOBAL_OFFSET_TABLE_ +# jitlink-check: decode_operand(test_got, 4) = got_addr(elf_sm_pic_reloc_got.o, named_data1) - _GLOBAL_OFFSET_TABLE_ + 42 # jitlink-check: *{4}(got_addr(elf_sm_pic_reloc_got.o, named_data1)) = named_data1 # -# jitlink-check: decode_operand(test_got+6, 4) = got_addr(elf_sm_pic_reloc_got.o, named_data2) - _GLOBAL_OFFSET_TABLE_ +# jitlink-check: decode_operand(test_got+6, 4) = got_addr(elf_sm_pic_reloc_got.o, named_data2) - _GLOBAL_OFFSET_TABLE_ + 5 # jitlink-check: *{4}(got_addr(elf_sm_pic_reloc_got.o, named_data2)) = named_data2 .globl test_got .p2align 4, 0x90 .type test_got,@function test_got: - leal named_data1@GOT, %eax - leal named_data2@GOT, %eax + leal named_data1@GOT+42, %eax + leal named_data2@GOT+5, %eax .size test_got, .-test_got # Test GOTOFF64 handling. -# jitlink-check: decode_operand(test_gotoff, 1) = named_func - _GLOBAL_OFFSET_TABLE_ +# jitlink-check: decode_operand(test_gotoff, 1) = named_func - _GLOBAL_OFFSET_TABLE_ + 99 .globl test_gotoff .p2align 4, 0x90 .type test_gotoff,@function test_gotoff: - mov $named_func@GOTOFF, %eax + mov $named_func@GOTOFF+99, %eax .size test_gotoff, .-test_gotoff diff --git a/llvm/test/ExecutionEngine/JITLink/i386/ELF_i386_small_pic_relocations_plt.s b/llvm/test/ExecutionEngine/JITLink/i386/ELF_i386_small_pic_relocations_plt.s index e5725a2b52c30de8b49a247eab6512203b7465a8..ce565ca2fcdda7236d7063702aefcd83ec5542c7 100644 --- a/llvm/test/ExecutionEngine/JITLink/i386/ELF_i386_small_pic_relocations_plt.s +++ b/llvm/test/ExecutionEngine/JITLink/i386/ELF_i386_small_pic_relocations_plt.s @@ -27,12 +27,12 @@ main: # for position independent code, first, as there may be future use-cases # where we would want to disable the optimization. # -# jitlink-check: decode_operand(test_call_extern_plt, 0) = external_func - next_pc(test_call_extern_plt) +# jitlink-check: decode_operand(test_call_extern_plt, 0) = external_func - next_pc(test_call_extern_plt) + 53 # jitlink-check: *{4}(got_addr(elf_sm_pic_reloc_plt.o, external_func))= external_func .globl test_call_extern_plt .p2align 4, 0x90 .type test_call_extern_plt,@function test_call_extern_plt: - call external_func@plt + call external_func@plt + 53 - .size test_call_extern_plt, .-test_call_extern_plt \ No newline at end of file + .size test_call_extern_plt, .-test_call_extern_plt diff --git a/llvm/test/ExecutionEngine/JITLink/x86-64/ELF_R_X86_64_SIZE.s b/llvm/test/ExecutionEngine/JITLink/x86-64/ELF_R_X86_64_SIZE.s new file mode 100644 index 0000000000000000000000000000000000000000..abde122f76e23703495bc0691f50c1a60a469cef --- /dev/null +++ b/llvm/test/ExecutionEngine/JITLink/x86-64/ELF_R_X86_64_SIZE.s @@ -0,0 +1,27 @@ +# Checks that JITLink is able to handle R_X86_64_SIZE32/R_X86_64_SIZE64 relocations. +# RUN: llvm-mc -triple=x86_64-unknown-linux -position-independent \ +# RUN: -filetype=obj -o %t.1.o %s +# RUN: llvm-jitlink -noexec %t.1.o + +# Checks that JITLink emits an error message when the fixup cannot fit into a 32-bit value. 
+# RUN: llvm-mc -triple=x86_64-unknown-linux -position-independent --defsym=OVERFLOW=1 \ +# RUN: -filetype=obj -o %t.2.o %s +# RUN: not llvm-jitlink -noexec %t.2.o 2>&1 | FileCheck %s +# CHECK: llvm-jitlink error: In graph {{.*}}, section .text: relocation target "main" at address {{.*}} is out of range of Size32 fixup at {{.*}} (main, {{.*}}) + + .text + .globl main + .type main,@function +main: + xorl %eax, %eax + movq main@SIZE + 2, %rbx # Generate R_X86_64_SIZE32 relocation. +.ifndef OVERFLOW + movl main@SIZE + 1, %ebx # Generate R_X86_64_SIZE32 relocation. +.else + movl main@SIZE - 32, %ebx # Generate R_X86_64_SIZE32 relocation whose fixup overflows. +.endif + retq + .size main, .-main + + .data + .quad main@SIZE + 1 # Generate R_X86_64_SIZE64 relocation. diff --git a/llvm/test/Feature/fp-intrinsics.ll b/llvm/test/Feature/fp-intrinsics.ll index 80f8b15abfaabecaea51d1f71862dd13d3a1fc3f..ada22c39abc9e719a2d283105bb6b65f97f154fa 100644 --- a/llvm/test/Feature/fp-intrinsics.ll +++ b/llvm/test/Feature/fp-intrinsics.ll @@ -195,6 +195,19 @@ entry: ret double %result } +; Verify that atan2(42.0, 23.0) isn't simplified when the rounding mode is unknown. +; CHECK-LABEL: fatan2 +; CHECK: call double @llvm.experimental.constrained.atan2 +define double @fatan2() #0 { +entry: + %result = call double @llvm.experimental.constrained.atan2.f64( + double 42.0, + double 23.0, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret double %result +} + ; Verify that cosh(42.0) isn't simplified when the rounding mode is unknown. ; CHECK-LABEL: fcosh ; CHECK: call double @llvm.experimental.constrained.cosh diff --git a/llvm/test/FileCheck/.gitattributes b/llvm/test/FileCheck/.gitattributes new file mode 100644 index 0000000000000000000000000000000000000000..ba27d7fad76d504bb5a6dde2ed156b680ba1d69d --- /dev/null +++ b/llvm/test/FileCheck/.gitattributes @@ -0,0 +1 @@ +dos-style-eol.txt text eol=crlf diff --git a/llvm/test/FileCheck/dos-style-eol.txt b/llvm/test/FileCheck/dos-style-eol.txt index 4252aad4d3e7bfd52bb4686c6cfdf79a8d8d5223..52184f465c3fdf98fe9c5a86a96e47d51c2b698d 100644 --- a/llvm/test/FileCheck/dos-style-eol.txt +++ b/llvm/test/FileCheck/dos-style-eol.txt @@ -1,11 +1,11 @@ -// Test for using FileCheck on DOS style end-of-line -// This test was deliberately committed with DOS style end of line. -// Don't change line endings! -// RUN: FileCheck -input-file %s %s -// RUN: FileCheck --strict-whitespace -input-file %s %s - -LINE 1 -; CHECK: {{^}}LINE 1{{$}} - -LINE 2 +// Test for using FileCheck on DOS style end-of-line +// This test was deliberately committed with DOS style end of line. +// Don't change line endings! 
+// RUN: FileCheck -input-file %s %s +// RUN: FileCheck --strict-whitespace -input-file %s %s + +LINE 1 +; CHECK: {{^}}LINE 1{{$}} + +LINE 2 ; CHECK: {{^}}LINE 2{{$}} \ No newline at end of file diff --git a/llvm/test/Instrumentation/AddressSanitizer/asan-detect-invalid-pointer-pair.ll b/llvm/test/Instrumentation/AddressSanitizer/asan-detect-invalid-pointer-pair.ll index 9cf40fe06c1237682f60fdf357138bc3a4d7d114..a98df8b1d104a6116b3bb554a5bd627447e75f7e 100644 --- a/llvm/test/Instrumentation/AddressSanitizer/asan-detect-invalid-pointer-pair.ll +++ b/llvm/test/Instrumentation/AddressSanitizer/asan-detect-invalid-pointer-pair.ll @@ -1,9 +1,6 @@ -; RUN: opt < %s -passes=asan -asan-detect-invalid-pointer-cmp -S \ -; RUN: | FileCheck %s --check-prefixes=CMP,NOSUB,ALL -; RUN: opt < %s -passes=asan -asan-detect-invalid-pointer-sub -S \ -; RUN: | FileCheck %s --check-prefixes=SUB,NOCMP,ALL -; RUN: opt < %s -passes=asan -asan-detect-invalid-pointer-pair -S \ -; RUN: | FileCheck %s --check-prefixes=CMP,SUB,ALL +; RUN: opt < %s -passes=asan -asan-detect-invalid-pointer-cmp -S | FileCheck %s --check-prefixes=CMP,NOSUB,ALL +; RUN: opt < %s -passes=asan -asan-detect-invalid-pointer-sub -S | FileCheck %s --check-prefixes=SUB,NOCMP,ALL +; RUN: opt < %s -passes=asan -asan-detect-invalid-pointer-pair -S | FileCheck %s --check-prefixes=CMP,SUB,ALL ; Support instrumentation of invalid pointer pair detection. target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" diff --git a/llvm/test/Instrumentation/AddressSanitizer/asan-funclet.ll b/llvm/test/Instrumentation/AddressSanitizer/asan-funclet.ll index e986f108542688770b4be6906283a97c6de5c07d..e9c1075a2cb94b9a208b8b495f8f78dbd01eeb15 100644 --- a/llvm/test/Instrumentation/AddressSanitizer/asan-funclet.ll +++ b/llvm/test/Instrumentation/AddressSanitizer/asan-funclet.ll @@ -1,10 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 ; Test appropriate tagging of funclet for function calls generated by asan. 
-; RUN: opt -S -passes=asan,win-eh-prepare -asan-use-stack-safety=0 -asan-max-inline-poisoning-size=0 \ -; RUN: -asan-detect-invalid-pointer-cmp -asan-detect-invalid-pointer-sub -asan-use-after-scope < %s | FileCheck %s --check-prefixes=CHECK,CHECK-INLINE -; RUN: opt -S -passes=asan,win-eh-prepare -asan-use-stack-safety=0 -asan-max-inline-poisoning-size=0 -asan-instrumentation-with-call-threshold=0 \ -; RUN: -asan-detect-invalid-pointer-cmp -asan-detect-invalid-pointer-sub -asan-use-after-scope < %s | FileCheck %s --check-prefixes=CHECK,CHECK-OUTLINE +; RUN: opt -S -passes=asan,win-eh-prepare -asan-use-stack-safety=0 -asan-max-inline-poisoning-size=0 -asan-detect-invalid-pointer-cmp -asan-detect-invalid-pointer-sub -asan-use-after-scope < %s | FileCheck %s --check-prefixes=CHECK,CHECK-INLINE +; RUN: opt -S -passes=asan,win-eh-prepare -asan-use-stack-safety=0 -asan-max-inline-poisoning-size=0 -asan-instrumentation-with-call-threshold=0 -asan-detect-invalid-pointer-cmp -asan-detect-invalid-pointer-sub -asan-use-after-scope < %s | FileCheck %s --check-prefixes=CHECK,CHECK-OUTLINE ; REQUIRES: x86-registered-target diff --git a/llvm/test/Instrumentation/AddressSanitizer/asan-masked-load-store.ll b/llvm/test/Instrumentation/AddressSanitizer/asan-masked-load-store.ll index 9a641287960f8e41f069c37f24ffc39c5c6395e9..597b3bb855b42b8114921b70d9a68a354e535545 100644 --- a/llvm/test/Instrumentation/AddressSanitizer/asan-masked-load-store.ll +++ b/llvm/test/Instrumentation/AddressSanitizer/asan-masked-load-store.ll @@ -1,8 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -passes=asan -asan-instrumentation-with-call-threshold=0 -S \ -; RUN: | FileCheck %s -; RUN: opt < %s -passes=asan -asan-instrumentation-with-call-threshold=0 -asan-instrument-reads=0 -asan-instrument-writes=0 -S \ -; RUN: | FileCheck %s -check-prefix=DISABLED +; RUN: opt < %s -passes=asan -asan-instrumentation-with-call-threshold=0 -S | FileCheck %s +; RUN: opt < %s -passes=asan -asan-instrumentation-with-call-threshold=0 -asan-instrument-reads=0 -asan-instrument-writes=0 -S | FileCheck %s -check-prefix=DISABLED ; Support ASan instrumentation for constant-mask llvm.masked.{load,store} diff --git a/llvm/test/Instrumentation/AddressSanitizer/asan-optimize-callbacks.ll b/llvm/test/Instrumentation/AddressSanitizer/asan-optimize-callbacks.ll index ea028066e5857d69ca0a7961bbf760f2cdf15bee..c4ea78a0c91a7d717981ab9576a3429479786fd8 100644 --- a/llvm/test/Instrumentation/AddressSanitizer/asan-optimize-callbacks.ll +++ b/llvm/test/Instrumentation/AddressSanitizer/asan-optimize-callbacks.ll @@ -1,8 +1,5 @@ -; RUN: opt < %s -passes=asan -asan-instrumentation-with-call-threshold=0 \ -; RUN: -asan-optimize-callbacks -S | FileCheck %s --check-prefixes=LOAD,STORE -; RUN: opt < %s -passes=asan -asan-instrumentation-with-call-threshold=0 \ -; RUN: -asan-optimize-callbacks --asan-kernel -S | \ -; RUN: FileCheck %s --check-prefixes=LOAD-KERNEL,STORE-KERNEL +; RUN: opt < %s -passes=asan -asan-instrumentation-with-call-threshold=0 -asan-optimize-callbacks -S | FileCheck %s --check-prefixes=LOAD,STORE +; RUN: opt < %s -passes=asan -asan-instrumentation-with-call-threshold=0 -asan-optimize-callbacks --asan-kernel -S | FileCheck %s --check-prefixes=LOAD-KERNEL,STORE-KERNEL target triple = "x86_64-unknown-linux-gnu" diff --git a/llvm/test/Instrumentation/AddressSanitizer/asan-vp-load-store.ll b/llvm/test/Instrumentation/AddressSanitizer/asan-vp-load-store.ll index 
ee752c8c61da080a311efccea6a2767197522450..d22671aa84f86a82f640fff3e3a136b2524cec13 100644 --- a/llvm/test/Instrumentation/AddressSanitizer/asan-vp-load-store.ll +++ b/llvm/test/Instrumentation/AddressSanitizer/asan-vp-load-store.ll @@ -1,8 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -passes=asan -asan-instrumentation-with-call-threshold=0 -S \ -; RUN: | FileCheck %s -; RUN: opt < %s -passes=asan -asan-instrumentation-with-call-threshold=0 -asan-instrument-reads=0 -asan-instrument-writes=0 -S \ -; RUN: | FileCheck %s -check-prefix=DISABLED +; RUN: opt < %s -passes=asan -asan-instrumentation-with-call-threshold=0 -S | FileCheck %s +; RUN: opt < %s -passes=asan -asan-instrumentation-with-call-threshold=0 -asan-instrument-reads=0 -asan-instrument-writes=0 -S | FileCheck %s -check-prefix=DISABLED ; Support ASan instrumentation for constant-mask llvm.vp.{load,store} diff --git a/llvm/test/Instrumentation/AddressSanitizer/freebsd.ll b/llvm/test/Instrumentation/AddressSanitizer/freebsd.ll index 5e09f74005888cb9a86b0a1dcf8d70c4cde17e86..c09f7a513072e1f0c957dcda9d79d9939e1ef9f8 100644 --- a/llvm/test/Instrumentation/AddressSanitizer/freebsd.ll +++ b/llvm/test/Instrumentation/AddressSanitizer/freebsd.ll @@ -1,22 +1,10 @@ -; RUN: opt < %s -passes=asan -S \ -; RUN: -mtriple=i386-unknown-freebsd \ -; RUN: -data-layout="e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128" | \ -; RUN: FileCheck --check-prefix=CHECK-32 %s - -; RUN: opt < %s -passes=asan -S \ -; RUN: -mtriple=x86_64-unknown-freebsd \ -; RUN: -data-layout="e-m:e-i64:64-f80:128-n8:16:32:64-S128" | \ -; RUN: FileCheck --check-prefix=CHECK-64 %s - -; RUN: opt < %s -passes=asan -S \ -; RUN: -mtriple=aarch64-unknown-freebsd \ -; RUN: -data-layout="e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" | \ -; RUN: FileCheck --check-prefix=CHECK-AARCH64 %s - -; RUN: opt < %s -passes=asan -S \ -; RUN: -mtriple=mips64-unknown-freebsd \ -; RUN: -data-layout="E-m:e-i64:64-n32:64-S128" | \ -; RUN: FileCheck --check-prefix=CHECK-MIPS64 %s +; RUN: opt < %s -passes=asan -S -mtriple=i386-unknown-freebsd -data-layout="e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128" | FileCheck --check-prefix=CHECK-32 %s + +; RUN: opt < %s -passes=asan -S -mtriple=x86_64-unknown-freebsd -data-layout="e-m:e-i64:64-f80:128-n8:16:32:64-S128" | FileCheck --check-prefix=CHECK-64 %s + +; RUN: opt < %s -passes=asan -S -mtriple=aarch64-unknown-freebsd -data-layout="e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" | FileCheck --check-prefix=CHECK-AARCH64 %s + +; RUN: opt < %s -passes=asan -S -mtriple=mips64-unknown-freebsd -data-layout="E-m:e-i64:64-n32:64-S128" | FileCheck --check-prefix=CHECK-MIPS64 %s define i32 @read_4_bytes(ptr %a) sanitize_address { entry: diff --git a/llvm/test/Instrumentation/AddressSanitizer/no-global-ctors.ll b/llvm/test/Instrumentation/AddressSanitizer/no-global-ctors.ll index 79ed0b78fd6a4f0bf6c7992fbdaaedc8030aeb7d..abbc5bda1238c6888398fbfadeb8e8f365a9dc72 100644 --- a/llvm/test/Instrumentation/AddressSanitizer/no-global-ctors.ll +++ b/llvm/test/Instrumentation/AddressSanitizer/no-global-ctors.ll @@ -1,13 +1,10 @@ ; Check Default behaviour still emits ctors -; RUN: opt < %s -passes=asan -S | \ -; RUN: FileCheck -check-prefix=CHECK-DEFAULT %s +; RUN: opt < %s -passes=asan -S | FileCheck -check-prefix=CHECK-DEFAULT %s ; CHECK-DEFAULT: llvm.global_ctor{{.+}}asan.module_ctor ; CHECK-DEFAULT: define internal void @asan.module_ctor ; Check with ctor emission disabled -; RUN: opt < %s -passes=asan \ -; RUN: 
-asan-constructor-kind=none -S | \ -; RUN: FileCheck %s +; RUN: opt < %s -passes=asan -asan-constructor-kind=none -S | FileCheck %s ; CHECK-NOT: llvm.global_ctor{{.+}}asan.module_ctor ; CHECK-NOT: define internal void @asan.module_ctor diff --git a/llvm/test/Instrumentation/AddressSanitizer/no_global_dtors.ll b/llvm/test/Instrumentation/AddressSanitizer/no_global_dtors.ll index b927322e403a01f665c90da435dc7e9f6c17efc7..5204d80f1db529b157e76b201df6d9ddaa9b28a8 100644 --- a/llvm/test/Instrumentation/AddressSanitizer/no_global_dtors.ll +++ b/llvm/test/Instrumentation/AddressSanitizer/no_global_dtors.ll @@ -1,13 +1,10 @@ ; Check Default behaviour still emits dtors -; RUN: opt < %s -passes=asan -S | \ -; RUN: FileCheck -check-prefix=CHECK-DEFAULT %s +; RUN: opt < %s -passes=asan -S | FileCheck -check-prefix=CHECK-DEFAULT %s ; CHECK-DEFAULT: llvm.global_dtor{{.+}}asan.module_dtor ; CHECK-DEFAULT: define internal void @asan.module_dtor ; Check with dtor emission disabled -; RUN: opt < %s -passes=asan \ -; RUN: -asan-destructor-kind=none -S | \ -; RUN: FileCheck %s +; RUN: opt < %s -passes=asan -asan-destructor-kind=none -S | FileCheck %s ; CHECK-NOT: llvm.global_dtor{{.+}}asan.module_dtor ; CHECK-NOT: define internal void @asan.module_dtor diff --git a/llvm/test/Instrumentation/AddressSanitizer/stack_dynamic_alloca.ll b/llvm/test/Instrumentation/AddressSanitizer/stack_dynamic_alloca.ll index cbb2001c45e651c3d29e5922554b0f080b5bd287..d56cd340d70abb4565a07f4527f9ec7e0863fba7 100644 --- a/llvm/test/Instrumentation/AddressSanitizer/stack_dynamic_alloca.ll +++ b/llvm/test/Instrumentation/AddressSanitizer/stack_dynamic_alloca.ll @@ -1,17 +1,7 @@ -; RUN: opt < %s -passes=asan -asan-stack-dynamic-alloca -asan-use-stack-safety=0 \ -; RUN: -asan-use-after-return=runtime -S | FileCheck %s \ -; RUN: --check-prefixes=CHECK,CHECK-RUNTIME -; RUN: opt < %s -passes=asan -asan-stack-dynamic-alloca -asan-mapping-scale=5 -asan-use-stack-safety=0 \ -; RUN: -asan-use-after-return=runtime -S | FileCheck %s \ -; RUN: --check-prefixes=CHECK,CHECK-RUNTIME -; RUN: opt < %s -passes=asan -asan-stack-dynamic-alloca -asan-use-stack-safety=0 \ -; RUN: -asan-use-after-return=always -S | FileCheck %s \ -; RUN: --check-prefixes=CHECK,CHECK-ALWAYS \ -; RUN: --implicit-check-not=__asan_option_detect_stack_use_after_return -; RUN: opt < %s -passes=asan -asan-stack-dynamic-alloca -asan-use-stack-safety=0 \ -; RUN: -asan-use-after-return=always -S | FileCheck %s \ -; RUN: --check-prefixes=CHECK,CHECK-ALWAYS \ -; RUN: --implicit-check-not=__asan_option_detect_stack_use_after_return +; RUN: opt < %s -passes=asan -asan-stack-dynamic-alloca -asan-use-stack-safety=0 -asan-use-after-return=runtime -S | FileCheck %s --check-prefixes=CHECK,CHECK-RUNTIME +; RUN: opt < %s -passes=asan -asan-stack-dynamic-alloca -asan-mapping-scale=5 -asan-use-stack-safety=0 -asan-use-after-return=runtime -S | FileCheck %s --check-prefixes=CHECK,CHECK-RUNTIME +; RUN: opt < %s -passes=asan -asan-stack-dynamic-alloca -asan-use-stack-safety=0 -asan-use-after-return=always -S | FileCheck %s --check-prefixes=CHECK,CHECK-ALWAYS --implicit-check-not=__asan_option_detect_stack_use_after_return +; RUN: opt < %s -passes=asan -asan-stack-dynamic-alloca -asan-use-stack-safety=0 -asan-use-after-return=always -S | FileCheck %s --check-prefixes=CHECK,CHECK-ALWAYS --implicit-check-not=__asan_option_detect_stack_use_after_return target datalayout = 
"e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" diff --git a/llvm/test/Instrumentation/AddressSanitizer/stack_layout.ll b/llvm/test/Instrumentation/AddressSanitizer/stack_layout.ll index 48465be36789d6e4953e69da3520c5486ebf96ce..afbfbd6c7a96a81769400ca7fd8e99e3ab47ac00 100644 --- a/llvm/test/Instrumentation/AddressSanitizer/stack_layout.ll +++ b/llvm/test/Instrumentation/AddressSanitizer/stack_layout.ll @@ -1,9 +1,7 @@ ; Test the ASan's stack layout. ; More tests in tests/Transforms/Utils/ASanStackFrameLayoutTest.cpp -; RUN: opt < %s -passes=asan -asan-use-stack-safety=0 -asan-stack-dynamic-alloca=0 -asan-use-after-scope -S \ -; RUN: | FileCheck %s --check-prefixes=CHECK,CHECK-STATIC -; RUN: opt < %s -passes=asan -asan-use-stack-safety=0 -asan-stack-dynamic-alloca=1 -asan-use-after-scope -S \ -; RUN: | FileCheck %s --check-prefixes=CHECK,CHECK-DYNAMIC +; RUN: opt < %s -passes=asan -asan-use-stack-safety=0 -asan-stack-dynamic-alloca=0 -asan-use-after-scope -S | FileCheck %s --check-prefixes=CHECK,CHECK-STATIC +; RUN: opt < %s -passes=asan -asan-use-stack-safety=0 -asan-stack-dynamic-alloca=1 -asan-use-after-scope -S | FileCheck %s --check-prefixes=CHECK,CHECK-DYNAMIC target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" diff --git a/llvm/test/Instrumentation/AddressSanitizer/ubsan.ll b/llvm/test/Instrumentation/AddressSanitizer/ubsan.ll index 62bbc985cb1b557bae6427753c7f88585b1938ec..5953feb9a5982d6b5a1f8c869150345d4f4d869c 100644 --- a/llvm/test/Instrumentation/AddressSanitizer/ubsan.ll +++ b/llvm/test/Instrumentation/AddressSanitizer/ubsan.ll @@ -1,8 +1,7 @@ ; ASan shouldn't instrument code added by UBSan. 
; RUN: opt < %s -passes=asan -S | FileCheck %s -; RUN: opt < %s -passes=asan -asan-detect-invalid-pointer-cmp -S \ -; RUN: | FileCheck %s --check-prefixes=NOCMP +; RUN: opt < %s -passes=asan -asan-detect-invalid-pointer-cmp -S | FileCheck %s --check-prefixes=NOCMP target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" target triple = "x86_64-unknown-linux-gnu" diff --git a/llvm/test/Instrumentation/AddressSanitizer/vector-load-store.ll b/llvm/test/Instrumentation/AddressSanitizer/vector-load-store.ll index 833dc9641d8e2b4784c2b504c642f438d94f89c2..51bef51d6b35119cfa57420661197fb1c6a5d80f 100644 --- a/llvm/test/Instrumentation/AddressSanitizer/vector-load-store.ll +++ b/llvm/test/Instrumentation/AddressSanitizer/vector-load-store.ll @@ -1,8 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -passes=asan -asan-instrumentation-with-call-threshold=100 -S \ -; RUN: | FileCheck %s -; RUN: opt < %s -passes=asan -asan-instrumentation-with-call-threshold=0 -S \ -; RUN: | FileCheck %s -check-prefix=CALLS +; RUN: opt < %s -passes=asan -asan-instrumentation-with-call-threshold=100 -S | FileCheck %s +; RUN: opt < %s -passes=asan -asan-instrumentation-with-call-threshold=0 -S | FileCheck %s -check-prefix=CALLS target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" diff --git a/llvm/test/Instrumentation/AddressSanitizer/with-ifunc.ll b/llvm/test/Instrumentation/AddressSanitizer/with-ifunc.ll index 5a9603757be31d3889cd5c179a9c6a5be0b768ff..f198fc01d2639d7677b8b34abcabbd564d414a31 100644 --- a/llvm/test/Instrumentation/AddressSanitizer/with-ifunc.ll +++ b/llvm/test/Instrumentation/AddressSanitizer/with-ifunc.ll @@ -1,19 +1,13 @@ ; Test -asan-with-ifunc flag. ; -; RUN: opt -passes=asan -S -asan-with-ifunc=0 < %s | \ -; RUN: FileCheck %s --check-prefixes=CHECK,CHECK-NOIFUNC -; RUN: opt -passes=asan -S -asan-with-ifunc=1 -asan-with-ifunc-suppress-remat=0 < %s | \ -; RUN: FileCheck %s --check-prefixes=CHECK,CHECK-IFUNC -; RUN: opt -passes=asan -S -asan-with-ifunc=1 -asan-with-ifunc-suppress-remat=1 < %s | \ -; RUN: FileCheck %s --check-prefixes=CHECK,CHECK-IFUNC-NOREMAT +; RUN: opt -passes=asan -S -asan-with-ifunc=0 < %s | FileCheck %s --check-prefixes=CHECK,CHECK-NOIFUNC +; RUN: opt -passes=asan -S -asan-with-ifunc=1 -asan-with-ifunc-suppress-remat=0 < %s | FileCheck %s --check-prefixes=CHECK,CHECK-IFUNC +; RUN: opt -passes=asan -S -asan-with-ifunc=1 -asan-with-ifunc-suppress-remat=1 < %s | FileCheck %s --check-prefixes=CHECK,CHECK-IFUNC-NOREMAT ; Pre-Lollipop Android does not support ifunc. 
-; RUN: opt -passes=asan -S -asan-with-ifunc=1 -asan-with-ifunc-suppress-remat=0 -mtriple=armv7-linux-android20 < %s | \ -; RUN: FileCheck %s --check-prefixes=CHECK,CHECK-NOIFUNC -; RUN: opt -passes=asan -S -asan-with-ifunc=1 -asan-with-ifunc-suppress-remat=0 -mtriple=armv7-linux-android < %s | \ -; RUN: FileCheck %s --check-prefixes=CHECK,CHECK-NOIFUNC -; RUN: opt -passes=asan -S -asan-with-ifunc=1 -asan-with-ifunc-suppress-remat=0 -mtriple=armv7-linux-android21 < %s | \ -; RUN: FileCheck %s --check-prefixes=CHECK,CHECK-IFUNC +; RUN: opt -passes=asan -S -asan-with-ifunc=1 -asan-with-ifunc-suppress-remat=0 -mtriple=armv7-linux-android20 < %s | FileCheck %s --check-prefixes=CHECK,CHECK-NOIFUNC +; RUN: opt -passes=asan -S -asan-with-ifunc=1 -asan-with-ifunc-suppress-remat=0 -mtriple=armv7-linux-android < %s | FileCheck %s --check-prefixes=CHECK,CHECK-NOIFUNC +; RUN: opt -passes=asan -S -asan-with-ifunc=1 -asan-with-ifunc-suppress-remat=0 -mtriple=armv7-linux-android21 < %s | FileCheck %s --check-prefixes=CHECK,CHECK-IFUNC target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64" target triple = "armv7--linux-android22" diff --git a/llvm/test/Instrumentation/HWAddressSanitizer/prologue.ll b/llvm/test/Instrumentation/HWAddressSanitizer/prologue.ll index 73fc077c9562424aa574a7577e719a9cfe12c3f5..1698592bafc62cf64d338d80529443c796c8f6bb 100644 --- a/llvm/test/Instrumentation/HWAddressSanitizer/prologue.ll +++ b/llvm/test/Instrumentation/HWAddressSanitizer/prologue.ll @@ -1,20 +1,13 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals --global-value-regex "![0-9]+" --version 2 ; Test -hwasan-with-ifunc flag. ; -; RUN: opt -passes=hwasan -S < %s | \ -; RUN: FileCheck %s -; RUN: opt -passes=hwasan -S -hwasan-mapping-offset-dynamic=tls -hwasan-record-stack-history=instr < %s | \ -; RUN: FileCheck %s --check-prefixes=NOIFUNC-TLS-HISTORY -; RUN: opt -passes=hwasan -S -hwasan-mapping-offset-dynamic=tls -hwasan-record-stack-history=none < %s | \ -; RUN: FileCheck %s --check-prefixes=NOIFUNC-TLS-NOHISTORY -; RUN: opt -passes=hwasan -S -hwasan-mapping-offset-dynamic=global -hwasan-with-frame-record=0 < %s | \ -; RUN: FileCheck %s --check-prefixes=NOIFUNC-NOTLS -; RUN: opt -passes=hwasan -S -hwasan-mapping-offset-dynamic=ifunc -hwasan-with-frame-record=0 < %s | \ -; RUN: FileCheck %s --check-prefixes=IFUNC-NOTLS -; RUN: opt -passes=hwasan -S -mtriple=aarch64-fuchsia < %s | \ -; RUN: FileCheck %s --check-prefixes=FUCHSIA -; RUN: opt -passes=hwasan -S -mtriple=aarch64-fuchsia -hwasan-record-stack-history=libcall < %s | \ -; RUN: FileCheck %s --check-prefixes=FUCHSIA-LIBCALL +; RUN: opt -passes=hwasan -S < %s | FileCheck %s +; RUN: opt -passes=hwasan -S -hwasan-mapping-offset-dynamic=tls -hwasan-record-stack-history=instr < %s | FileCheck %s --check-prefixes=NOIFUNC-TLS-HISTORY +; RUN: opt -passes=hwasan -S -hwasan-mapping-offset-dynamic=tls -hwasan-record-stack-history=none < %s | FileCheck %s --check-prefixes=NOIFUNC-TLS-NOHISTORY +; RUN: opt -passes=hwasan -S -hwasan-mapping-offset-dynamic=global -hwasan-with-frame-record=0 < %s | FileCheck %s --check-prefixes=NOIFUNC-NOTLS +; RUN: opt -passes=hwasan -S -hwasan-mapping-offset-dynamic=ifunc -hwasan-with-frame-record=0 < %s | FileCheck %s --check-prefixes=IFUNC-NOTLS +; RUN: opt -passes=hwasan -S -mtriple=aarch64-fuchsia < %s | FileCheck %s --check-prefixes=FUCHSIA +; RUN: opt -passes=hwasan -S -mtriple=aarch64-fuchsia -hwasan-record-stack-history=libcall < %s | FileCheck %s 
--check-prefixes=FUCHSIA-LIBCALL target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" target triple = "aarch64--linux-android22" diff --git a/llvm/test/Instrumentation/HWAddressSanitizer/stack-coloring.ll b/llvm/test/Instrumentation/HWAddressSanitizer/stack-coloring.ll index 253e976a7d608b3a9341d51f9e8a5370c9109062..ae6fe5776f203b069ca1b4878c6e8deb5f750616 100644 --- a/llvm/test/Instrumentation/HWAddressSanitizer/stack-coloring.ll +++ b/llvm/test/Instrumentation/HWAddressSanitizer/stack-coloring.ll @@ -1,12 +1,8 @@ ; Test that storage for allocas with disjoint lifetimes is reused with ; use-after-scope. -; RUN: opt -S -passes=hwasan %s -hwasan-use-after-scope -o - | \ -; RUN: llc -no-stack-coloring=false -o - | \ -; RUN: FileCheck %s --check-prefix=COLOR -; RUN: opt -S -passes=hwasan -hwasan-use-after-scope %s -o - | \ -; RUN: llc -no-stack-coloring=true -o - | \ -; RUN: FileCheck %s --check-prefix=NOCOLOR +; RUN: opt -S -passes=hwasan %s -hwasan-use-after-scope -o - | llc -no-stack-coloring=false -o - | FileCheck %s --check-prefix=COLOR +; RUN: opt -S -passes=hwasan -hwasan-use-after-scope %s -o - | llc -no-stack-coloring=true -o - | FileCheck %s --check-prefix=NOCOLOR target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" target triple = "aarch64-unknown-linux-android29" diff --git a/llvm/test/Instrumentation/HWAddressSanitizer/use-after-scope.ll b/llvm/test/Instrumentation/HWAddressSanitizer/use-after-scope.ll index 643b2a8c274cf15669e13dd644eef66b379341c8..16e6cda59a616739e7bf0a86dd497bc7d8fb749e 100644 --- a/llvm/test/Instrumentation/HWAddressSanitizer/use-after-scope.ll +++ b/llvm/test/Instrumentation/HWAddressSanitizer/use-after-scope.ll @@ -1,26 +1,12 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; REQUIRES: aarch64-registered-target, x86-registered-target -; RUN: opt -mtriple=x86_64-unknown-linux-gnu -passes=hwasan \ -; RUN: -hwasan-use-after-scope=1 -hwasan-generate-tags-with-calls -S < %s | \ -; RUN: FileCheck %s --check-prefixes=X86-SCOPE -; RUN: opt -mtriple=x86_64-unknown-linux-gnu -passes=hwasan \ -; RUN: -hwasan-use-after-scope=0 -hwasan-generate-tags-with-calls -S < %s | \ -; RUN: FileCheck %s --check-prefixes=X86-NOSCOPE +; RUN: opt -mtriple=x86_64-unknown-linux-gnu -passes=hwasan -hwasan-use-after-scope=1 -hwasan-generate-tags-with-calls -S < %s | FileCheck %s --check-prefixes=X86-SCOPE +; RUN: opt -mtriple=x86_64-unknown-linux-gnu -passes=hwasan -hwasan-use-after-scope=0 -hwasan-generate-tags-with-calls -S < %s | FileCheck %s --check-prefixes=X86-NOSCOPE -; RUN: opt -mtriple=aarch64-unknown-linux-android29 -passes=hwasan \ -; RUN: -hwasan-use-after-scope=1 -hwasan-generate-tags-with-calls -S < %s | \ -; RUN: FileCheck %s --check-prefixes=AARCH64-SCOPE -; RUN: opt -mtriple=aarch64-unknown-linux-android29 -passes=hwasan \ -; RUN: -hwasan-use-after-scope=0 -hwasan-generate-tags-with-calls -S < %s | \ -; RUN: FileCheck %s --check-prefixes=AARCH64-NOSCOPE -; RUN: opt -mtriple=aarch64-unknown-linux-android29 -passes=hwasan \ -; RUN: -hwasan-use-after-scope=1 -hwasan-generate-tags-with-calls \ -; RUN: -hwasan-use-short-granules=1 -S < %s | \ -; RUN: FileCheck %s --check-prefixes=AARCH64-SHORT-SCOPE -; RUN: opt -mtriple=aarch64-unknown-linux-android29 -passes=hwasan \ -; RUN: -hwasan-use-after-scope=0 -hwasan-generate-tags-with-calls \ -; RUN: -hwasan-use-short-granules=1 -S < %s | \ -; RUN: FileCheck %s --check-prefixes=AARCH64-SHORT-NOSCOPE +; RUN: opt -mtriple=aarch64-unknown-linux-android29 
-passes=hwasan -hwasan-use-after-scope=1 -hwasan-generate-tags-with-calls -S < %s | FileCheck %s --check-prefixes=AARCH64-SCOPE +; RUN: opt -mtriple=aarch64-unknown-linux-android29 -passes=hwasan -hwasan-use-after-scope=0 -hwasan-generate-tags-with-calls -S < %s | FileCheck %s --check-prefixes=AARCH64-NOSCOPE +; RUN: opt -mtriple=aarch64-unknown-linux-android29 -passes=hwasan -hwasan-use-after-scope=1 -hwasan-generate-tags-with-calls -hwasan-use-short-granules=1 -S < %s | FileCheck %s --check-prefixes=AARCH64-SHORT-SCOPE +; RUN: opt -mtriple=aarch64-unknown-linux-android29 -passes=hwasan -hwasan-use-after-scope=0 -hwasan-generate-tags-with-calls -hwasan-use-short-granules=1 -S < %s | FileCheck %s --check-prefixes=AARCH64-SHORT-NOSCOPE define dso_local i32 @standard_lifetime() local_unnamed_addr sanitize_hwaddress { diff --git a/llvm/test/Instrumentation/HeapProfiler/masked-load-store.ll b/llvm/test/Instrumentation/HeapProfiler/masked-load-store.ll index a0a309149e8f0dd058e6912c07ed46601bc367a9..80d6e0f3b36dff397fae66a26385d340bfdbe42c 100644 --- a/llvm/test/Instrumentation/HeapProfiler/masked-load-store.ll +++ b/llvm/test/Instrumentation/HeapProfiler/masked-load-store.ll @@ -1,11 +1,7 @@ -; RUN: opt < %s -passes=memprof -memprof-use-callbacks -S \ -; RUN: | FileCheck %s -check-prefix=LOAD -check-prefix=STORE -check-prefix=ALL -; RUN: opt < %s -passes=memprof -memprof-use-callbacks -memprof-instrument-reads=0 -S \ -; RUN: | FileCheck %s -check-prefix=NOLOAD -check-prefix=STORE -check-prefix=ALL -; RUN: opt < %s -passes=memprof -memprof-use-callbacks -memprof-instrument-writes=0 -S \ -; RUN: | FileCheck %s -check-prefix=LOAD -check-prefix=NOSTORE -check-prefix=ALL -; RUN: opt < %s -passes=memprof -memprof-use-callbacks -memprof-instrument-reads=0 -memprof-instrument-writes=0 -S \ -; RUN: | FileCheck %s -check-prefix=NOLOAD -check-prefix=NOSTORE -check-prefix=ALL +; RUN: opt < %s -passes=memprof -memprof-use-callbacks -S | FileCheck %s -check-prefix=LOAD -check-prefix=STORE -check-prefix=ALL +; RUN: opt < %s -passes=memprof -memprof-use-callbacks -memprof-instrument-reads=0 -S | FileCheck %s -check-prefix=NOLOAD -check-prefix=STORE -check-prefix=ALL +; RUN: opt < %s -passes=memprof -memprof-use-callbacks -memprof-instrument-writes=0 -S | FileCheck %s -check-prefix=LOAD -check-prefix=NOSTORE -check-prefix=ALL +; RUN: opt < %s -passes=memprof -memprof-use-callbacks -memprof-instrument-reads=0 -memprof-instrument-writes=0 -S | FileCheck %s -check-prefix=NOLOAD -check-prefix=NOSTORE -check-prefix=ALL ; Support memory profiling instrumentation for constant-mask llvm.masked.{load,store} target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/msan_x86_bts_asm.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/msan_x86_bts_asm.ll index dd2fecb081be234e077a1c950aa69eaf806c2a48..57a2599dbab8c37cd79eac5792bdf376ed5a7639 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/X86/msan_x86_bts_asm.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/X86/msan_x86_bts_asm.ll @@ -1,10 +1,6 @@ ; Test for the conservative assembly handling mode used by KMSAN. 
-; RUN: opt < %s -msan-kernel=1 -msan-check-access-address=0 \ -; RUN: -msan-handle-asm-conservative=0 -S -passes=msan 2>&1 | FileCheck \ -; RUN: "-check-prefix=CHECK" %s -; RUN: opt < %s -msan-kernel=1 -msan-check-access-address=0 \ -; RUN: -msan-handle-asm-conservative=1 -S -passes=msan 2>&1 | FileCheck \ -; RUN: "-check-prefixes=CHECK,CHECK-CONS" %s +; RUN: opt < %s -msan-kernel=1 -msan-check-access-address=0 -msan-handle-asm-conservative=0 -S -passes=msan 2>&1 | FileCheck "-check-prefix=CHECK" %s +; RUN: opt < %s -msan-kernel=1 -msan-check-access-address=0 -msan-handle-asm-conservative=1 -S -passes=msan 2>&1 | FileCheck "-check-prefixes=CHECK,CHECK-CONS" %s target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/msan_x86intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/msan_x86intrinsics.ll index a83a94a06b98f75f0a9668ca2d030dd47168a162..a0e7b13e5c9ee5426ec44b8308db2cf4b2afd1a3 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/X86/msan_x86intrinsics.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/X86/msan_x86intrinsics.ll @@ -1,8 +1,5 @@ -; RUN: opt < %s -msan-check-access-address=0 -S -passes=msan 2>&1 | FileCheck \ -; RUN: %s -; RUN: opt < %s -msan-check-access-address=0 -msan-track-origins=1 -S \ -; RUN: -passes=msan 2>&1 | FileCheck -check-prefix=CHECK \ -; RUN: -check-prefix=CHECK-ORIGINS %s +; RUN: opt < %s -msan-check-access-address=0 -S -passes=msan 2>&1 | FileCheck %s +; RUN: opt < %s -msan-check-access-address=0 -msan-track-origins=1 -S -passes=msan 2>&1 | FileCheck -check-prefix=CHECK -check-prefix=CHECK-ORIGINS %s ; REQUIRES: x86-registered-target target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/vararg-too-large.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/vararg-too-large.ll index adb3e208d8553ec6749473c79ee52e5421ac27fe..9a7f4b985293c0a4ce50703ec4332feb94f783c5 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/X86/vararg-too-large.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/X86/vararg-too-large.ll @@ -1,5 +1,4 @@ -; RUN: opt < %s -msan-check-access-address=0 -S 2>&1 -passes=msan | FileCheck \ -; RUN: %s +; RUN: opt < %s -msan-check-access-address=0 -S 2>&1 -passes=msan | FileCheck %s ; Test that MSan doesn't generate code overflowing __msan_va_arg_tls when too many arguments are ; passed to a variadic function. 
diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/vararg_call.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/vararg_call.ll index 32d43e11fbd9b8a5c40ba79de85e45bd52ef6928..7a3f0dd88f9c1e686de8998e3c71526e4dbd36f4 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/X86/vararg_call.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/X86/vararg_call.ll @@ -1,11 +1,7 @@ -; RUN: opt < %s -msan-check-access-address=0 -S -passes=msan 2>&1 | FileCheck \ -; RUN: %s -; RUN: opt < %s -msan-check-access-address=0 -msan-track-origins=1 -S \ -; RUN: -passes=msan 2>&1 | FileCheck %s "--check-prefixes=CHECK,CHECK-ORIGIN" -; RUN: opt < %s -msan-check-access-address=0 -S \ -; RUN: -passes="msan" 2>&1 | FileCheck %s "--check-prefixes=CHECK,CHECK-ORIGIN" -; RUN: opt < %s -msan-check-access-address=0 -msan-track-origins=2 -S \ -; RUN: -passes=msan 2>&1 | FileCheck %s "--check-prefixes=CHECK,CHECK-ORIGIN" +; RUN: opt < %s -msan-check-access-address=0 -S -passes=msan 2>&1 | FileCheck %s +; RUN: opt < %s -msan-check-access-address=0 -msan-track-origins=1 -S -passes=msan 2>&1 | FileCheck %s "--check-prefixes=CHECK,CHECK-ORIGIN" +; RUN: opt < %s -msan-check-access-address=0 -S -passes="msan" 2>&1 | FileCheck %s "--check-prefixes=CHECK,CHECK-ORIGIN" +; RUN: opt < %s -msan-check-access-address=0 -msan-track-origins=2 -S -passes=msan 2>&1 | FileCheck %s "--check-prefixes=CHECK,CHECK-ORIGIN" ; Test that shadow and origin are stored for variadic function params. diff --git a/llvm/test/Instrumentation/MemorySanitizer/array_types.ll b/llvm/test/Instrumentation/MemorySanitizer/array_types.ll index d9e4eeb7f7cbab7b99f0702dff9608d32ca1e048..236b019147036410800b812d676d041f7f6d1943 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/array_types.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/array_types.ll @@ -1,8 +1,5 @@ -; RUN: opt < %s -msan-check-access-address=0 -S -passes=msan 2>&1 | FileCheck \ -; RUN: %s -; RUN: opt < %s -msan-check-access-address=0 -msan-track-origins=1 -S \ -; RUN: -passes=msan 2>&1 | FileCheck -check-prefix=CHECK \ -; RUN: %s --allow-empty +; RUN: opt < %s -msan-check-access-address=0 -S -passes=msan 2>&1 | FileCheck %s +; RUN: opt < %s -msan-check-access-address=0 -msan-track-origins=1 -S -passes=msan 2>&1 | FileCheck -check-prefix=CHECK %s --allow-empty target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" diff --git a/llvm/test/Instrumentation/MemorySanitizer/bmi.ll b/llvm/test/Instrumentation/MemorySanitizer/bmi.ll index 327fec0ed702edea39022dc1c165513d9c075cde..2f60bd8b357b8605c333220854e65f6899ddc9f9 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/bmi.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/bmi.ll @@ -1,5 +1,4 @@ -; RUN: opt < %s -msan-check-access-address=0 -S -passes=msan 2>&1 | FileCheck \ -; RUN: %s +; RUN: opt < %s -msan-check-access-address=0 -S -passes=msan 2>&1 | FileCheck %s ; REQUIRES: x86-registered-target target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" diff --git a/llvm/test/Instrumentation/MemorySanitizer/byval-alignment.ll b/llvm/test/Instrumentation/MemorySanitizer/byval-alignment.ll index f83a92287d247b11d78d3c3f88f47f6ae08c9b85..e06576e2fead6523968c811932b67df7e74f4cde 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/byval-alignment.ll +++ 
b/llvm/test/Instrumentation/MemorySanitizer/byval-alignment.ll @@ -1,7 +1,6 @@ ; Test that copy alignment for byval arguments is limited by param-tls slot alignment. -; RUN: opt < %s -msan-check-access-address=0 -S -passes=msan 2>&1 | FileCheck \ -; RUN: %s +; RUN: opt < %s -msan-check-access-address=0 -S -passes=msan 2>&1 | FileCheck %s target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" diff --git a/llvm/test/Instrumentation/MemorySanitizer/check-array.ll b/llvm/test/Instrumentation/MemorySanitizer/check-array.ll index a4e5e37eaaf867f4a2486b8e2a7162014335dbc6..3ffa0d254ab042e0e9869f23c7c012aaaa7e3f9d 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/check-array.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/check-array.ll @@ -1,5 +1,4 @@ -; RUN: opt < %s -msan-eager-checks -msan-check-access-address=0 -msan-track-origins=1 -S -passes='module(msan)' 2>&1 | \ -; RUN: FileCheck -allow-deprecated-dag-overlap --check-prefix=CHECK %s +; RUN: opt < %s -msan-eager-checks -msan-check-access-address=0 -msan-track-origins=1 -S -passes='module(msan)' 2>&1 | FileCheck -allow-deprecated-dag-overlap --check-prefix=CHECK %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" diff --git a/llvm/test/Instrumentation/MemorySanitizer/check-struct.ll b/llvm/test/Instrumentation/MemorySanitizer/check-struct.ll index d24ffe32961959f32199418fbab8fb62b922482c..0be50758bd21ebec0e9eae3286091f10803fc75b 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/check-struct.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/check-struct.ll @@ -1,5 +1,4 @@ -; RUN: opt < %s -msan-check-access-address=0 -msan-track-origins=1 -S -passes='module(msan)' 2>&1 | \ -; RUN: FileCheck -allow-deprecated-dag-overlap --check-prefix=CHECK %s +; RUN: opt < %s -msan-check-access-address=0 -msan-track-origins=1 -S -passes='module(msan)' 2>&1 | FileCheck -allow-deprecated-dag-overlap --check-prefix=CHECK %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" diff --git a/llvm/test/Instrumentation/MemorySanitizer/clmul.ll b/llvm/test/Instrumentation/MemorySanitizer/clmul.ll index ae4f2d2c868a0d9c8bda84029e47640028e7d8b8..a498e1932d4bbebd8891658eccd4ace736b9f759 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/clmul.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/clmul.ll @@ -1,5 +1,4 @@ -; RUN: opt < %s -msan-check-access-address=0 -S -passes=msan 2>&1 | FileCheck \ -; RUN: %s +; RUN: opt < %s -msan-check-access-address=0 -S -passes=msan 2>&1 | FileCheck %s ; REQUIRES: x86-registered-target target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" diff --git a/llvm/test/Instrumentation/MemorySanitizer/i386/msan_x86_bts_asm.ll b/llvm/test/Instrumentation/MemorySanitizer/i386/msan_x86_bts_asm.ll index dd2fecb081be234e077a1c950aa69eaf806c2a48..57a2599dbab8c37cd79eac5792bdf376ed5a7639 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/i386/msan_x86_bts_asm.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/i386/msan_x86_bts_asm.ll @@ -1,10 +1,6 @@ ; Test for the conservative assembly handling mode used by KMSAN. 
-; RUN: opt < %s -msan-kernel=1 -msan-check-access-address=0 \ -; RUN: -msan-handle-asm-conservative=0 -S -passes=msan 2>&1 | FileCheck \ -; RUN: "-check-prefix=CHECK" %s -; RUN: opt < %s -msan-kernel=1 -msan-check-access-address=0 \ -; RUN: -msan-handle-asm-conservative=1 -S -passes=msan 2>&1 | FileCheck \ -; RUN: "-check-prefixes=CHECK,CHECK-CONS" %s +; RUN: opt < %s -msan-kernel=1 -msan-check-access-address=0 -msan-handle-asm-conservative=0 -S -passes=msan 2>&1 | FileCheck "-check-prefix=CHECK" %s +; RUN: opt < %s -msan-kernel=1 -msan-check-access-address=0 -msan-handle-asm-conservative=1 -S -passes=msan 2>&1 | FileCheck "-check-prefixes=CHECK,CHECK-CONS" %s target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" diff --git a/llvm/test/Instrumentation/MemorySanitizer/i386/msan_x86intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/i386/msan_x86intrinsics.ll index a83a94a06b98f75f0a9668ca2d030dd47168a162..a0e7b13e5c9ee5426ec44b8308db2cf4b2afd1a3 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/i386/msan_x86intrinsics.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/i386/msan_x86intrinsics.ll @@ -1,8 +1,5 @@ -; RUN: opt < %s -msan-check-access-address=0 -S -passes=msan 2>&1 | FileCheck \ -; RUN: %s -; RUN: opt < %s -msan-check-access-address=0 -msan-track-origins=1 -S \ -; RUN: -passes=msan 2>&1 | FileCheck -check-prefix=CHECK \ -; RUN: -check-prefix=CHECK-ORIGINS %s +; RUN: opt < %s -msan-check-access-address=0 -S -passes=msan 2>&1 | FileCheck %s +; RUN: opt < %s -msan-check-access-address=0 -msan-track-origins=1 -S -passes=msan 2>&1 | FileCheck -check-prefix=CHECK -check-prefix=CHECK-ORIGINS %s ; REQUIRES: x86-registered-target target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" diff --git a/llvm/test/Instrumentation/MemorySanitizer/i386/vararg-too-large.ll b/llvm/test/Instrumentation/MemorySanitizer/i386/vararg-too-large.ll index adb3e208d8553ec6749473c79ee52e5421ac27fe..9a7f4b985293c0a4ce50703ec4332feb94f783c5 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/i386/vararg-too-large.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/i386/vararg-too-large.ll @@ -1,5 +1,4 @@ -; RUN: opt < %s -msan-check-access-address=0 -S 2>&1 -passes=msan | FileCheck \ -; RUN: %s +; RUN: opt < %s -msan-check-access-address=0 -S 2>&1 -passes=msan | FileCheck %s ; Test that MSan doesn't generate code overflowing __msan_va_arg_tls when too many arguments are ; passed to a variadic function. 
diff --git a/llvm/test/Instrumentation/MemorySanitizer/i386/vararg_call.ll b/llvm/test/Instrumentation/MemorySanitizer/i386/vararg_call.ll index 32d43e11fbd9b8a5c40ba79de85e45bd52ef6928..7a3f0dd88f9c1e686de8998e3c71526e4dbd36f4 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/i386/vararg_call.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/i386/vararg_call.ll @@ -1,11 +1,7 @@ -; RUN: opt < %s -msan-check-access-address=0 -S -passes=msan 2>&1 | FileCheck \ -; RUN: %s -; RUN: opt < %s -msan-check-access-address=0 -msan-track-origins=1 -S \ -; RUN: -passes=msan 2>&1 | FileCheck %s "--check-prefixes=CHECK,CHECK-ORIGIN" -; RUN: opt < %s -msan-check-access-address=0 -S \ -; RUN: -passes="msan" 2>&1 | FileCheck %s "--check-prefixes=CHECK,CHECK-ORIGIN" -; RUN: opt < %s -msan-check-access-address=0 -msan-track-origins=2 -S \ -; RUN: -passes=msan 2>&1 | FileCheck %s "--check-prefixes=CHECK,CHECK-ORIGIN" +; RUN: opt < %s -msan-check-access-address=0 -S -passes=msan 2>&1 | FileCheck %s +; RUN: opt < %s -msan-check-access-address=0 -msan-track-origins=1 -S -passes=msan 2>&1 | FileCheck %s "--check-prefixes=CHECK,CHECK-ORIGIN" +; RUN: opt < %s -msan-check-access-address=0 -S -passes="msan" 2>&1 | FileCheck %s "--check-prefixes=CHECK,CHECK-ORIGIN" +; RUN: opt < %s -msan-check-access-address=0 -msan-track-origins=2 -S -passes=msan 2>&1 | FileCheck %s "--check-prefixes=CHECK,CHECK-ORIGIN" ; Test that shadow and origin are stored for variadic function params. diff --git a/llvm/test/Instrumentation/MemorySanitizer/instrumentation-with-call-threshold.ll b/llvm/test/Instrumentation/MemorySanitizer/instrumentation-with-call-threshold.ll index d7156540e003fe2ff11f175a672379dbf1942e19..b0069d2bae88e8002f257273ce1e865eaf7324f0 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/instrumentation-with-call-threshold.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/instrumentation-with-call-threshold.ll @@ -2,17 +2,9 @@ ; Test that in with-calls mode there are no calls to __msan_chain_origin - they ; are done from __msan_maybe_store_origin_*. 
-; RUN: opt < %s -msan-check-access-address=0 \ -; RUN: -msan-instrumentation-with-call-threshold=0 -S -passes=msan 2>&1 | \ -; RUN: FileCheck %s -; RUN: opt < %s -msan-check-access-address=0 \ -; RUN: -msan-instrumentation-with-call-threshold=0 -msan-track-origins=1 -S \ -; RUN: -passes=msan 2>&1 | FileCheck -check-prefix=CHECK \ -; RUN: -check-prefix=CHECK-ORIGINS %s -; RUN: opt < %s -msan-check-access-address=0 \ -; RUN: -msan-instrumentation-with-call-threshold=0 -msan-track-origins=2 -S \ -; RUN: -passes=msan 2>&1 | FileCheck -check-prefix=CHECK \ -; RUN: -check-prefix=CHECK-ORIGINS %s +; RUN: opt < %s -msan-check-access-address=0 -msan-instrumentation-with-call-threshold=0 -S -passes=msan 2>&1 | FileCheck %s +; RUN: opt < %s -msan-check-access-address=0 -msan-instrumentation-with-call-threshold=0 -msan-track-origins=1 -S -passes=msan 2>&1 | FileCheck -check-prefix=CHECK -check-prefix=CHECK-ORIGINS %s +; RUN: opt < %s -msan-check-access-address=0 -msan-instrumentation-with-call-threshold=0 -msan-track-origins=2 -S -passes=msan 2>&1 | FileCheck -check-prefix=CHECK -check-prefix=CHECK-ORIGINS %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" diff --git a/llvm/test/Instrumentation/MemorySanitizer/manual-shadow.ll b/llvm/test/Instrumentation/MemorySanitizer/manual-shadow.ll index 42c3656d6a7d674a2f915cbc38a40c257ed8cecf..4c7466c7b772bd6d997859a57182cdefbd33e30a 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/manual-shadow.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/manual-shadow.ll @@ -1,16 +1,10 @@ ; Test that the msan layout customization options work as expected ; -; RUN: opt < %s -msan-shadow-base 3735928559 -S -passes=msan 2>&1 | FileCheck \ -; RUN: --check-prefix=CHECK-BASE %s -; RUN: opt < %s -msan-shadow-base 3735928559 -msan-and-mask 4294901760 -S \ -; RUN: -passes=msan 2>&1 | FileCheck --check-prefix=CHECK-AND %s -; RUN: opt < %s -msan-shadow-base 3735928559 -msan-xor-mask 48879 -S \ -; RUN: -passes=msan 2>&1 | FileCheck --check-prefix=CHECK-XOR %s -; RUN: opt < %s -msan-shadow-base 3735928559 -msan-xor-mask 48879 \ -; RUN: -msan-and-mask 4294901760 -S -passes=msan 2>&1 | FileCheck \ -; RUN: --check-prefix=CHECK-XOR-AND %s -; RUN: opt < %s -msan-track-origins 1 -msan-origin-base 1777777 -S -passes=msan\ -; RUN: 2>&1 | FileCheck --check-prefix=CHECK-ORIGIN-BASE %s +; RUN: opt < %s -msan-shadow-base 3735928559 -S -passes=msan 2>&1 | FileCheck --check-prefix=CHECK-BASE %s +; RUN: opt < %s -msan-shadow-base 3735928559 -msan-and-mask 4294901760 -S -passes=msan 2>&1 | FileCheck --check-prefix=CHECK-AND %s +; RUN: opt < %s -msan-shadow-base 3735928559 -msan-xor-mask 48879 -S -passes=msan 2>&1 | FileCheck --check-prefix=CHECK-XOR %s +; RUN: opt < %s -msan-shadow-base 3735928559 -msan-xor-mask 48879 -msan-and-mask 4294901760 -S -passes=msan 2>&1 | FileCheck --check-prefix=CHECK-XOR-AND %s +; RUN: opt < %s -msan-track-origins 1 -msan-origin-base 1777777 -S -passes=msan 2>&1 | FileCheck --check-prefix=CHECK-ORIGIN-BASE %s target triple = "x86_64-unknown-linux-gnu" diff --git a/llvm/test/Instrumentation/MemorySanitizer/missing_origin.ll b/llvm/test/Instrumentation/MemorySanitizer/missing_origin.ll index 1c3c3ddd3fcf323434c1f96fef709d04361db547..94e0f746b09317301da268a0a79e93d54e48a28b 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/missing_origin.ll +++ 
b/llvm/test/Instrumentation/MemorySanitizer/missing_origin.ll @@ -1,5 +1,4 @@ -; RUN: opt < %s -msan-check-access-address=0 -msan-track-origins=1 -S \ -; RUN: -passes=msan 2>&1 | FileCheck %s +; RUN: opt < %s -msan-check-access-address=0 -msan-track-origins=1 -S -passes=msan 2>&1 | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" diff --git a/llvm/test/Instrumentation/MemorySanitizer/msan_asm_conservative.ll b/llvm/test/Instrumentation/MemorySanitizer/msan_asm_conservative.ll index 2a643e12a4877f2a9ceb3414ce38163714a3408f..1b9ffaf9e505a552f78ed1638b78d7ea70a4c251 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/msan_asm_conservative.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/msan_asm_conservative.ll @@ -1,14 +1,8 @@ ; Test for handling of asm constraints in MSan instrumentation. -; RUN: opt < %s -msan-check-access-address=0 -msan-handle-asm-conservative=0 -S -passes=msan 2>&1 | \ -; RUN: FileCheck %s -; RUN: opt < %s -msan-check-access-address=0 -S -passes=msan 2>&1 | \ -; RUN: FileCheck --check-prefixes=CHECK,USER-CONS %s -; RUN: opt < %s -msan-kernel=1 -msan-check-access-address=0 \ -; RUN: -msan-handle-asm-conservative=0 -S -passes=msan 2>&1 | FileCheck \ -; RUN: --check-prefixes=CHECK,KMSAN %s -; RUN: opt < %s -msan-kernel=1 -msan-check-access-address=0 \ -; RUN: -msan-handle-asm-conservative=1 -S -passes=msan 2>&1 | FileCheck \ -; RUN: --check-prefixes=CHECK,KMSAN,CHECK-CONS %s +; RUN: opt < %s -msan-check-access-address=0 -msan-handle-asm-conservative=0 -S -passes=msan 2>&1 | FileCheck %s +; RUN: opt < %s -msan-check-access-address=0 -S -passes=msan 2>&1 | FileCheck --check-prefixes=CHECK,USER-CONS %s +; RUN: opt < %s -msan-kernel=1 -msan-check-access-address=0 -msan-handle-asm-conservative=0 -S -passes=msan 2>&1 | FileCheck --check-prefixes=CHECK,KMSAN %s +; RUN: opt < %s -msan-kernel=1 -msan-check-access-address=0 -msan-handle-asm-conservative=1 -S -passes=msan 2>&1 | FileCheck --check-prefixes=CHECK,KMSAN,CHECK-CONS %s target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" diff --git a/llvm/test/Instrumentation/MemorySanitizer/msan_eager.ll b/llvm/test/Instrumentation/MemorySanitizer/msan_eager.ll index 30ab3280bdece962d8d47c41f6d7f3a7a96e0ee2..946c95b072ea9156cfc2160d64d594d575e3ed0a 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/msan_eager.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/msan_eager.ll @@ -1,8 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -msan-check-access-address=0 -msan-track-origins=1 -msan-eager-checks -S -passes='module(msan)' 2>&1 | \ -; RUN: FileCheck -allow-deprecated-dag-overlap --check-prefix=CHECK %s -; RUN: opt < %s -msan-check-access-address=0 -msan-track-origins=1 -S -passes='msan' 2>&1 | \ -; RUN: FileCheck -allow-deprecated-dag-overlap --check-prefix=CHECK %s +; RUN: opt < %s -msan-check-access-address=0 -msan-track-origins=1 -msan-eager-checks -S -passes='module(msan)' 2>&1 | FileCheck -allow-deprecated-dag-overlap --check-prefix=CHECK %s +; RUN: opt < %s -msan-check-access-address=0 -msan-track-origins=1 -S -passes='msan' 2>&1 | FileCheck -allow-deprecated-dag-overlap --check-prefix=CHECK %s target datalayout = 
"e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" diff --git a/llvm/test/Instrumentation/MemorySanitizer/msan_kernel_basic.ll b/llvm/test/Instrumentation/MemorySanitizer/msan_kernel_basic.ll index 309921bfc352b6836b3fbddcb984287f106527eb..4b7a910af08bf67304880da042f0b5c17d4aeee9 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/msan_kernel_basic.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/msan_kernel_basic.ll @@ -1,6 +1,5 @@ ; KMSAN instrumentation tests -; RUN: opt < %s -msan-kernel=1 -S -passes=msan 2>&1 | FileCheck %s \ -; RUN: -check-prefixes=CHECK +; RUN: opt < %s -msan-kernel=1 -S -passes=msan 2>&1 | FileCheck %s -check-prefixes=CHECK target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" diff --git a/llvm/test/Instrumentation/MemorySanitizer/msan_llvm_is_constant.ll b/llvm/test/Instrumentation/MemorySanitizer/msan_llvm_is_constant.ll index 7ca92a1cecb2f3a2a1d269c8d1d15d1abad49841..b81ae888efc6a2f1cf477088cc75e6e653bc73df 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/msan_llvm_is_constant.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/msan_llvm_is_constant.ll @@ -1,7 +1,6 @@ ; Make sure MSan doesn't insert shadow checks for @llvm.is.constant.* arguments. -; RUN: opt < %s -msan-kernel=1 -S -passes=msan 2>&1 | FileCheck \ -; RUN: -check-prefixes=CHECK %s +; RUN: opt < %s -msan-kernel=1 -S -passes=msan 2>&1 | FileCheck -check-prefixes=CHECK %s ; RUN: opt < %s -S -passes=msan 2>&1 | FileCheck -check-prefixes=CHECK %s target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" diff --git a/llvm/test/Instrumentation/MemorySanitizer/mul_by_constant.ll b/llvm/test/Instrumentation/MemorySanitizer/mul_by_constant.ll index 50a4a1b17df277220c69858bab0a7319d145da44..89522a603b238c19492505fc56157f440ba34e8d 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/mul_by_constant.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/mul_by_constant.ll @@ -1,5 +1,4 @@ -; RUN: opt < %s -msan-check-access-address=0 -S -passes=msan 2>&1 | FileCheck \ -; RUN: %s +; RUN: opt < %s -msan-check-access-address=0 -S -passes=msan 2>&1 | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" diff --git a/llvm/test/Instrumentation/MemorySanitizer/origin-alignment.ll b/llvm/test/Instrumentation/MemorySanitizer/origin-alignment.ll index 7a3ef669568242b8f09e37d316fd72d851c78b59..addfdf40f7e014b2fd6b6e3162a0aa6a6ceaa5fb 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/origin-alignment.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/origin-alignment.ll @@ -1,9 +1,5 @@ -; RUN: opt < %s -msan-check-access-address=0 -msan-track-origins=1 -S \ -; RUN: -passes=msan 2>&1 | FileCheck -check-prefix=CHECK \ -; RUN: -check-prefix=CHECK-ORIGINS1 %s -; RUN: opt < %s -msan-check-access-address=0 -msan-track-origins=2 -S \ -; RUN: -passes=msan 2>&1 | FileCheck -check-prefix=CHECK \ -; RUN: -check-prefix=CHECK-ORIGINS2 %s +; RUN: opt < %s -msan-check-access-address=0 -msan-track-origins=1 -S -passes=msan 2>&1 | FileCheck -check-prefix=CHECK -check-prefix=CHECK-ORIGINS1 %s +; RUN: opt < %s -msan-check-access-address=0 -msan-track-origins=2 -S 
-passes=msan 2>&1 | FileCheck -check-prefix=CHECK -check-prefix=CHECK-ORIGINS2 %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" diff --git a/llvm/test/Instrumentation/MemorySanitizer/origin-array.ll b/llvm/test/Instrumentation/MemorySanitizer/origin-array.ll index 228d686ad7f97560c7cd137b85b4f450963705fb..1f6d1bf0c20793769f5f39b388e3db1b7485ccee 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/origin-array.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/origin-array.ll @@ -1,5 +1,4 @@ -; RUN: opt < %s -msan-check-access-address=0 -msan-track-origins=2 -S \ -; RUN: -passes=msan 2>&1 | FileCheck %s +; RUN: opt < %s -msan-check-access-address=0 -msan-track-origins=2 -S -passes=msan 2>&1 | FileCheck %s target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" target triple = "aarch64-unknown-linux-gnu" diff --git a/llvm/test/Instrumentation/MemorySanitizer/reduce.ll b/llvm/test/Instrumentation/MemorySanitizer/reduce.ll index 0c688811611b8d575ab797cc35577148baed003c..2a79b5e72bca5ed10c1a0db458941e8bccc7fd94 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/reduce.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/reduce.ll @@ -1,6 +1,5 @@ -; RUN: opt < %s -msan-check-access-address=0 -msan-track-origins=1 -S -passes='module(msan)' 2>&1 | \ -; RUN: FileCheck -allow-deprecated-dag-overlap --check-prefix=CHECK %s +; RUN: opt < %s -msan-check-access-address=0 -msan-track-origins=1 -S -passes='module(msan)' 2>&1 | FileCheck -allow-deprecated-dag-overlap --check-prefix=CHECK %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" diff --git a/llvm/test/Instrumentation/MemorySanitizer/stable_set_alloca_origin.ll b/llvm/test/Instrumentation/MemorySanitizer/stable_set_alloca_origin.ll index 999085575f087804eee71b3bc2d0d3788021a883..bb6b9b62507d105152aea4748964fe58dc18ecdb 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/stable_set_alloca_origin.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/stable_set_alloca_origin.ll @@ -1,7 +1,5 @@ -; RUN: opt < %s -msan-check-access-address=0 -msan-track-origins=1 -S \ -; RUN: -passes=msan 2>&1 | FileCheck %s -; RUN: opt < %s -msan-check-access-address=0 -msan-track-origins=2 -S \ -; RUN: -passes=msan 2>&1 | FileCheck %s +; RUN: opt < %s -msan-check-access-address=0 -msan-track-origins=1 -S -passes=msan 2>&1 | FileCheck %s +; RUN: opt < %s -msan-check-access-address=0 -msan-track-origins=2 -S -passes=msan 2>&1 | FileCheck %s target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" diff --git a/llvm/test/Instrumentation/MemorySanitizer/store-long-origin.ll b/llvm/test/Instrumentation/MemorySanitizer/store-long-origin.ll index 43fec99d86d71abc4f0c6c229cd1759e120fc8bb..a91ac2654b41ff493acc821382654f9bd89136fb 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/store-long-origin.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/store-long-origin.ll @@ -1,5 +1,4 @@ -; RUN: opt < %s -msan-check-access-address=0 -msan-track-origins=1 -S \ -; RUN: -passes=msan 2>&1 | FileCheck %s +; RUN: opt < %s -msan-check-access-address=0 -msan-track-origins=1 -S -passes=msan 2>&1 | FileCheck %s target datalayout = 
"e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" diff --git a/llvm/test/Instrumentation/MemorySanitizer/vector_arith.ll b/llvm/test/Instrumentation/MemorySanitizer/vector_arith.ll index fe5cf9dcc65b20c5600f3aa649fbe158e1e4cdf6..c0d738145f28e01f43e97f26b57221ad1d538130 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/vector_arith.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/vector_arith.ll @@ -1,5 +1,4 @@ -; RUN: opt < %s -msan-check-access-address=0 -S -passes=msan 2>&1 | FileCheck \ -; RUN: %s +; RUN: opt < %s -msan-check-access-address=0 -S -passes=msan 2>&1 | FileCheck %s ; REQUIRES: x86-registered-target target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" diff --git a/llvm/test/Instrumentation/MemorySanitizer/vector_cmp.ll b/llvm/test/Instrumentation/MemorySanitizer/vector_cmp.ll index f4868294a3e7fdfd9a84eea6d1e3e4a83b2941a8..f4092c542d6b30b7d8162d5dea3c9bbd88884ae9 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/vector_cmp.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/vector_cmp.ll @@ -1,5 +1,4 @@ -; RUN: opt < %s -msan-check-access-address=0 -S -passes=msan 2>&1 | FileCheck \ -; RUN: %s +; RUN: opt < %s -msan-check-access-address=0 -S -passes=msan 2>&1 | FileCheck %s ; REQUIRES: x86-registered-target target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" diff --git a/llvm/test/Instrumentation/MemorySanitizer/vector_pack.ll b/llvm/test/Instrumentation/MemorySanitizer/vector_pack.ll index 13f7a1612de943310e90080a87e809e488352520..0f6f1fe4a7dcad298b558f8acb251a0361c802c6 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/vector_pack.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/vector_pack.ll @@ -1,5 +1,4 @@ -; RUN: opt < %s -msan-check-access-address=0 -S -passes=msan 2>&1 | FileCheck \ -; RUN: %s +; RUN: opt < %s -msan-check-access-address=0 -S -passes=msan 2>&1 | FileCheck %s ; REQUIRES: x86-registered-target target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" diff --git a/llvm/test/Instrumentation/MemorySanitizer/vector_shift.ll b/llvm/test/Instrumentation/MemorySanitizer/vector_shift.ll index 441dd8f64e2842e1d36109d35250752a6cc89831..461d6cb9217d8ff842adb076b13848d494ee27ea 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/vector_shift.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/vector_shift.ll @@ -1,5 +1,4 @@ -; RUN: opt < %s -msan-check-access-address=0 -S -passes=msan 2>&1 | FileCheck \ -; RUN: %s +; RUN: opt < %s -msan-check-access-address=0 -S -passes=msan 2>&1 | FileCheck %s ; REQUIRES: x86-registered-target ; Test instrumentation of vector shift instructions. 
diff --git a/llvm/test/Instrumentation/SanitizerCoverage/crit-edge-sancov.ll b/llvm/test/Instrumentation/SanitizerCoverage/crit-edge-sancov.ll index f42fa7139fd585b3d9134eb22d3e4caca9095be7..81848b69aa3bbf4ce34114f45a81b316131219ab 100644 --- a/llvm/test/Instrumentation/SanitizerCoverage/crit-edge-sancov.ll +++ b/llvm/test/Instrumentation/SanitizerCoverage/crit-edge-sancov.ll @@ -1,5 +1,4 @@ -; RUN: opt -passes='module(sancov-module)' -sanitizer-coverage-trace-pc \ -; RUN: -sanitizer-coverage-level=3 %s -S -o - | FileCheck %s +; RUN: opt -passes='module(sancov-module)' -sanitizer-coverage-trace-pc -sanitizer-coverage-level=3 %s -S -o - | FileCheck %s ; The edge between %entry and %for.inc.i is a critical edge. ; SanitizerCoveragePass must split this critical edge in order to track diff --git a/llvm/test/Instrumentation/SanitizerCoverage/stack-depth-variable-declared-by-user.ll b/llvm/test/Instrumentation/SanitizerCoverage/stack-depth-variable-declared-by-user.ll index 4d11e1cb41e6af7fcc9702e621b94c01901f1b63..73a1ef64aa95945c75a7e587b9774d85ddbb0d2f 100644 --- a/llvm/test/Instrumentation/SanitizerCoverage/stack-depth-variable-declared-by-user.ll +++ b/llvm/test/Instrumentation/SanitizerCoverage/stack-depth-variable-declared-by-user.ll @@ -1,7 +1,6 @@ ; Ensure that we terminate with a useful error message (instead of crash) if the ; user declares `__sancov_lowest_stack` with an unexpected type. -; RUN: not opt < %s -passes='module(sancov-module)' -sanitizer-coverage-level=1 \ -; RUN: -sanitizer-coverage-stack-depth -S 2>&1 | FileCheck %s +; RUN: not opt < %s -passes='module(sancov-module)' -sanitizer-coverage-level=1 -sanitizer-coverage-stack-depth -S 2>&1 | FileCheck %s target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" diff --git a/llvm/test/LTO/PowerPC/list-symbol.ll b/llvm/test/LTO/PowerPC/list-symbol.ll new file mode 100644 index 0000000000000000000000000000000000000000..75300b11f7f188e4d61f8846b29d997617f9fc64 --- /dev/null +++ b/llvm/test/LTO/PowerPC/list-symbol.ll @@ -0,0 +1,16 @@ +; RUN: llvm-as < %s > %t +; RUN: llvm-lto -list-symbols-only %t | FileCheck %s +; REQUIRES: default_triple + +; CHECK-DAG: v { data defined default } +; CHECK-DAG: va { data defined default alias } +; CHECK-DAG: f { function defined default } +; CHECK-DAG: fa { function defined default alias } + +@v = global i32 0 +@va = alias i32, ptr @v +@fa = alias void (ptr), ptr @f + +define void @f() { + ret void +} diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3.s index 67038f4c8eec09852441baec0d2d6a4df4c2068c..210d55898367d88074b07135c9f2ab260515bd64 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3.s @@ -244,49 +244,67 @@ v_add_lshl_u32 v5, src_scc, vcc_lo, -1 v_add_lshl_u32 v255, 0xaf123456, vcc_hi, null // GFX11: encoding: [0xff,0x00,0x47,0xd6,0xff,0xd6,0xf0,0x01,0x56,0x34,0x12,0xaf] -v_add_nc_i16 v5, v1, v2 -// GFX11: encoding: [0x05,0x00,0x0d,0xd7,0x01,0x05,0x02,0x00] +v_add_nc_i16 v5.l, v1.h, v2.l +// GFX11: encoding: [0x05,0x08,0x0d,0xd7,0x01,0x05,0x02,0x00] -v_add_nc_i16 v5, v255, v255 -// GFX11: encoding: [0x05,0x00,0x0d,0xd7,0xff,0xff,0x03,0x00] +v_add_nc_i16 v5.l, v255.l, v255.h +// GFX11: encoding: [0x05,0x10,0x0d,0xd7,0xff,0xff,0x03,0x00] -v_add_nc_i16 v5, s1, s2 +v_add_nc_i16 v5.l, s1, s2 // GFX11: encoding: [0x05,0x00,0x0d,0xd7,0x01,0x04,0x00,0x00] -v_add_nc_i16 v5, s105, s105 +v_add_nc_i16 v5.l, s105, s105 // GFX11: encoding: [0x05,0x00,0x0d,0xd7,0x69,0xd2,0x00,0x00] 
-v_add_nc_i16 v5, vcc_lo, ttmp15 +v_add_nc_i16 v5.l, vcc_lo, ttmp15 // GFX11: encoding: [0x05,0x00,0x0d,0xd7,0x6a,0xf6,0x00,0x00] -v_add_nc_i16 v5, vcc_hi, 0xfe0b +v_add_nc_i16 v5.l, vcc_hi, 0xfe0b // GFX11: encoding: [0x05,0x00,0x0d,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] -v_add_nc_i16 v5, ttmp15, src_scc +v_add_nc_i16 v5.l, ttmp15, src_scc // GFX11: encoding: [0x05,0x00,0x0d,0xd7,0x7b,0xfa,0x01,0x00] -v_add_nc_i16 v5, m0, 0.5 +v_add_nc_i16 v5.l, m0, 0.5 // GFX11: encoding: [0x05,0x00,0x0d,0xd7,0x7d,0xe0,0x01,0x00] -v_add_nc_i16 v5, exec_lo, -1 +v_add_nc_i16 v5.l, exec_lo, -1 // GFX11: encoding: [0x05,0x00,0x0d,0xd7,0x7e,0x82,0x01,0x00] -v_add_nc_i16 v5, exec_hi, null +v_add_nc_i16 v5.l, exec_hi, null // GFX11: encoding: [0x05,0x00,0x0d,0xd7,0x7f,0xf8,0x00,0x00] -v_add_nc_i16 v5, null, exec_lo op_sel:[1,1,1] +v_add_nc_i16 v5.l, null, exec_lo +// GFX11: encoding: [0x05,0x00,0x0d,0xd7,0x7c,0xfc,0x00,0x00] + +v_add_nc_i16 v5.l, -1, exec_hi +// GFX11: encoding: [0x05,0x00,0x0d,0xd7,0xc1,0xfe,0x00,0x00] + +v_add_nc_i16 v5.h, null, exec_lo op_sel:[1,1,1] // GFX11: encoding: [0x05,0x58,0x0d,0xd7,0x7c,0xfc,0x00,0x00] -v_add_nc_i16 v5, -1, exec_hi op_sel:[0,0,0] +v_add_nc_i16 v5.l, -1, exec_hi op_sel:[0,0,0] // GFX11: encoding: [0x05,0x00,0x0d,0xd7,0xc1,0xfe,0x00,0x00] -v_add_nc_i16 v5, 0.5, m0 op_sel:[1,0,0] +v_add_nc_i16 v5.l, 0.5, m0 op_sel:[1,0,0] // GFX11: encoding: [0x05,0x08,0x0d,0xd7,0xf0,0xfa,0x00,0x00] -v_add_nc_i16 v5, src_scc, vcc_lo op_sel:[0,1,0] +v_add_nc_i16 v5.l, src_scc, vcc_lo op_sel:[0,1,0] // GFX11: encoding: [0x05,0x10,0x0d,0xd7,0xfd,0xd4,0x00,0x00] -v_add_nc_i16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp +v_add_nc_i16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp +// GFX11: encoding: [0xff,0xc0,0x0d,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] + +v_add_nc_i16 v5.l, src_scc, vcc_lo +// GFX11: encoding: [0x05,0x00,0x0d,0xd7,0xfd,0xd4,0x00,0x00] + +v_add_nc_i16 v5.l, v1.h, v2.l +// GFX11: encoding: [0x05,0x08,0x0d,0xd7,0x01,0x05,0x02,0x00] + +v_add_nc_i16 v5.l, v255.l, v255.h +// GFX11: encoding: [0x05,0x10,0x0d,0xd7,0xff,0xff,0x03,0x00] + +v_add_nc_i16 v255.h, 0xfe0b, vcc_hi clamp // GFX11: encoding: [0xff,0xc0,0x0d,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] v_add_nc_i32 v5, v1, v2 @@ -334,49 +352,67 @@ v_add_nc_i32 v5, src_scc, vcc_lo v_add_nc_i32 v255, 0xaf123456, vcc_hi clamp // GFX11: encoding: [0xff,0x80,0x26,0xd7,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf] -v_add_nc_u16 v5, v1, v2 -// GFX11: encoding: [0x05,0x00,0x03,0xd7,0x01,0x05,0x02,0x00] +v_add_nc_u16 v5.l, v1.h, v2.l +// GFX11: encoding: [0x05,0x08,0x03,0xd7,0x01,0x05,0x02,0x00] -v_add_nc_u16 v5, v255, v255 -// GFX11: encoding: [0x05,0x00,0x03,0xd7,0xff,0xff,0x03,0x00] +v_add_nc_u16 v5.l, v255.l, v255.h +// GFX11: encoding: [0x05,0x10,0x03,0xd7,0xff,0xff,0x03,0x00] -v_add_nc_u16 v5, s1, s2 +v_add_nc_u16 v5.l, s1, s2 // GFX11: encoding: [0x05,0x00,0x03,0xd7,0x01,0x04,0x00,0x00] -v_add_nc_u16 v5, s105, s105 +v_add_nc_u16 v5.l, s105, s105 // GFX11: encoding: [0x05,0x00,0x03,0xd7,0x69,0xd2,0x00,0x00] -v_add_nc_u16 v5, vcc_lo, ttmp15 +v_add_nc_u16 v5.l, vcc_lo, ttmp15 // GFX11: encoding: [0x05,0x00,0x03,0xd7,0x6a,0xf6,0x00,0x00] -v_add_nc_u16 v5, vcc_hi, 0xfe0b +v_add_nc_u16 v5.l, vcc_hi, 0xfe0b // GFX11: encoding: [0x05,0x00,0x03,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] -v_add_nc_u16 v5, ttmp15, src_scc +v_add_nc_u16 v5.l, ttmp15, src_scc // GFX11: encoding: [0x05,0x00,0x03,0xd7,0x7b,0xfa,0x01,0x00] -v_add_nc_u16 v5, m0, 0.5 +v_add_nc_u16 v5.l, m0, 0.5 // GFX11: encoding: 
[0x05,0x00,0x03,0xd7,0x7d,0xe0,0x01,0x00] -v_add_nc_u16 v5, exec_lo, -1 +v_add_nc_u16 v5.l, exec_lo, -1 // GFX11: encoding: [0x05,0x00,0x03,0xd7,0x7e,0x82,0x01,0x00] -v_add_nc_u16 v5, exec_hi, null +v_add_nc_u16 v5.l, exec_hi, null // GFX11: encoding: [0x05,0x00,0x03,0xd7,0x7f,0xf8,0x00,0x00] -v_add_nc_u16 v5, null, exec_lo op_sel:[1,1,1] +v_add_nc_u16 v5.l, null, exec_lo +// GFX11: encoding: [0x05,0x00,0x03,0xd7,0x7c,0xfc,0x00,0x00] + +v_add_nc_u16 v5.l, -1, exec_hi +// GFX11: encoding: [0x05,0x00,0x03,0xd7,0xc1,0xfe,0x00,0x00] + +v_add_nc_u16 v5.h, null, exec_lo op_sel:[1,1,1] // GFX11: encoding: [0x05,0x58,0x03,0xd7,0x7c,0xfc,0x00,0x00] -v_add_nc_u16 v5, -1, exec_hi op_sel:[0,0,0] +v_add_nc_u16 v5.l, -1, exec_hi op_sel:[0,0,0] // GFX11: encoding: [0x05,0x00,0x03,0xd7,0xc1,0xfe,0x00,0x00] -v_add_nc_u16 v5, 0.5, m0 op_sel:[1,0,0] +v_add_nc_u16 v5.l, 0.5, m0 op_sel:[1,0,0] // GFX11: encoding: [0x05,0x08,0x03,0xd7,0xf0,0xfa,0x00,0x00] -v_add_nc_u16 v5, src_scc, vcc_lo op_sel:[0,1,0] +v_add_nc_u16 v5.l, src_scc, vcc_lo op_sel:[0,1,0] // GFX11: encoding: [0x05,0x10,0x03,0xd7,0xfd,0xd4,0x00,0x00] -v_add_nc_u16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp +v_add_nc_u16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp +// GFX11: encoding: [0xff,0xc0,0x03,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] + +v_add_nc_u16 v5.l, src_scc, vcc_lo +// GFX11: encoding: [0x05,0x00,0x03,0xd7,0xfd,0xd4,0x00,0x00] + +v_add_nc_u16 v5.l, v1.h, v2.l +// GFX11: encoding: [0x05,0x08,0x03,0xd7,0x01,0x05,0x02,0x00] + +v_add_nc_u16 v5.l, v255.l, v255.h +// GFX11: encoding: [0x05,0x10,0x03,0xd7,0xff,0xff,0x03,0x00] + +v_add_nc_u16 v255.h, 0xfe0b, vcc_hi clamp // GFX11: encoding: [0xff,0xc0,0x03,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] v_alignbit_b32 v5, v1, v2, s3 @@ -5801,49 +5837,67 @@ v_sub_co_u32 v5, ttmp[14:15], src_scc, vcc_lo v_sub_co_u32 v255, null, 0xaf123456, vcc_hi clamp // GFX11: encoding: [0xff,0xfc,0x01,0xd7,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf] -v_sub_nc_i16 v5, v1, v2 -// GFX11: encoding: [0x05,0x00,0x0e,0xd7,0x01,0x05,0x02,0x00] +v_sub_nc_i16 v5.l, v1.h, v2.l +// GFX11: encoding: [0x05,0x08,0x0e,0xd7,0x01,0x05,0x02,0x00] -v_sub_nc_i16 v5, v255, v255 -// GFX11: encoding: [0x05,0x00,0x0e,0xd7,0xff,0xff,0x03,0x00] +v_sub_nc_i16 v5.l, v255.l, v255.h +// GFX11: encoding: [0x05,0x10,0x0e,0xd7,0xff,0xff,0x03,0x00] -v_sub_nc_i16 v5, s1, s2 +v_sub_nc_i16 v5.l, s1, s2 // GFX11: encoding: [0x05,0x00,0x0e,0xd7,0x01,0x04,0x00,0x00] -v_sub_nc_i16 v5, s105, s105 +v_sub_nc_i16 v5.l, s105, s105 // GFX11: encoding: [0x05,0x00,0x0e,0xd7,0x69,0xd2,0x00,0x00] -v_sub_nc_i16 v5, vcc_lo, ttmp15 +v_sub_nc_i16 v5.l, vcc_lo, ttmp15 // GFX11: encoding: [0x05,0x00,0x0e,0xd7,0x6a,0xf6,0x00,0x00] -v_sub_nc_i16 v5, vcc_hi, 0xfe0b +v_sub_nc_i16 v5.l, vcc_hi, 0xfe0b // GFX11: encoding: [0x05,0x00,0x0e,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] -v_sub_nc_i16 v5, ttmp15, src_scc +v_sub_nc_i16 v5.l, ttmp15, src_scc // GFX11: encoding: [0x05,0x00,0x0e,0xd7,0x7b,0xfa,0x01,0x00] -v_sub_nc_i16 v5, m0, 0.5 +v_sub_nc_i16 v5.l, m0, 0.5 // GFX11: encoding: [0x05,0x00,0x0e,0xd7,0x7d,0xe0,0x01,0x00] -v_sub_nc_i16 v5, exec_lo, -1 +v_sub_nc_i16 v5.l, exec_lo, -1 // GFX11: encoding: [0x05,0x00,0x0e,0xd7,0x7e,0x82,0x01,0x00] -v_sub_nc_i16 v5, exec_hi, null +v_sub_nc_i16 v5.l, exec_hi, null // GFX11: encoding: [0x05,0x00,0x0e,0xd7,0x7f,0xf8,0x00,0x00] -v_sub_nc_i16 v5, null, exec_lo op_sel:[1,1,1] +v_sub_nc_i16 v5.l, null, exec_lo +// GFX11: encoding: [0x05,0x00,0x0e,0xd7,0x7c,0xfc,0x00,0x00] + +v_sub_nc_i16 v5.l, -1, exec_hi +// GFX11: encoding: 
[0x05,0x00,0x0e,0xd7,0xc1,0xfe,0x00,0x00] + +v_sub_nc_i16 v5.h, null, exec_lo op_sel:[1,1,1] // GFX11: encoding: [0x05,0x58,0x0e,0xd7,0x7c,0xfc,0x00,0x00] -v_sub_nc_i16 v5, -1, exec_hi op_sel:[0,0,0] +v_sub_nc_i16 v5.l, -1, exec_hi op_sel:[0,0,0] // GFX11: encoding: [0x05,0x00,0x0e,0xd7,0xc1,0xfe,0x00,0x00] -v_sub_nc_i16 v5, 0.5, m0 op_sel:[1,0,0] +v_sub_nc_i16 v5.l, 0.5, m0 op_sel:[1,0,0] // GFX11: encoding: [0x05,0x08,0x0e,0xd7,0xf0,0xfa,0x00,0x00] -v_sub_nc_i16 v5, src_scc, vcc_lo op_sel:[0,1,0] +v_sub_nc_i16 v5.l, src_scc, vcc_lo op_sel:[0,1,0] // GFX11: encoding: [0x05,0x10,0x0e,0xd7,0xfd,0xd4,0x00,0x00] -v_sub_nc_i16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp +v_sub_nc_i16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp +// GFX11: encoding: [0xff,0xc0,0x0e,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] + +v_sub_nc_i16 v5.l, src_scc, vcc_lo +// GFX11: encoding: [0x05,0x00,0x0e,0xd7,0xfd,0xd4,0x00,0x00] + +v_sub_nc_i16 v5.l, v1.h, v2.l +// GFX11: encoding: [0x05,0x08,0x0e,0xd7,0x01,0x05,0x02,0x00] + +v_sub_nc_i16 v5.l, v255.l, v255.h +// GFX11: encoding: [0x05,0x10,0x0e,0xd7,0xff,0xff,0x03,0x00] + +v_sub_nc_i16 v255.h, 0xfe0b, vcc_hi clamp // GFX11: encoding: [0xff,0xc0,0x0e,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] v_sub_nc_i32 v5, v1, v2 @@ -5891,49 +5945,67 @@ v_sub_nc_i32 v5, src_scc, vcc_lo v_sub_nc_i32 v255, 0xaf123456, vcc_hi clamp // GFX11: encoding: [0xff,0x80,0x25,0xd7,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf] -v_sub_nc_u16 v5, v1, v2 -// GFX11: encoding: [0x05,0x00,0x04,0xd7,0x01,0x05,0x02,0x00] +v_sub_nc_u16 v5.l, v1.h, v2.l +// GFX11: encoding: [0x05,0x08,0x04,0xd7,0x01,0x05,0x02,0x00] -v_sub_nc_u16 v5, v255, v255 -// GFX11: encoding: [0x05,0x00,0x04,0xd7,0xff,0xff,0x03,0x00] +v_sub_nc_u16 v5.l, v255.l, v255.h +// GFX11: encoding: [0x05,0x10,0x04,0xd7,0xff,0xff,0x03,0x00] -v_sub_nc_u16 v5, s1, s2 +v_sub_nc_u16 v5.l, s1, s2 // GFX11: encoding: [0x05,0x00,0x04,0xd7,0x01,0x04,0x00,0x00] -v_sub_nc_u16 v5, s105, s105 +v_sub_nc_u16 v5.l, s105, s105 // GFX11: encoding: [0x05,0x00,0x04,0xd7,0x69,0xd2,0x00,0x00] -v_sub_nc_u16 v5, vcc_lo, ttmp15 +v_sub_nc_u16 v5.l, vcc_lo, ttmp15 // GFX11: encoding: [0x05,0x00,0x04,0xd7,0x6a,0xf6,0x00,0x00] -v_sub_nc_u16 v5, vcc_hi, 0xfe0b +v_sub_nc_u16 v5.l, vcc_hi, 0xfe0b // GFX11: encoding: [0x05,0x00,0x04,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] -v_sub_nc_u16 v5, ttmp15, src_scc +v_sub_nc_u16 v5.l, ttmp15, src_scc // GFX11: encoding: [0x05,0x00,0x04,0xd7,0x7b,0xfa,0x01,0x00] -v_sub_nc_u16 v5, m0, 0.5 +v_sub_nc_u16 v5.l, m0, 0.5 // GFX11: encoding: [0x05,0x00,0x04,0xd7,0x7d,0xe0,0x01,0x00] -v_sub_nc_u16 v5, exec_lo, -1 +v_sub_nc_u16 v5.l, exec_lo, -1 // GFX11: encoding: [0x05,0x00,0x04,0xd7,0x7e,0x82,0x01,0x00] -v_sub_nc_u16 v5, exec_hi, null +v_sub_nc_u16 v5.l, exec_hi, null // GFX11: encoding: [0x05,0x00,0x04,0xd7,0x7f,0xf8,0x00,0x00] -v_sub_nc_u16 v5, null, exec_lo op_sel:[1,1,1] +v_sub_nc_u16 v5.l, null, exec_lo +// GFX11: encoding: [0x05,0x00,0x04,0xd7,0x7c,0xfc,0x00,0x00] + +v_sub_nc_u16 v5.l, -1, exec_hi +// GFX11: encoding: [0x05,0x00,0x04,0xd7,0xc1,0xfe,0x00,0x00] + +v_sub_nc_u16 v5.h, null, exec_lo op_sel:[1,1,1] // GFX11: encoding: [0x05,0x58,0x04,0xd7,0x7c,0xfc,0x00,0x00] -v_sub_nc_u16 v5, -1, exec_hi op_sel:[0,0,0] +v_sub_nc_u16 v5.l, -1, exec_hi op_sel:[0,0,0] // GFX11: encoding: [0x05,0x00,0x04,0xd7,0xc1,0xfe,0x00,0x00] -v_sub_nc_u16 v5, 0.5, m0 op_sel:[1,0,0] +v_sub_nc_u16 v5.l, 0.5, m0 op_sel:[1,0,0] // GFX11: encoding: [0x05,0x08,0x04,0xd7,0xf0,0xfa,0x00,0x00] -v_sub_nc_u16 v5, src_scc, vcc_lo op_sel:[0,1,0] +v_sub_nc_u16 
v5.l, src_scc, vcc_lo op_sel:[0,1,0] // GFX11: encoding: [0x05,0x10,0x04,0xd7,0xfd,0xd4,0x00,0x00] -v_sub_nc_u16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp +v_sub_nc_u16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp +// GFX11: encoding: [0xff,0xc0,0x04,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] + +v_sub_nc_u16 v5.l, src_scc, vcc_lo +// GFX11: encoding: [0x05,0x00,0x04,0xd7,0xfd,0xd4,0x00,0x00] + +v_sub_nc_u16 v5.l, v1.h, v2.l +// GFX11: encoding: [0x05,0x08,0x04,0xd7,0x01,0x05,0x02,0x00] + +v_sub_nc_u16 v5.l, v255.l, v255.h +// GFX11: encoding: [0x05,0x10,0x04,0xd7,0xff,0xff,0x03,0x00] + +v_sub_nc_u16 v255.h, 0xfe0b, vcc_hi clamp // GFX11: encoding: [0xff,0xc0,0x04,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] v_subrev_co_u32 v5, s6, v1, v2 diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16.s index 3c693c556194e1528ee7842699e709786c85298a..c82b61e21edf6494842f0e745a75d0a8ac4e9316 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16.s @@ -194,47 +194,47 @@ v_add_lshl_u32_e64_dpp v5, v1, v2, 0.5 row_xmask:0 row_mask:0x1 bank_mask:0x3 bo v_add_lshl_u32_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX11: [0xff,0x00,0x47,0xd6,0xfa,0xfe,0xf7,0x03,0xff,0x6f,0x05,0x30] -v_add_nc_i16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] +v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l quad_perm:[3,2,1,0] // GFX11: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] -v_add_nc_i16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] +v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l quad_perm:[0,1,2,3] // GFX11: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -v_add_nc_i16_e64_dpp v5, v1, v2 row_mirror +v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_mirror // GFX11: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -v_add_nc_i16_e64_dpp v5, v1, v2 row_half_mirror +v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_half_mirror // GFX11: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -v_add_nc_i16_e64_dpp v5, v1, v2 row_shl:1 +v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shl:1 // GFX11: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -v_add_nc_i16_e64_dpp v5, v1, v2 row_shl:15 +v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shl:15 // GFX11: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -v_add_nc_i16_e64_dpp v5, v1, v2 row_shr:1 +v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shr:1 // GFX11: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -v_add_nc_i16_e64_dpp v5, v1, v2 row_shr:15 +v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shr:15 // GFX11: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -v_add_nc_i16_e64_dpp v5, v1, v2 row_ror:1 +v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_ror:1 // GFX11: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -v_add_nc_i16_e64_dpp v5, v1, v2 row_ror:15 +v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_ror:15 // GFX11: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -v_add_nc_i16_e64_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf -// GFX11: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +v_add_nc_i16_e64_dpp v5.h, v1.h, v2.h row_share:0 row_mask:0xf bank_mask:0xf +// GFX11: [0x05,0x58,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -v_add_nc_i16_e64_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX11: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +v_add_nc_i16_e64_dpp v5.l, v1.h, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 +// 
GFX11: [0x05,0x08,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] -v_add_nc_i16_e64_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX11: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] +v_add_nc_i16_e64_dpp v5.l, v1.l, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: [0x05,0x10,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] -v_add_nc_i16_e64_dpp v255, v255, v255 clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX11: [0xff,0x80,0x0d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] +v_add_nc_i16_e64_dpp v255.h, v255.l, v255.l clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: [0xff,0xc0,0x0d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] v_add_nc_i32_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] // GFX11: [0x05,0x00,0x26,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] @@ -278,47 +278,47 @@ v_add_nc_i32_e64_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctr v_add_nc_i32_e64_dpp v255, v255, v255 clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX11: [0xff,0x80,0x26,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] -v_add_nc_u16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] +v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l quad_perm:[3,2,1,0] // GFX11: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] -v_add_nc_u16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] +v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l quad_perm:[0,1,2,3] // GFX11: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -v_add_nc_u16_e64_dpp v5, v1, v2 row_mirror +v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_mirror // GFX11: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -v_add_nc_u16_e64_dpp v5, v1, v2 row_half_mirror +v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_half_mirror // GFX11: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -v_add_nc_u16_e64_dpp v5, v1, v2 row_shl:1 +v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shl:1 // GFX11: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -v_add_nc_u16_e64_dpp v5, v1, v2 row_shl:15 +v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shl:15 // GFX11: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -v_add_nc_u16_e64_dpp v5, v1, v2 row_shr:1 +v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shr:1 // GFX11: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -v_add_nc_u16_e64_dpp v5, v1, v2 row_shr:15 +v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shr:15 // GFX11: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -v_add_nc_u16_e64_dpp v5, v1, v2 row_ror:1 +v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_ror:1 // GFX11: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -v_add_nc_u16_e64_dpp v5, v1, v2 row_ror:15 +v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_ror:15 // GFX11: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -v_add_nc_u16_e64_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf -// GFX11: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +v_add_nc_u16_e64_dpp v5.h, v1.h, v2.h row_share:0 row_mask:0xf bank_mask:0xf +// GFX11: [0x05,0x58,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -v_add_nc_u16_e64_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX11: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +v_add_nc_u16_e64_dpp v5.l, v1.h, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: [0x05,0x08,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] -v_add_nc_u16_e64_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX11: 
[0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] +v_add_nc_u16_e64_dpp v5.l, v1.l, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: [0x05,0x10,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] -v_add_nc_u16_e64_dpp v255, v255, v255 clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX11: [0xff,0x80,0x03,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] +v_add_nc_u16_e64_dpp v255.h, v255.l, v255.l clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: [0xff,0xc0,0x03,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] v_alignbit_b32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] // GFX11: [0x05,0x00,0x16,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] @@ -4116,47 +4116,47 @@ v_sub_co_u32_e64_dpp v5, ttmp[14:15], v1, v2 row_xmask:0 row_mask:0x1 bank_mask: v_sub_co_u32_e64_dpp v255, null, v255, v255 clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX11: [0xff,0xfc,0x01,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] -v_sub_nc_i16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] +v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l quad_perm:[3,2,1,0] // GFX11: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] -v_sub_nc_i16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] +v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l quad_perm:[0,1,2,3] // GFX11: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -v_sub_nc_i16_e64_dpp v5, v1, v2 row_mirror +v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_mirror // GFX11: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -v_sub_nc_i16_e64_dpp v5, v1, v2 row_half_mirror +v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_half_mirror // GFX11: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -v_sub_nc_i16_e64_dpp v5, v1, v2 row_shl:1 +v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shl:1 // GFX11: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -v_sub_nc_i16_e64_dpp v5, v1, v2 row_shl:15 +v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shl:15 // GFX11: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -v_sub_nc_i16_e64_dpp v5, v1, v2 row_shr:1 +v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shr:1 // GFX11: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -v_sub_nc_i16_e64_dpp v5, v1, v2 row_shr:15 +v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shr:15 // GFX11: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -v_sub_nc_i16_e64_dpp v5, v1, v2 row_ror:1 +v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_ror:1 // GFX11: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -v_sub_nc_i16_e64_dpp v5, v1, v2 row_ror:15 +v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_ror:15 // GFX11: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -v_sub_nc_i16_e64_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf -// GFX11: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +v_sub_nc_i16_e64_dpp v5.h, v1.h, v2.h row_share:0 row_mask:0xf bank_mask:0xf +// GFX11: [0x05,0x58,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -v_sub_nc_i16_e64_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX11: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +v_sub_nc_i16_e64_dpp v5.l, v1.h, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: [0x05,0x08,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] -v_sub_nc_i16_e64_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX11: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] +v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 
fi:0 +// GFX11: [0x05,0x10,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] -v_sub_nc_i16_e64_dpp v255, v255, v255 clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX11: [0xff,0x80,0x0e,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] +v_sub_nc_i16_e64_dpp v255.h, v255.l, v255.l clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: [0xff,0xc0,0x0e,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] v_sub_nc_i32_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] // GFX11: [0x05,0x00,0x25,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] @@ -4200,47 +4200,47 @@ v_sub_nc_i32_e64_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctr v_sub_nc_i32_e64_dpp v255, v255, v255 clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX11: [0xff,0x80,0x25,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] -v_sub_nc_u16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] +v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l quad_perm:[3,2,1,0] // GFX11: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] -v_sub_nc_u16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] +v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l quad_perm:[0,1,2,3] // GFX11: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -v_sub_nc_u16_e64_dpp v5, v1, v2 row_mirror +v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_mirror // GFX11: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -v_sub_nc_u16_e64_dpp v5, v1, v2 row_half_mirror +v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_half_mirror // GFX11: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -v_sub_nc_u16_e64_dpp v5, v1, v2 row_shl:1 +v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shl:1 // GFX11: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -v_sub_nc_u16_e64_dpp v5, v1, v2 row_shl:15 +v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shl:15 // GFX11: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -v_sub_nc_u16_e64_dpp v5, v1, v2 row_shr:1 +v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shr:1 // GFX11: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -v_sub_nc_u16_e64_dpp v5, v1, v2 row_shr:15 +v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shr:15 // GFX11: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -v_sub_nc_u16_e64_dpp v5, v1, v2 row_ror:1 +v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_ror:1 // GFX11: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -v_sub_nc_u16_e64_dpp v5, v1, v2 row_ror:15 +v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_ror:15 // GFX11: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -v_sub_nc_u16_e64_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf -// GFX11: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +v_sub_nc_u16_e64_dpp v5.h, v1.h, v2.h row_share:0 row_mask:0xf bank_mask:0xf +// GFX11: [0x05,0x58,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -v_sub_nc_u16_e64_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX11: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +v_sub_nc_u16_e64_dpp v5.l, v1.h, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: [0x05,0x08,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] -v_sub_nc_u16_e64_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX11: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] +v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: [0x05,0x10,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] -v_sub_nc_u16_e64_dpp v255, v255, v255 clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 
bound_ctrl:0 fi:1 -// GFX11: [0xff,0x80,0x04,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] +v_sub_nc_u16_e64_dpp v255.h, v255.l, v255.l clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: [0xff,0xc0,0x04,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] v_subrev_co_u32_e64_dpp v5, s6, v1, v2 quad_perm:[3,2,1,0] // W32: [0x05,0x06,0x02,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] @@ -4475,30 +4475,6 @@ v_xor_b16_e64_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 v_xor_b16_e64_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX11: [0xff,0x00,0x64,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] -v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf -// GFX11: [0x05,0x58,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] - -v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX11: [0x05,0x08,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] - -v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 -// GFX11: [0x05,0x10,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] - -v_add_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 -// GFX11: [0xff,0xc0,0x0d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] - -v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf -// GFX11: [0x05,0x58,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] - -v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX11: [0x05,0x08,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] - -v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 -// GFX11: [0x05,0x10,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] - -v_add_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 -// GFX11: [0xff,0xc0,0x03,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] - v_cvt_pk_norm_i16_f16_e64_dpp v5, -v1, |v2| op_sel:[1,0,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 // GFX11: [0x05,0x0a,0x12,0xd7,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] @@ -4724,30 +4700,6 @@ v_pack_b32_f16_e64_dpp v5, -v1, |v2| op_sel:[1,0,0] row_xmask:0 row_mask:0x1 ban v_pack_b32_f16_e64_dpp v255, -|v255|, -|v255| op_sel:[0,1,0] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 // GFX11: [0xff,0x13,0x11,0xd7,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] -v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf -// GFX11: [0x05,0x58,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] - -v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX11: [0x05,0x08,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] - -v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 -// GFX11: [0x05,0x10,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] - -v_sub_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 -// GFX11: [0xff,0xc0,0x0e,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] - -v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf -// GFX11: [0x05,0x58,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] - -v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX11: [0x05,0x08,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] - -v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] 
row_xmask:0 row_mask:0x1 bank_mask:0x3 -// GFX11: [0x05,0x10,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] - -v_sub_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 -// GFX11: [0xff,0xc0,0x04,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] - v_dot2_f16_f16_e64_dpp v0, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 fi:1 // GFX11: encoding: [0x00,0x00,0x66,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x04,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8.s index 79709278bc0c7b272c5472ba91e0a38166925414..73369685a0e6dcabbb75107bb018635139ca7818 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8.s @@ -114,14 +114,23 @@ v_add_lshl_u32_e64_dpp v5, v1, v2, 0.5 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_add_lshl_u32_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX11: [0xff,0x00,0x47,0xd6,0xe9,0xfe,0xf7,0x03,0xff,0x00,0x00,0x00] -v_add_nc_i16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] // GFX11: [0x05,0x00,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -v_add_nc_i16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX11: [0x05,0x00,0x0d,0xd7,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: [0x05,0x00,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_add_nc_i16_e64_dpp v5.h, v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: [0x05,0x58,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_add_nc_i16_e64_dpp v5.l, v1.h, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: [0x05,0x08,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_add_nc_i16_e64_dpp v5.l, v1.l, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: [0x05,0x10,0x0d,0xd7,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -v_add_nc_i16_e64_dpp v255, v255, v255 clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX11: [0xff,0x80,0x0d,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +v_add_nc_i16_e64_dpp v255.h, v255.l, v255.l clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: [0xff,0xc0,0x0d,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] v_add_nc_i32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX11: [0x05,0x00,0x26,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] @@ -132,14 +141,23 @@ v_add_nc_i32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_add_nc_i32_e64_dpp v255, v255, v255 clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX11: [0xff,0x80,0x26,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] -v_add_nc_u16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] // GFX11: [0x05,0x00,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -v_add_nc_u16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX11: [0x05,0x00,0x03,0xd7,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: [0x05,0x00,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_add_nc_u16_e64_dpp v5.h, v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: [0x05,0x58,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_add_nc_u16_e64_dpp v5.l, v1.h, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: [0x05,0x08,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_add_nc_u16_e64_dpp v5.l, v1.l, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: [0x05,0x10,0x03,0xd7,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -v_add_nc_u16_e64_dpp v255, v255, v255 clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX11: [0xff,0x80,0x03,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 
+v_add_nc_u16_e64_dpp v255.h, v255.l, v255.l clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: [0xff,0xc0,0x03,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] v_alignbit_b32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] // GFX11: [0x05,0x00,0x16,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] @@ -2601,14 +2619,23 @@ v_sub_co_u32_e64_dpp v5, ttmp[14:15], v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_sub_co_u32_e64_dpp v255, null, v255, v255 clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX11: [0xff,0xfc,0x01,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] -v_sub_nc_i16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] // GFX11: [0x05,0x00,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -v_sub_nc_i16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX11: [0x05,0x00,0x0e,0xd7,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: [0x05,0x00,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_sub_nc_i16_e64_dpp v5.h, v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: [0x05,0x58,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_sub_nc_i16_e64_dpp v5.l, v1.h, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: [0x05,0x08,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: [0x05,0x10,0x0e,0xd7,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -v_sub_nc_i16_e64_dpp v255, v255, v255 clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX11: [0xff,0x80,0x0e,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +v_sub_nc_i16_e64_dpp v255.h, v255.l, v255.l clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: [0xff,0xc0,0x0e,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] v_sub_nc_i32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX11: [0x05,0x00,0x25,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] @@ -2619,14 +2646,23 @@ v_sub_nc_i32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_sub_nc_i32_e64_dpp v255, v255, v255 clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX11: [0xff,0x80,0x25,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] -v_sub_nc_u16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] // GFX11: [0x05,0x00,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -v_sub_nc_u16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX11: [0x05,0x00,0x04,0xd7,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: [0x05,0x00,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_sub_nc_u16_e64_dpp v5.h, v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: [0x05,0x58,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_sub_nc_u16_e64_dpp v5.l, v1.h, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: [0x05,0x08,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: [0x05,0x10,0x04,0xd7,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -v_sub_nc_u16_e64_dpp v255, v255, v255 clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX11: [0xff,0x80,0x04,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +v_sub_nc_u16_e64_dpp v255.h, v255.l, v255.l clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: [0xff,0xc0,0x04,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] v_subrev_co_u32_e64_dpp v5, s6, v1, v2 dpp8:[7,6,5,4,3,2,1,0] // W32: [0x05,0x06,0x02,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] @@ -2748,30 +2784,6 @@ v_xor_b16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_xor_b16_e64_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX11: 
[0xff,0x00,0x64,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] -v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] -// GFX11: [0x05,0x58,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] - -v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] -// GFX11: [0x05,0x08,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] - -v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] -// GFX11: [0x05,0x10,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] - -v_add_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 -// GFX11: [0xff,0xc0,0x0d,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] - -v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] -// GFX11: [0x05,0x58,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] - -v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] -// GFX11: [0x05,0x08,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] - -v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] -// GFX11: [0x05,0x10,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] - -v_add_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 -// GFX11: [0xff,0xc0,0x03,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] - v_cvt_pk_norm_i16_f16_e64_dpp v5, -v1, |v2| op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] // GFX11: [0x05,0x0a,0x12,0xd7,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] @@ -2997,30 +3009,6 @@ v_pack_b32_f16_e64_dpp v5, -v1, |v2| op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] v_pack_b32_f16_e64_dpp v255, -|v255|, -|v255| op_sel:[0,1,0] dpp8:[0,0,0,0,0,0,0,0] fi:1 // GFX11: [0xff,0x13,0x11,0xd7,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] -v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] -// GFX11: [0x05,0x58,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] - -v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] -// GFX11: [0x05,0x08,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] - -v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] -// GFX11: [0x05,0x10,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] - -v_sub_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 -// GFX11: [0xff,0xc0,0x0e,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] - -v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] -// GFX11: [0x05,0x58,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] - -v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] -// GFX11: [0x05,0x08,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] - -v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] -// GFX11: [0x05,0x10,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] - -v_sub_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 -// GFX11: [0xff,0xc0,0x04,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] - v_dot2_f16_f16_e64_dpp v0, v1, v2, v3 dpp8:[0,1,2,3,4,4,4,4] // GFX11: encoding: [0x00,0x00,0x66,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x88,0x46,0x92] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3.s index f28933ec3a8945ff1b534fa209adc7c7a9c118f0..1ae1eaf1ceeadd04937088b03cca009796d737ec 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3.s @@ -208,49 +208,58 @@ v_add_lshl_u32 v5, src_scc, vcc_lo, -1 v_add_lshl_u32 v255, 0xaf123456, vcc_hi, null // GFX12: encoding: [0xff,0x00,0x47,0xd6,0xff,0xd6,0xf0,0x01,0x56,0x34,0x12,0xaf] -v_add_nc_i16 v5, v1, v2 +v_add_nc_i16 v5.l, v1.l, v2.l // GFX12: 
encoding: [0x05,0x00,0x0d,0xd7,0x01,0x05,0x02,0x00] -v_add_nc_i16 v5, v255, v255 +v_add_nc_i16 v5.l, v1.h, v2.l +// GFX12: encoding: [0x05,0x08,0x0d,0xd7,0x01,0x05,0x02,0x00] + +v_add_nc_i16 v5.l, v255.l, v255.l // GFX12: encoding: [0x05,0x00,0x0d,0xd7,0xff,0xff,0x03,0x00] -v_add_nc_i16 v5, s1, s2 +v_add_nc_i16 v5.l, v255.l, v255.h +// GFX12: encoding: [0x05,0x10,0x0d,0xd7,0xff,0xff,0x03,0x00] + +v_add_nc_i16 v5.l, s1, s2 // GFX12: encoding: [0x05,0x00,0x0d,0xd7,0x01,0x04,0x00,0x00] -v_add_nc_i16 v5, s105, s105 +v_add_nc_i16 v5.l, s105, s105 // GFX12: encoding: [0x05,0x00,0x0d,0xd7,0x69,0xd2,0x00,0x00] -v_add_nc_i16 v5, vcc_lo, ttmp15 +v_add_nc_i16 v5.l, vcc_lo, ttmp15 // GFX12: encoding: [0x05,0x00,0x0d,0xd7,0x6a,0xf6,0x00,0x00] -v_add_nc_i16 v5, vcc_hi, 0xfe0b +v_add_nc_i16 v5.l, vcc_hi, 0xfe0b // GFX12: encoding: [0x05,0x00,0x0d,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] -v_add_nc_i16 v5, ttmp15, src_scc +v_add_nc_i16 v5.l, ttmp15, src_scc // GFX12: encoding: [0x05,0x00,0x0d,0xd7,0x7b,0xfa,0x01,0x00] -v_add_nc_i16 v5, m0, 0.5 +v_add_nc_i16 v5.l, m0, 0.5 // GFX12: encoding: [0x05,0x00,0x0d,0xd7,0x7d,0xe0,0x01,0x00] -v_add_nc_i16 v5, exec_lo, -1 +v_add_nc_i16 v5.l, exec_lo, -1 // GFX12: encoding: [0x05,0x00,0x0d,0xd7,0x7e,0x82,0x01,0x00] -v_add_nc_i16 v5, exec_hi, null +v_add_nc_i16 v5.l, exec_hi, null // GFX12: encoding: [0x05,0x00,0x0d,0xd7,0x7f,0xf8,0x00,0x00] -v_add_nc_i16 v5, null, exec_lo op_sel:[1,1,1] +v_add_nc_i16 v5.h, null, exec_lo op_sel:[1,1,1] // GFX12: encoding: [0x05,0x58,0x0d,0xd7,0x7c,0xfc,0x00,0x00] -v_add_nc_i16 v5, -1, exec_hi op_sel:[0,0,0] +v_add_nc_i16 v5.l, -1, exec_hi op_sel:[0,0,0] // GFX12: encoding: [0x05,0x00,0x0d,0xd7,0xc1,0xfe,0x00,0x00] -v_add_nc_i16 v5, 0.5, m0 op_sel:[1,0,0] +v_add_nc_i16 v5.l, 0.5, m0 op_sel:[1,0,0] // GFX12: encoding: [0x05,0x08,0x0d,0xd7,0xf0,0xfa,0x00,0x00] -v_add_nc_i16 v5, src_scc, vcc_lo op_sel:[0,1,0] +v_add_nc_i16 v5.l, src_scc, vcc_lo op_sel:[0,1,0] // GFX12: encoding: [0x05,0x10,0x0d,0xd7,0xfd,0xd4,0x00,0x00] -v_add_nc_i16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp +v_add_nc_i16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp +// GFX12: encoding: [0xff,0xc0,0x0d,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] + +v_add_nc_i16 v255.h, 0xfe0b, vcc_hi clamp // GFX12: encoding: [0xff,0xc0,0x0d,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] v_add_nc_i32 v5, v1, v2 @@ -298,49 +307,58 @@ v_add_nc_i32 v5, src_scc, vcc_lo v_add_nc_i32 v255, 0xaf123456, vcc_hi clamp // GFX12: encoding: [0xff,0x80,0x26,0xd7,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf] -v_add_nc_u16 v5, v1, v2 +v_add_nc_u16 v5.l, v1.l, v2.l // GFX12: encoding: [0x05,0x00,0x03,0xd7,0x01,0x05,0x02,0x00] -v_add_nc_u16 v5, v255, v255 +v_add_nc_u16 v5.l, v1.h, v2.l +// GFX12: encoding: [0x05,0x08,0x03,0xd7,0x01,0x05,0x02,0x00] + +v_add_nc_u16 v5.l, v255.l, v255.l // GFX12: encoding: [0x05,0x00,0x03,0xd7,0xff,0xff,0x03,0x00] -v_add_nc_u16 v5, s1, s2 +v_add_nc_u16 v5.l, v255.l, v255.h +// GFX12: encoding: [0x05,0x10,0x03,0xd7,0xff,0xff,0x03,0x00] + +v_add_nc_u16 v5.l, s1, s2 // GFX12: encoding: [0x05,0x00,0x03,0xd7,0x01,0x04,0x00,0x00] -v_add_nc_u16 v5, s105, s105 +v_add_nc_u16 v5.l, s105, s105 // GFX12: encoding: [0x05,0x00,0x03,0xd7,0x69,0xd2,0x00,0x00] -v_add_nc_u16 v5, vcc_lo, ttmp15 +v_add_nc_u16 v5.l, vcc_lo, ttmp15 // GFX12: encoding: [0x05,0x00,0x03,0xd7,0x6a,0xf6,0x00,0x00] -v_add_nc_u16 v5, vcc_hi, 0xfe0b +v_add_nc_u16 v5.l, vcc_hi, 0xfe0b // GFX12: encoding: [0x05,0x00,0x03,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] -v_add_nc_u16 v5, ttmp15, src_scc +v_add_nc_u16 v5.l, 
ttmp15, src_scc // GFX12: encoding: [0x05,0x00,0x03,0xd7,0x7b,0xfa,0x01,0x00] -v_add_nc_u16 v5, m0, 0.5 +v_add_nc_u16 v5.l, m0, 0.5 // GFX12: encoding: [0x05,0x00,0x03,0xd7,0x7d,0xe0,0x01,0x00] -v_add_nc_u16 v5, exec_lo, -1 +v_add_nc_u16 v5.l, exec_lo, -1 // GFX12: encoding: [0x05,0x00,0x03,0xd7,0x7e,0x82,0x01,0x00] -v_add_nc_u16 v5, exec_hi, null +v_add_nc_u16 v5.l, exec_hi, null // GFX12: encoding: [0x05,0x00,0x03,0xd7,0x7f,0xf8,0x00,0x00] -v_add_nc_u16 v5, null, exec_lo op_sel:[1,1,1] +v_add_nc_u16 v5.h, null, exec_lo op_sel:[1,1,1] // GFX12: encoding: [0x05,0x58,0x03,0xd7,0x7c,0xfc,0x00,0x00] -v_add_nc_u16 v5, -1, exec_hi op_sel:[0,0,0] +v_add_nc_u16 v5.l, -1, exec_hi op_sel:[0,0,0] // GFX12: encoding: [0x05,0x00,0x03,0xd7,0xc1,0xfe,0x00,0x00] -v_add_nc_u16 v5, 0.5, m0 op_sel:[1,0,0] +v_add_nc_u16 v5.l, 0.5, m0 op_sel:[1,0,0] // GFX12: encoding: [0x05,0x08,0x03,0xd7,0xf0,0xfa,0x00,0x00] -v_add_nc_u16 v5, src_scc, vcc_lo op_sel:[0,1,0] +v_add_nc_u16 v5.l, src_scc, vcc_lo op_sel:[0,1,0] // GFX12: encoding: [0x05,0x10,0x03,0xd7,0xfd,0xd4,0x00,0x00] -v_add_nc_u16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp +v_add_nc_u16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp +// GFX12: encoding: [0xff,0xc0,0x03,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] + +v_add_nc_u16 v255.h, 0xfe0b, vcc_hi clamp // GFX12: encoding: [0xff,0xc0,0x03,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] v_alignbit_b32 v5, v1, v2, s3 @@ -5696,49 +5714,58 @@ v_sub_co_u32 v5, ttmp[14:15], src_scc, vcc_lo v_sub_co_u32 v255, null, 0xaf123456, vcc_hi clamp // GFX12: encoding: [0xff,0xfc,0x01,0xd7,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf] -v_sub_nc_i16 v5, v1, v2 +v_sub_nc_i16 v5.l, v1.l, v2.l // GFX12: encoding: [0x05,0x00,0x0e,0xd7,0x01,0x05,0x02,0x00] -v_sub_nc_i16 v5, v255, v255 +v_sub_nc_i16 v5.l, v1.h, v2.l +// GFX12: encoding: [0x05,0x08,0x0e,0xd7,0x01,0x05,0x02,0x00] + +v_sub_nc_i16 v5.l, v255.l, v255.l // GFX12: encoding: [0x05,0x00,0x0e,0xd7,0xff,0xff,0x03,0x00] -v_sub_nc_i16 v5, s1, s2 +v_sub_nc_i16 v5.l, v255.l, v255.h +// GFX12: encoding: [0x05,0x10,0x0e,0xd7,0xff,0xff,0x03,0x00] + +v_sub_nc_i16 v5.l, s1, s2 // GFX12: encoding: [0x05,0x00,0x0e,0xd7,0x01,0x04,0x00,0x00] -v_sub_nc_i16 v5, s105, s105 +v_sub_nc_i16 v5.l, s105, s105 // GFX12: encoding: [0x05,0x00,0x0e,0xd7,0x69,0xd2,0x00,0x00] -v_sub_nc_i16 v5, vcc_lo, ttmp15 +v_sub_nc_i16 v5.l, vcc_lo, ttmp15 // GFX12: encoding: [0x05,0x00,0x0e,0xd7,0x6a,0xf6,0x00,0x00] -v_sub_nc_i16 v5, vcc_hi, 0xfe0b +v_sub_nc_i16 v5.l, vcc_hi, 0xfe0b // GFX12: encoding: [0x05,0x00,0x0e,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] -v_sub_nc_i16 v5, ttmp15, src_scc +v_sub_nc_i16 v5.l, ttmp15, src_scc // GFX12: encoding: [0x05,0x00,0x0e,0xd7,0x7b,0xfa,0x01,0x00] -v_sub_nc_i16 v5, m0, 0.5 +v_sub_nc_i16 v5.l, m0, 0.5 // GFX12: encoding: [0x05,0x00,0x0e,0xd7,0x7d,0xe0,0x01,0x00] -v_sub_nc_i16 v5, exec_lo, -1 +v_sub_nc_i16 v5.l, exec_lo, -1 // GFX12: encoding: [0x05,0x00,0x0e,0xd7,0x7e,0x82,0x01,0x00] -v_sub_nc_i16 v5, exec_hi, null +v_sub_nc_i16 v5.l, exec_hi, null // GFX12: encoding: [0x05,0x00,0x0e,0xd7,0x7f,0xf8,0x00,0x00] -v_sub_nc_i16 v5, null, exec_lo op_sel:[1,1,1] +v_sub_nc_i16 v5.h, null, exec_lo op_sel:[1,1,1] // GFX12: encoding: [0x05,0x58,0x0e,0xd7,0x7c,0xfc,0x00,0x00] -v_sub_nc_i16 v5, -1, exec_hi op_sel:[0,0,0] +v_sub_nc_i16 v5.l, -1, exec_hi op_sel:[0,0,0] // GFX12: encoding: [0x05,0x00,0x0e,0xd7,0xc1,0xfe,0x00,0x00] -v_sub_nc_i16 v5, 0.5, m0 op_sel:[1,0,0] +v_sub_nc_i16 v5.l, 0.5, m0 op_sel:[1,0,0] // GFX12: encoding: [0x05,0x08,0x0e,0xd7,0xf0,0xfa,0x00,0x00] -v_sub_nc_i16 v5, 
src_scc, vcc_lo op_sel:[0,1,0] +v_sub_nc_i16 v5.l, src_scc, vcc_lo op_sel:[0,1,0] // GFX12: encoding: [0x05,0x10,0x0e,0xd7,0xfd,0xd4,0x00,0x00] -v_sub_nc_i16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp +v_sub_nc_i16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp +// GFX12: encoding: [0xff,0xc0,0x0e,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] + +v_sub_nc_i16 v255.h, 0xfe0b, vcc_hi clamp // GFX12: encoding: [0xff,0xc0,0x0e,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] v_sub_nc_i32 v5, v1, v2 @@ -5786,49 +5813,58 @@ v_sub_nc_i32 v5, src_scc, vcc_lo v_sub_nc_i32 v255, 0xaf123456, vcc_hi clamp // GFX12: encoding: [0xff,0x80,0x25,0xd7,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf] -v_sub_nc_u16 v5, v1, v2 +v_sub_nc_u16 v5.l, v1.l, v2.l // GFX12: encoding: [0x05,0x00,0x04,0xd7,0x01,0x05,0x02,0x00] -v_sub_nc_u16 v5, v255, v255 +v_sub_nc_u16 v5.l, v1.h, v2.l +// GFX12: encoding: [0x05,0x08,0x04,0xd7,0x01,0x05,0x02,0x00] + +v_sub_nc_u16 v5.l, v255.l, v255.l // GFX12: encoding: [0x05,0x00,0x04,0xd7,0xff,0xff,0x03,0x00] -v_sub_nc_u16 v5, s1, s2 +v_sub_nc_u16 v5.l, v255.l, v255.h +// GFX12: encoding: [0x05,0x10,0x04,0xd7,0xff,0xff,0x03,0x00] + +v_sub_nc_u16 v5.l, s1, s2 // GFX12: encoding: [0x05,0x00,0x04,0xd7,0x01,0x04,0x00,0x00] -v_sub_nc_u16 v5, s105, s105 +v_sub_nc_u16 v5.l, s105, s105 // GFX12: encoding: [0x05,0x00,0x04,0xd7,0x69,0xd2,0x00,0x00] -v_sub_nc_u16 v5, vcc_lo, ttmp15 +v_sub_nc_u16 v5.l, vcc_lo, ttmp15 // GFX12: encoding: [0x05,0x00,0x04,0xd7,0x6a,0xf6,0x00,0x00] -v_sub_nc_u16 v5, vcc_hi, 0xfe0b +v_sub_nc_u16 v5.l, vcc_hi, 0xfe0b // GFX12: encoding: [0x05,0x00,0x04,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] -v_sub_nc_u16 v5, ttmp15, src_scc +v_sub_nc_u16 v5.l, ttmp15, src_scc // GFX12: encoding: [0x05,0x00,0x04,0xd7,0x7b,0xfa,0x01,0x00] -v_sub_nc_u16 v5, m0, 0.5 +v_sub_nc_u16 v5.l, m0, 0.5 // GFX12: encoding: [0x05,0x00,0x04,0xd7,0x7d,0xe0,0x01,0x00] -v_sub_nc_u16 v5, exec_lo, -1 +v_sub_nc_u16 v5.l, exec_lo, -1 // GFX12: encoding: [0x05,0x00,0x04,0xd7,0x7e,0x82,0x01,0x00] -v_sub_nc_u16 v5, exec_hi, null +v_sub_nc_u16 v5.l, exec_hi, null // GFX12: encoding: [0x05,0x00,0x04,0xd7,0x7f,0xf8,0x00,0x00] -v_sub_nc_u16 v5, null, exec_lo op_sel:[1,1,1] +v_sub_nc_u16 v5.h, null, exec_lo op_sel:[1,1,1] // GFX12: encoding: [0x05,0x58,0x04,0xd7,0x7c,0xfc,0x00,0x00] -v_sub_nc_u16 v5, -1, exec_hi op_sel:[0,0,0] +v_sub_nc_u16 v5.l, -1, exec_hi op_sel:[0,0,0] // GFX12: encoding: [0x05,0x00,0x04,0xd7,0xc1,0xfe,0x00,0x00] -v_sub_nc_u16 v5, 0.5, m0 op_sel:[1,0,0] +v_sub_nc_u16 v5.l, 0.5, m0 op_sel:[1,0,0] // GFX12: encoding: [0x05,0x08,0x04,0xd7,0xf0,0xfa,0x00,0x00] -v_sub_nc_u16 v5, src_scc, vcc_lo op_sel:[0,1,0] +v_sub_nc_u16 v5.l, src_scc, vcc_lo op_sel:[0,1,0] // GFX12: encoding: [0x05,0x10,0x04,0xd7,0xfd,0xd4,0x00,0x00] -v_sub_nc_u16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp +v_sub_nc_u16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp +// GFX12: encoding: [0xff,0xc0,0x04,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] + +v_sub_nc_u16 v255.h, 0xfe0b, vcc_hi clamp // GFX12: encoding: [0xff,0xc0,0x04,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] v_subrev_co_u32 v5, s6, v1, v2 diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp16.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp16.s index adf37901fc85fd4cfc18c8c38fc223229e5593d0..56bd0ee4b47465c03546b3f20259710d777245ff 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp16.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp16.s @@ -214,47 +214,71 @@ v_add_lshl_u32_e64_dpp v5, v1, v2, 0.5 row_xmask:0 row_mask:0x1 bank_mask:0x3 bo v_add_lshl_u32_e64_dpp v255, v255, v255, src_scc 
row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX12: [0xff,0x00,0x47,0xd6,0xfa,0xfe,0xf7,0x03,0xff,0x6f,0x05,0x30] -v_add_nc_i16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] -// GFX12: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +v_add_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 +// GFX12: [0xff,0xc0,0x0d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] -v_add_nc_i16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] -// GFX12: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +v_add_nc_i16_e64_dpp v255.l, v255.l, v255.l clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: [0xff,0x80,0x0d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] -v_add_nc_i16_e64_dpp v5, v1, v2 row_mirror -// GFX12: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +v_add_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: [0x05,0x58,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -v_add_nc_i16_e64_dpp v5, v1, v2 row_half_mirror -// GFX12: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +v_add_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: [0x05,0x08,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] -v_add_nc_i16_e64_dpp v5, v1, v2 row_shl:1 -// GFX12: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +v_add_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 +// GFX12: [0x05,0x10,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] -v_add_nc_i16_e64_dpp v5, v1, v2 row_shl:15 -// GFX12: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l quad_perm:[0,1,2,3] +// GFX12: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -v_add_nc_i16_e64_dpp v5, v1, v2 row_shr:1 -// GFX12: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l quad_perm:[3,2,1,0] +// GFX12: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] -v_add_nc_i16_e64_dpp v5, v1, v2 row_shr:15 -// GFX12: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_half_mirror +// GFX12: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] + +v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_mirror +// GFX12: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -v_add_nc_i16_e64_dpp v5, v1, v2 row_ror:1 +v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_ror:1 // GFX12: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -v_add_nc_i16_e64_dpp v5, v1, v2 row_ror:15 +v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_ror:15 // GFX12: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -v_add_nc_i16_e64_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf +v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf // GFX12: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -v_add_nc_i16_e64_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 +v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 // GFX12: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] -v_add_nc_i16_e64_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shl:1 +// GFX12: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] + +v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shl:15 +// GFX12: 
[0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] + +v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shr:1 +// GFX12: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] + +v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shr:15 +// GFX12: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] + +v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 // GFX12: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] -v_add_nc_i16_e64_dpp v255, v255, v255 clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX12: [0xff,0x80,0x0d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] +v_add_nc_i16_e64_dpp v5.h, v1.h, v2.h row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: [0x05,0x58,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] + +v_add_nc_i16_e64_dpp v5.l, v1.h, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: [0x05,0x08,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] + +v_add_nc_i16_e64_dpp v5.l, v1.l, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: [0x05,0x10,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] + +v_add_nc_i16_e64_dpp v255.h, v255.l, v255.l clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: [0xff,0xc0,0x0d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] v_add_nc_i32_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] // GFX12: [0x05,0x00,0x26,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] @@ -298,47 +322,71 @@ v_add_nc_i32_e64_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctr v_add_nc_i32_e64_dpp v255, v255, v255 clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX12: [0xff,0x80,0x26,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] -v_add_nc_u16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] -// GFX12: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +v_add_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 +// GFX12: [0xff,0xc0,0x03,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] -v_add_nc_u16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] -// GFX12: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +v_add_nc_u16_e64_dpp v255.l, v255.l, v255.l clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: [0xff,0x80,0x03,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] -v_add_nc_u16_e64_dpp v5, v1, v2 row_mirror -// GFX12: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +v_add_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: [0x05,0x58,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -v_add_nc_u16_e64_dpp v5, v1, v2 row_half_mirror -// GFX12: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +v_add_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: [0x05,0x08,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] -v_add_nc_u16_e64_dpp v5, v1, v2 row_shl:1 -// GFX12: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +v_add_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 +// GFX12: [0x05,0x10,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] -v_add_nc_u16_e64_dpp v5, v1, v2 row_shl:15 -// GFX12: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l quad_perm:[0,1,2,3] +// GFX12: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -v_add_nc_u16_e64_dpp v5, v1, v2 row_shr:1 -// GFX12: 
[0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l quad_perm:[3,2,1,0] +// GFX12: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] -v_add_nc_u16_e64_dpp v5, v1, v2 row_shr:15 -// GFX12: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_half_mirror +// GFX12: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] + +v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_mirror +// GFX12: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -v_add_nc_u16_e64_dpp v5, v1, v2 row_ror:1 +v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_ror:1 // GFX12: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -v_add_nc_u16_e64_dpp v5, v1, v2 row_ror:15 +v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_ror:15 // GFX12: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -v_add_nc_u16_e64_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf +v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf // GFX12: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -v_add_nc_u16_e64_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 +v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 // GFX12: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] -v_add_nc_u16_e64_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shl:1 +// GFX12: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] + +v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shl:15 +// GFX12: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] + +v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shr:1 +// GFX12: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] + +v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shr:15 +// GFX12: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] + +v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 // GFX12: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] -v_add_nc_u16_e64_dpp v255, v255, v255 clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX12: [0xff,0x80,0x03,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] +v_add_nc_u16_e64_dpp v5.h, v1.h, v2.h row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: [0x05,0x58,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] + +v_add_nc_u16_e64_dpp v5.l, v1.h, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: [0x05,0x08,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] + +v_add_nc_u16_e64_dpp v5.l, v1.l, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: [0x05,0x10,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] + +v_add_nc_u16_e64_dpp v255.h, v255.l, v255.l clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: [0xff,0xc0,0x03,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] v_alignbit_b32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] // GFX12: [0x05,0x00,0x16,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] @@ -4622,47 +4670,71 @@ v_sub_co_u32_e64_dpp v5, ttmp[14:15], v1, v2 row_xmask:0 row_mask:0x1 bank_mask: v_sub_co_u32_e64_dpp v255, null, v255, v255 clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX12: [0xff,0xfc,0x01,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] -v_sub_nc_i16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] -// GFX12: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +v_sub_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 
row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 +// GFX12: [0xff,0xc0,0x0e,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] -v_sub_nc_i16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] -// GFX12: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +v_sub_nc_i16_e64_dpp v255.l, v255.l, v255.l clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: [0xff,0x80,0x0e,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] -v_sub_nc_i16_e64_dpp v5, v1, v2 row_mirror -// GFX12: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +v_sub_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: [0x05,0x58,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -v_sub_nc_i16_e64_dpp v5, v1, v2 row_half_mirror -// GFX12: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +v_sub_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: [0x05,0x08,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] -v_sub_nc_i16_e64_dpp v5, v1, v2 row_shl:1 -// GFX12: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 +// GFX12: [0x05,0x10,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] -v_sub_nc_i16_e64_dpp v5, v1, v2 row_shl:15 -// GFX12: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l quad_perm:[0,1,2,3] +// GFX12: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -v_sub_nc_i16_e64_dpp v5, v1, v2 row_shr:1 -// GFX12: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l quad_perm:[3,2,1,0] +// GFX12: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] -v_sub_nc_i16_e64_dpp v5, v1, v2 row_shr:15 -// GFX12: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_half_mirror +// GFX12: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -v_sub_nc_i16_e64_dpp v5, v1, v2 row_ror:1 +v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_mirror +// GFX12: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] + +v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_ror:1 // GFX12: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -v_sub_nc_i16_e64_dpp v5, v1, v2 row_ror:15 +v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_ror:15 // GFX12: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -v_sub_nc_i16_e64_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf +v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf // GFX12: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -v_sub_nc_i16_e64_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 +v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 // GFX12: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] -v_sub_nc_i16_e64_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shl:1 +// GFX12: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] + +v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shl:15 +// GFX12: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] + +v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shr:1 +// GFX12: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] + +v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shr:15 +// GFX12: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] + +v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l 
row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 // GFX12: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] -v_sub_nc_i16_e64_dpp v255, v255, v255 clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX12: [0xff,0x80,0x0e,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] +v_sub_nc_i16_e64_dpp v5.h, v1.h, v2.h row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: [0x05,0x58,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] + +v_sub_nc_i16_e64_dpp v5.l, v1.h, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: [0x05,0x08,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] + +v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: [0x05,0x10,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] + +v_sub_nc_i16_e64_dpp v255.h, v255.l, v255.l clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: [0xff,0xc0,0x0e,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] v_sub_nc_i32_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] // GFX12: [0x05,0x00,0x25,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] @@ -4706,47 +4778,71 @@ v_sub_nc_i32_e64_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctr v_sub_nc_i32_e64_dpp v255, v255, v255 clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX12: [0xff,0x80,0x25,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] -v_sub_nc_u16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] -// GFX12: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +v_sub_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 +// GFX12: [0xff,0xc0,0x04,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] -v_sub_nc_u16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] -// GFX12: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +v_sub_nc_u16_e64_dpp v255.l, v255.l, v255.l clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: [0xff,0x80,0x04,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] -v_sub_nc_u16_e64_dpp v5, v1, v2 row_mirror -// GFX12: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +v_sub_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: [0x05,0x58,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -v_sub_nc_u16_e64_dpp v5, v1, v2 row_half_mirror -// GFX12: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +v_sub_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: [0x05,0x08,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] -v_sub_nc_u16_e64_dpp v5, v1, v2 row_shl:1 -// GFX12: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 +// GFX12: [0x05,0x10,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] -v_sub_nc_u16_e64_dpp v5, v1, v2 row_shl:15 -// GFX12: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l quad_perm:[0,1,2,3] +// GFX12: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -v_sub_nc_u16_e64_dpp v5, v1, v2 row_shr:1 -// GFX12: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l quad_perm:[3,2,1,0] +// GFX12: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] -v_sub_nc_u16_e64_dpp v5, v1, v2 row_shr:15 -// GFX12: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_half_mirror +// GFX12: 
[0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -v_sub_nc_u16_e64_dpp v5, v1, v2 row_ror:1 +v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_mirror +// GFX12: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] + +v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_ror:1 // GFX12: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -v_sub_nc_u16_e64_dpp v5, v1, v2 row_ror:15 +v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_ror:15 // GFX12: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -v_sub_nc_u16_e64_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf +v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf // GFX12: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -v_sub_nc_u16_e64_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 +v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 // GFX12: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] -v_sub_nc_u16_e64_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shl:1 +// GFX12: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] + +v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shl:15 +// GFX12: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] + +v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shr:1 +// GFX12: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] + +v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shr:15 +// GFX12: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] + +v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 // GFX12: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] -v_sub_nc_u16_e64_dpp v255, v255, v255 clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX12: [0xff,0x80,0x04,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] +v_sub_nc_u16_e64_dpp v5.h, v1.h, v2.h row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: [0x05,0x58,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] + +v_sub_nc_u16_e64_dpp v5.l, v1.h, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: [0x05,0x08,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] + +v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: [0x05,0x10,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] + +v_sub_nc_u16_e64_dpp v255.h, v255.l, v255.l clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: [0xff,0xc0,0x04,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] v_subrev_co_u32_e64_dpp v5, s6, v1, v2 quad_perm:[3,2,1,0] // W32: [0x05,0x06,0x02,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] @@ -5001,30 +5097,6 @@ v_xor_b16_e64_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 v_xor_b16_e64_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX12: [0xff,0x00,0x64,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] -v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf -// GFX12: [0x05,0x58,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] - -v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX12: [0x05,0x08,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] - -v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 -// GFX12: [0x05,0x10,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] - -v_add_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 
fi:1 -// GFX12: [0xff,0xc0,0x0d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] - -v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf -// GFX12: [0x05,0x58,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] - -v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX12: [0x05,0x08,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] - -v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 -// GFX12: [0x05,0x10,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] - -v_add_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 -// GFX12: [0xff,0xc0,0x03,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] - v_cvt_pk_norm_i16_f16_e64_dpp v5, -v1, |v2| op_sel:[1,0,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 // GFX12: [0x05,0x0a,0x12,0xd7,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] @@ -5250,30 +5322,6 @@ v_pack_b32_f16_e64_dpp v5, -v1, |v2| op_sel:[1,0,0] row_xmask:0 row_mask:0x1 ban v_pack_b32_f16_e64_dpp v255, -|v255|, -|v255| op_sel:[0,1,0] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 // GFX12: [0xff,0x13,0x11,0xd7,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] -v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf -// GFX12: [0x05,0x58,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] - -v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX12: [0x05,0x08,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] - -v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 -// GFX12: [0x05,0x10,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] - -v_sub_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 -// GFX12: [0xff,0xc0,0x0e,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] - -v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf -// GFX12: [0x05,0x58,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] - -v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX12: [0x05,0x08,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] - -v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 -// GFX12: [0x05,0x10,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] - -v_sub_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 -// GFX12: [0xff,0xc0,0x04,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] - v_dot2_f16_f16_e64_dpp v0, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 fi:1 // GFX12: [0x00,0x00,0x66,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x04,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp8.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp8.s index 1be122faccbc9296894c932d29fec414d53fc016..6331d22c6976d9c7bbc0aa4efd911fbed419c0d3 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp8.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp8.s @@ -134,14 +134,38 @@ v_add_lshl_u32_e64_dpp v5, v1, v2, 0.5 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_add_lshl_u32_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX12: [0xff,0x00,0x47,0xd6,0xe9,0xfe,0xf7,0x03,0xff,0x00,0x00,0x00] -v_add_nc_i16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +v_add_nc_i16_e64_dpp v255.l, v255.l, v255.l clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: [0xff,0x80,0x0d,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] + +v_add_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp 
dpp8:[0,0,0,0,0,0,0,0] fi:1 +// GFX12: [0xff,0xc0,0x0d,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] + +v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] // GFX12: [0x05,0x00,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -v_add_nc_i16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 +v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 // GFX12: [0x05,0x00,0x0d,0xd7,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -v_add_nc_i16_e64_dpp v255, v255, v255 clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX12: [0xff,0x80,0x0d,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +v_add_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] +// GFX12: [0x05,0x10,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_add_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] +// GFX12: [0x05,0x08,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_add_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] +// GFX12: [0x05,0x58,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_add_nc_i16_e64_dpp v5.h, v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: [0x05,0x58,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_add_nc_i16_e64_dpp v5.l, v1.h, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: [0x05,0x08,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_add_nc_i16_e64_dpp v5.l, v1.l, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: [0x05,0x10,0x0d,0xd7,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_add_nc_i16_e64_dpp v255.h, v255.l, v255.l clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: [0xff,0xc0,0x0d,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] v_add_nc_i32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX12: [0x05,0x00,0x26,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] @@ -152,14 +176,38 @@ v_add_nc_i32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_add_nc_i32_e64_dpp v255, v255, v255 clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX12: [0xff,0x80,0x26,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] -v_add_nc_u16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +v_add_nc_u16_e64_dpp v255.l, v255.l, v255.l clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: [0xff,0x80,0x03,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] + +v_add_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 +// GFX12: [0xff,0xc0,0x03,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] + +v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] // GFX12: [0x05,0x00,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -v_add_nc_u16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 +v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 // GFX12: [0x05,0x00,0x03,0xd7,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -v_add_nc_u16_e64_dpp v255, v255, v255 clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX12: [0xff,0x80,0x03,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +v_add_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] +// GFX12: [0x05,0x10,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_add_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] +// GFX12: [0x05,0x08,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_add_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] +// GFX12: [0x05,0x58,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_add_nc_u16_e64_dpp v5.h, v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: [0x05,0x58,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_add_nc_u16_e64_dpp v5.l, v1.h, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: [0x05,0x08,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + 
+v_add_nc_u16_e64_dpp v5.l, v1.l, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: [0x05,0x10,0x03,0xd7,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_add_nc_u16_e64_dpp v255.h, v255.l, v255.l clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: [0xff,0xc0,0x03,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] v_alignbit_b32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] // GFX12: [0x05,0x00,0x16,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] @@ -3043,14 +3091,38 @@ v_sub_co_u32_e64_dpp v5, ttmp[14:15], v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_sub_co_u32_e64_dpp v255, null, v255, v255 clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX12: [0xff,0xfc,0x01,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] -v_sub_nc_i16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +v_sub_nc_i16_e64_dpp v255.l, v255.l, v255.l clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: [0xff,0x80,0x0e,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] + +v_sub_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 +// GFX12: [0xff,0xc0,0x0e,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] + +v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] // GFX12: [0x05,0x00,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -v_sub_nc_i16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 +v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 // GFX12: [0x05,0x00,0x0e,0xd7,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -v_sub_nc_i16_e64_dpp v255, v255, v255 clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX12: [0xff,0x80,0x0e,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] +// GFX12: [0x05,0x10,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_sub_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] +// GFX12: [0x05,0x08,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_sub_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] +// GFX12: [0x05,0x58,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_sub_nc_i16_e64_dpp v5.h, v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: [0x05,0x58,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_sub_nc_i16_e64_dpp v5.l, v1.h, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: [0x05,0x08,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: [0x05,0x10,0x0e,0xd7,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_sub_nc_i16_e64_dpp v255.h, v255.l, v255.l clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: [0xff,0xc0,0x0e,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] v_sub_nc_i32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX12: [0x05,0x00,0x25,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] @@ -3061,14 +3133,38 @@ v_sub_nc_i32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_sub_nc_i32_e64_dpp v255, v255, v255 clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX12: [0xff,0x80,0x25,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] -v_sub_nc_u16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +v_sub_nc_u16_e64_dpp v255.l, v255.l, v255.l clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: [0xff,0x80,0x04,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] + +v_sub_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 +// GFX12: [0xff,0xc0,0x04,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] + +v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] // GFX12: [0x05,0x00,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -v_sub_nc_u16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 +v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 
// GFX12: [0x05,0x00,0x04,0xd7,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -v_sub_nc_u16_e64_dpp v255, v255, v255 clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX12: [0xff,0x80,0x04,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] +// GFX12: [0x05,0x10,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_sub_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] +// GFX12: [0x05,0x08,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_sub_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] +// GFX12: [0x05,0x58,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_sub_nc_u16_e64_dpp v5.h, v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: [0x05,0x58,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_sub_nc_u16_e64_dpp v5.l, v1.h, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: [0x05,0x08,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: [0x05,0x10,0x04,0xd7,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_sub_nc_u16_e64_dpp v255.h, v255.l, v255.l clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: [0xff,0xc0,0x04,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] v_subrev_co_u32_e64_dpp v5, s6, v1, v2 dpp8:[7,6,5,4,3,2,1,0] // W32: [0x05,0x06,0x02,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] @@ -3210,30 +3306,6 @@ v_xor_b16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_xor_b16_e64_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX12: [0xff,0x00,0x64,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] -v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] -// GFX12: [0x05,0x58,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] - -v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] -// GFX12: [0x05,0x08,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] - -v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] -// GFX12: [0x05,0x10,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] - -v_add_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 -// GFX12: [0xff,0xc0,0x0d,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] - -v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] -// GFX12: [0x05,0x58,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] - -v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] -// GFX12: [0x05,0x08,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] - -v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] -// GFX12: [0x05,0x10,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] - -v_add_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 -// GFX12: [0xff,0xc0,0x03,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] - v_cvt_pk_norm_i16_f16_e64_dpp v5, -v1, |v2| op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] // GFX12: [0x05,0x0a,0x12,0xd7,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] @@ -3459,30 +3531,6 @@ v_pack_b32_f16_e64_dpp v5, -v1, |v2| op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] v_pack_b32_f16_e64_dpp v255, -|v255|, -|v255| op_sel:[0,1,0] dpp8:[0,0,0,0,0,0,0,0] fi:1 // GFX12: [0xff,0x13,0x11,0xd7,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] -v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] -// GFX12: [0x05,0x58,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] - -v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] -// GFX12: [0x05,0x08,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] - -v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] 
dpp8:[7,6,5,4,3,2,1,0] -// GFX12: [0x05,0x10,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] - -v_sub_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 -// GFX12: [0xff,0xc0,0x0e,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] - -v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] -// GFX12: [0x05,0x58,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] - -v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] -// GFX12: [0x05,0x08,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] - -v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] -// GFX12: [0x05,0x10,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] - -v_sub_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 -// GFX12: [0xff,0xc0,0x04,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] - v_dot2_f16_f16_e64_dpp v0, v1, v2, v3 dpp8:[0,1,2,3,4,4,4,4] // GFX12: [0x00,0x00,0x66,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x88,0x46,0x92] diff --git a/llvm/test/MC/ARM/vlstm-vlldm-diag.s b/llvm/test/MC/ARM/vlstm-vlldm-diag.s index b57f535c6a25cf7d4ee479a795b9ec6204e696aa..7aa48b96ff2f69fa3e71fc1601703065b6f7e46b 100644 --- a/llvm/test/MC/ARM/vlstm-vlldm-diag.s +++ b/llvm/test/MC/ARM/vlstm-vlldm-diag.s @@ -36,6 +36,14 @@ vlldm r8, {d3 - d31} // ERR: error: operand must be exactly {d0-d15} (T1) or {d0-d31} (T2) // ERR-NEXT: vlldm r8, {d3 - d31} +vlstm r8, {d31} +// ERR: error: operand must be exactly {d0-d15} (T1) or {d0-d31} (T2) +// ERR-NEXT: vlstm r8, {d31} + +vlldm r8, {d31} +// ERR: error: operand must be exactly {d0-d15} (T1) or {d0-d31} (T2) +// ERR-NEXT: vlldm r8, {d31} + vlstm r8, {d0 - d35} // ERR: error: register expected // ERR-NEXT: vlstm r8, {d0 - d35} diff --git a/llvm/test/MC/ARM/vscclrm-asm.s b/llvm/test/MC/ARM/vscclrm-asm.s index 0989b38b07c06efe7091e943fadf537f2b677b6a..0d2054df4fd345510f1851d0c19fa42564c9bc62 100644 --- a/llvm/test/MC/ARM/vscclrm-asm.s +++ b/llvm/test/MC/ARM/vscclrm-asm.s @@ -35,11 +35,62 @@ it hi // CHECK: vscclrmhi {s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15, s16, s17, s18, s19, s20, s21, s22, s23, s24, s25, s26, s27, s28, s29, s30, s31, vpr} @ encoding: [0xdf,0xec,0x1d,0x1a] vscclrmhi {s3-s31, vpr} +// CHECK: vscclrm {vpr} @ encoding: [0x9f,0xec,0x00,0x0a] +vscclrm {vpr} + +// CHECK: vscclrm {d0, d1, d2, d3, d4, d5, d6, d7, d8, d9, d10, d11, d12, d13, d14, d15, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31, vpr} @ encoding: [0x9f,0xec,0x40,0x0b] +vscclrm {d0-d31, vpr} + +// CHECK: vscclrm {d31, vpr} @ encoding: [0xdf,0xec,0x02,0xfb] +vscclrm {d31, vpr} + +// CHECK: vscclrm {s31, d16, vpr} @ encoding: [0xdf,0xec,0x03,0xfa] +vscclrm {s31, d16, vpr} + +// CHECK: vscclrm {s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15, s16, s17, s18, s19, s20, s21, s22, s23, s24, s25, s26, s27, s28, s29, s30, s31, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31, vpr} @ encoding: [0x9f,0xec,0x40,0x0a] +vscclrm {s0-s31, d16-d31, vpr} + +// CHECK: vscclrm {s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15, s16, s17, s18, s19, s20, s21, s22, s23, s24, s25, s26, s27, s28, s29, s30, s31, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31, vpr} @ encoding: [0x9f,0xec,0x40,0x0a] +vscclrm {s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15, s16, s17, s18, s19, s20, s21, s22, s23, s24, s25, s26, s27, s28, s29, s30, s31, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, 
d28, d29, d30, d31, vpr} + // ERROR: non-contiguous register range vscclrm {s0, s3-s4, vpr} +// ERROR: non-contiguous register range +vscclrm {s31, d16, s30, vpr} + // ERROR: register expected vscclrm {s32, vpr} +// ERROR: register expected +vscclrm {d32, vpr} + +// ERROR: register expected +vscclrm {s31-s32, vpr} + +// ERROR: register expected +vscclrm {d31-d32, vpr} + // ERROR: invalid operand for instruction vscclrm {s0-s1} + +// ERROR: register list not in ascending order +vscclrm {vpr, s0} + +// ERROR: register list not in ascending order +vscclrm {vpr, s31} + +// ERROR: register list not in ascending order +vscclrm {vpr, d0} + +// ERROR: register list not in ascending order +vscclrm {vpr, d31} + +// ERROR: invalid register in register list +vscclrm {s0, d0, vpr} + +// ERROR: invalid register in register list +vscclrm {s0, d1, vpr} + +// ERROR: invalid register in register list +vscclrm {d16, s31, vpr} diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3.txt index 07058a6451592015e449b5d1be5c224afd01ad3c..365caa5f9b6d09824446c648a7c443205330906d 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3.txt @@ -189,49 +189,112 @@ # GFX11: v_add_lshl_u32 v255, 0xaf123456, vcc_hi, null ; encoding: [0xff,0x00,0x47,0xd6,0xff,0xd6,0xf0,0x01,0x56,0x34,0x12,0xaf] 0xff,0x00,0x47,0xd6,0xff,0xd6,0xf0,0x01,0x56,0x34,0x12,0xaf -# GFX11: v_add_nc_i16 v5, v1, v2 ; encoding: [0x05,0x00,0x0d,0xd7,0x01,0x05,0x02,0x00] -0x05,0x00,0x0d,0xd7,0x01,0x05,0x02,0x00 - -# GFX11: v_add_nc_i16 v5, v255, v255 ; encoding: [0x05,0x00,0x0d,0xd7,0xff,0xff,0x03,0x00] -0x05,0x00,0x0d,0xd7,0xff,0xff,0x03,0x00 - -# GFX11: v_add_nc_i16 v5, s1, s2 ; encoding: [0x05,0x00,0x0d,0xd7,0x01,0x04,0x00,0x00] +# W32-REAL16: v_add_nc_i16 v5.l, v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x05,0x08,0x0d,0xd7,0x01,0x05,0x02,0x00] +# W32-FAKE16: v_add_nc_i16 v5, v1, v2 op_sel:[1,0,0] ; encoding: [0x05,0x08,0x0d,0xd7,0x01,0x05,0x02,0x00] +# W64-REAL16: v_add_nc_i16 v5.l, v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x05,0x08,0x0d,0xd7,0x01,0x05,0x02,0x00] +# W64-FAKE16: v_add_nc_i16 v5, v1, v2 op_sel:[1,0,0] ; encoding: [0x05,0x08,0x0d,0xd7,0x01,0x05,0x02,0x00] +0x05,0x08,0x0d,0xd7,0x01,0x05,0x02,0x00 + +# W32-REAL16: v_add_nc_i16 v5.l, v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xff,0xff,0x03,0x00] +# W32-FAKE16: v_add_nc_i16 v5, v255, v255 op_sel:[0,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xff,0xff,0x03,0x00] +# W64-REAL16: v_add_nc_i16 v5.l, v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xff,0xff,0x03,0x00] +# W64-FAKE16: v_add_nc_i16 v5, v255, v255 op_sel:[0,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xff,0xff,0x03,0x00] +0x05,0x10,0x0d,0xd7,0xff,0xff,0x03,0x00 + +# W32-REAL16: v_add_nc_i16 v5.l, s1, s2 ; encoding: [0x05,0x00,0x0d,0xd7,0x01,0x04,0x00,0x00] +# W32-FAKE16: v_add_nc_i16 v5, s1, s2 ; encoding: [0x05,0x00,0x0d,0xd7,0x01,0x04,0x00,0x00] +# W64-REAL16: v_add_nc_i16 v5.l, s1, s2 ; encoding: [0x05,0x00,0x0d,0xd7,0x01,0x04,0x00,0x00] +# W64-FAKE16: v_add_nc_i16 v5, s1, s2 ; encoding: [0x05,0x00,0x0d,0xd7,0x01,0x04,0x00,0x00] 0x05,0x00,0x0d,0xd7,0x01,0x04,0x00,0x00 -# GFX11: v_add_nc_i16 v5, s105, s105 ; encoding: [0x05,0x00,0x0d,0xd7,0x69,0xd2,0x00,0x00] +# W32-REAL16: v_add_nc_i16 v5.l, s105, s105 ; encoding: [0x05,0x00,0x0d,0xd7,0x69,0xd2,0x00,0x00] +# W32-FAKE16: v_add_nc_i16 v5, s105, s105 ; encoding: [0x05,0x00,0x0d,0xd7,0x69,0xd2,0x00,0x00] +# W64-REAL16: v_add_nc_i16 v5.l, 
s105, s105 ; encoding: [0x05,0x00,0x0d,0xd7,0x69,0xd2,0x00,0x00] +# W64-FAKE16: v_add_nc_i16 v5, s105, s105 ; encoding: [0x05,0x00,0x0d,0xd7,0x69,0xd2,0x00,0x00] 0x05,0x00,0x0d,0xd7,0x69,0xd2,0x00,0x00 -# GFX11: v_add_nc_i16 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x0d,0xd7,0x6a,0xf6,0x00,0x00] +# W32-REAL16: v_add_nc_i16 v5.l, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x0d,0xd7,0x6a,0xf6,0x00,0x00] +# W32-FAKE16: v_add_nc_i16 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x0d,0xd7,0x6a,0xf6,0x00,0x00] +# W64-REAL16: v_add_nc_i16 v5.l, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x0d,0xd7,0x6a,0xf6,0x00,0x00] +# W64-FAKE16: v_add_nc_i16 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x0d,0xd7,0x6a,0xf6,0x00,0x00] 0x05,0x00,0x0d,0xd7,0x6a,0xf6,0x00,0x00 -# GFX11: v_add_nc_i16 v5, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x0d,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] +# W32-REAL16: v_add_nc_i16 v5.l, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x0d,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] +# W32-FAKE16: v_add_nc_i16 v5, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x0d,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] +# W64-REAL16: v_add_nc_i16 v5.l, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x0d,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] +# W64-FAKE16: v_add_nc_i16 v5, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x0d,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] 0x05,0x00,0x0d,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00 -# GFX11: v_add_nc_i16 v5, ttmp15, src_scc ; encoding: [0x05,0x00,0x0d,0xd7,0x7b,0xfa,0x01,0x00] +# W32-REAL16: v_add_nc_i16 v5.l, ttmp15, src_scc ; encoding: [0x05,0x00,0x0d,0xd7,0x7b,0xfa,0x01,0x00] +# W32-FAKE16: v_add_nc_i16 v5, ttmp15, src_scc ; encoding: [0x05,0x00,0x0d,0xd7,0x7b,0xfa,0x01,0x00] +# W64-REAL16: v_add_nc_i16 v5.l, ttmp15, src_scc ; encoding: [0x05,0x00,0x0d,0xd7,0x7b,0xfa,0x01,0x00] +# W64-FAKE16: v_add_nc_i16 v5, ttmp15, src_scc ; encoding: [0x05,0x00,0x0d,0xd7,0x7b,0xfa,0x01,0x00] 0x05,0x00,0x0d,0xd7,0x7b,0xfa,0x01,0x00 -# GFX11: v_add_nc_i16 v5, m0, 0x3800 +# W32-REAL16: v_add_nc_i16 v5.l, m0, 0x3800 ; encoding: [0x05,0x00,0x0d,0xd7,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00] +# W32-FAKE16: v_add_nc_i16 v5, m0, 0x3800 ; encoding: [0x05,0x00,0x0d,0xd7,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00] +# W64-REAL16: v_add_nc_i16 v5.l, m0, 0x3800 ; encoding: [0x05,0x00,0x0d,0xd7,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00] +# W64-FAKE16: v_add_nc_i16 v5, m0, 0x3800 ; encoding: [0x05,0x00,0x0d,0xd7,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00] 0x05,0x00,0x0d,0xd7,0x7d,0xe0,0x01,0x00 -# GFX11: v_add_nc_i16 v5, exec_lo, -1 ; encoding: [0x05,0x00,0x0d,0xd7,0x7e,0x82,0x01,0x00] +# W32-REAL16: v_add_nc_i16 v5.l, exec_lo, -1 ; encoding: [0x05,0x00,0x0d,0xd7,0x7e,0x82,0x01,0x00] +# W32-FAKE16: v_add_nc_i16 v5, exec_lo, -1 ; encoding: [0x05,0x00,0x0d,0xd7,0x7e,0x82,0x01,0x00] +# W64-REAL16: v_add_nc_i16 v5.l, exec_lo, -1 ; encoding: [0x05,0x00,0x0d,0xd7,0x7e,0x82,0x01,0x00] +# W64-FAKE16: v_add_nc_i16 v5, exec_lo, -1 ; encoding: [0x05,0x00,0x0d,0xd7,0x7e,0x82,0x01,0x00] 0x05,0x00,0x0d,0xd7,0x7e,0x82,0x01,0x00 -# GFX11: v_add_nc_i16 v5, exec_hi, null ; encoding: [0x05,0x00,0x0d,0xd7,0x7f,0xf8,0x00,0x00] +# W32-REAL16: v_add_nc_i16 v5.l, exec_hi, null ; encoding: [0x05,0x00,0x0d,0xd7,0x7f,0xf8,0x00,0x00] +# W32-FAKE16: v_add_nc_i16 v5, exec_hi, null ; encoding: [0x05,0x00,0x0d,0xd7,0x7f,0xf8,0x00,0x00] +# W64-REAL16: v_add_nc_i16 v5.l, exec_hi, null ; encoding: [0x05,0x00,0x0d,0xd7,0x7f,0xf8,0x00,0x00] +# W64-FAKE16: v_add_nc_i16 v5, exec_hi, null ; encoding: [0x05,0x00,0x0d,0xd7,0x7f,0xf8,0x00,0x00] 
0x05,0x00,0x0d,0xd7,0x7f,0xf8,0x00,0x00 -# GFX11: v_add_nc_i16 v5, null, exec_lo ; encoding: [0x05,0x00,0x0d,0xd7,0x7c,0xfc,0x00,0x00] +# W32-REAL16: v_add_nc_i16 v5.l, null, exec_lo ; encoding: [0x05,0x00,0x0d,0xd7,0x7c,0xfc,0x00,0x00] +# W32-FAKE16: v_add_nc_i16 v5, null, exec_lo ; encoding: [0x05,0x00,0x0d,0xd7,0x7c,0xfc,0x00,0x00] +# W64-REAL16: v_add_nc_i16 v5.l, null, exec_lo ; encoding: [0x05,0x00,0x0d,0xd7,0x7c,0xfc,0x00,0x00] +# W64-FAKE16: v_add_nc_i16 v5, null, exec_lo ; encoding: [0x05,0x00,0x0d,0xd7,0x7c,0xfc,0x00,0x00] 0x05,0x00,0x0d,0xd7,0x7c,0xfc,0x00,0x00 -# GFX11: v_add_nc_i16 v5, -1, exec_hi op_sel:[1,1,1] ; encoding: [0x05,0x58,0x0d,0xd7,0xc1,0xfe,0x00,0x00] -0x05,0x58,0x0d,0xd7,0xc1,0xfe,0x00,0x00 - -# GFX11: v_add_nc_i16 v5, 0x3800, m0 op_sel:[1,0,0] -0x05,0x08,0x0d,0xd7,0xf0,0xfa,0x00,0x00 - -# GFX11: v_add_nc_i16 v5, src_scc, vcc_lo op_sel:[0,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xfd,0xd4,0x00,0x00] -0x05,0x10,0x0d,0xd7,0xfd,0xd4,0x00,0x00 +# W32-REAL16: v_add_nc_i16 v5.l, -1, exec_hi ; encoding: [0x05,0x00,0x0d,0xd7,0xc1,0xfe,0x00,0x00] +# W32-FAKE16: v_add_nc_i16 v5, -1, exec_hi ; encoding: [0x05,0x00,0x0d,0xd7,0xc1,0xfe,0x00,0x00] +# W64-REAL16: v_add_nc_i16 v5.l, -1, exec_hi ; encoding: [0x05,0x00,0x0d,0xd7,0xc1,0xfe,0x00,0x00] +# W64-FAKE16: v_add_nc_i16 v5, -1, exec_hi ; encoding: [0x05,0x00,0x0d,0xd7,0xc1,0xfe,0x00,0x00] +0x05,0x00,0x0d,0xd7,0xc1,0xfe,0x00,0x00 + +# W32-REAL16: v_add_nc_i16 v5.l, 0x3800, m0 ; encoding: [0x05,0x00,0x0d,0xd7,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00] +# W32-FAKE16: v_add_nc_i16 v5, 0x3800, m0 ; encoding: [0x05,0x00,0x0d,0xd7,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00] +# W64-REAL16: v_add_nc_i16 v5.l, 0x3800, m0 ; encoding: [0x05,0x00,0x0d,0xd7,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00] +# W64-FAKE16: v_add_nc_i16 v5, 0x3800, m0 ; encoding: [0x05,0x00,0x0d,0xd7,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00] +0x05,0x00,0x0d,0xd7,0xf0,0xfa,0x00,0x00 + +# W32-REAL16: v_add_nc_i16 v5.l, src_scc, vcc_lo ; encoding: [0x05,0x00,0x0d,0xd7,0xfd,0xd4,0x00,0x00] +# W32-FAKE16: v_add_nc_i16 v5, src_scc, vcc_lo ; encoding: [0x05,0x00,0x0d,0xd7,0xfd,0xd4,0x00,0x00] +# W64-REAL16: v_add_nc_i16 v5.l, src_scc, vcc_lo ; encoding: [0x05,0x00,0x0d,0xd7,0xfd,0xd4,0x00,0x00] +# W64-FAKE16: v_add_nc_i16 v5, src_scc, vcc_lo ; encoding: [0x05,0x00,0x0d,0xd7,0xfd,0xd4,0x00,0x00] +0x05,0x00,0x0d,0xd7,0xfd,0xd4,0x00,0x00 + +# W32-REAL16: v_add_nc_i16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x0d,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +# W32-FAKE16: v_add_nc_i16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x0d,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +# W64-REAL16: v_add_nc_i16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x0d,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +# W64-FAKE16: v_add_nc_i16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x0d,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +0xff,0xc0,0x0d,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00 -# GFX11: v_add_nc_i16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x0d,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +# W32-REAL16: v_add_nc_i16 v5.l, v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x05,0x08,0x0d,0xd7,0x01,0x05,0x02,0x00] +# W32-FAKE16: v_add_nc_i16 v5, v1, v2 op_sel:[1,0,0] ; encoding: [0x05,0x08,0x0d,0xd7,0x01,0x05,0x02,0x00] +# W64-REAL16: v_add_nc_i16 v5.l, v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x05,0x08,0x0d,0xd7,0x01,0x05,0x02,0x00] +# W64-FAKE16: v_add_nc_i16 v5, v1, v2 
op_sel:[1,0,0] ; encoding: [0x05,0x08,0x0d,0xd7,0x01,0x05,0x02,0x00] +0x05,0x08,0x0d,0xd7,0x01,0x05,0x02,0x00 + +# W32-REAL16: v_add_nc_i16 v5.l, v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xff,0xff,0x03,0x00] +# W32-FAKE16: v_add_nc_i16 v5, v255, v255 op_sel:[0,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xff,0xff,0x03,0x00] +# W64-REAL16: v_add_nc_i16 v5.l, v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xff,0xff,0x03,0x00] +# W64-FAKE16: v_add_nc_i16 v5, v255, v255 op_sel:[0,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xff,0xff,0x03,0x00] +0x05,0x10,0x0d,0xd7,0xff,0xff,0x03,0x00 + +# W32-REAL16: v_add_nc_i16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x0d,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +# W32-FAKE16: v_add_nc_i16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x0d,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +# W64-REAL16: v_add_nc_i16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x0d,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +# W64-FAKE16: v_add_nc_i16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x0d,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] 0xff,0xc0,0x0d,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00 # GFX11: v_add_nc_i32 v5, v1, v2 ; encoding: [0x05,0x00,0x26,0xd7,0x01,0x05,0x02,0x00] @@ -279,49 +342,112 @@ # GFX11: v_add_nc_i32 v255, 0xaf123456, vcc_hi clamp ; encoding: [0xff,0x80,0x26,0xd7,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf] 0xff,0x80,0x26,0xd7,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf -# GFX11: v_add_nc_u16 v5, v1, v2 ; encoding: [0x05,0x00,0x03,0xd7,0x01,0x05,0x02,0x00] -0x05,0x00,0x03,0xd7,0x01,0x05,0x02,0x00 - -# GFX11: v_add_nc_u16 v5, v255, v255 ; encoding: [0x05,0x00,0x03,0xd7,0xff,0xff,0x03,0x00] -0x05,0x00,0x03,0xd7,0xff,0xff,0x03,0x00 - -# GFX11: v_add_nc_u16 v5, s1, s2 ; encoding: [0x05,0x00,0x03,0xd7,0x01,0x04,0x00,0x00] +# W32-REAL16: v_add_nc_u16 v5.l, v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x05,0x08,0x03,0xd7,0x01,0x05,0x02,0x00] +# W32-FAKE16: v_add_nc_u16 v5, v1, v2 op_sel:[1,0,0] ; encoding: [0x05,0x08,0x03,0xd7,0x01,0x05,0x02,0x00] +# W64-REAL16: v_add_nc_u16 v5.l, v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x05,0x08,0x03,0xd7,0x01,0x05,0x02,0x00] +# W64-FAKE16: v_add_nc_u16 v5, v1, v2 op_sel:[1,0,0] ; encoding: [0x05,0x08,0x03,0xd7,0x01,0x05,0x02,0x00] +0x05,0x08,0x03,0xd7,0x01,0x05,0x02,0x00 + +# W32-REAL16: v_add_nc_u16 v5.l, v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xff,0xff,0x03,0x00] +# W32-FAKE16: v_add_nc_u16 v5, v255, v255 op_sel:[0,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xff,0xff,0x03,0x00] +# W64-REAL16: v_add_nc_u16 v5.l, v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xff,0xff,0x03,0x00] +# W64-FAKE16: v_add_nc_u16 v5, v255, v255 op_sel:[0,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xff,0xff,0x03,0x00] +0x05,0x10,0x03,0xd7,0xff,0xff,0x03,0x00 + +# W32-REAL16: v_add_nc_u16 v5.l, s1, s2 ; encoding: [0x05,0x00,0x03,0xd7,0x01,0x04,0x00,0x00] +# W32-FAKE16: v_add_nc_u16 v5, s1, s2 ; encoding: [0x05,0x00,0x03,0xd7,0x01,0x04,0x00,0x00] +# W64-REAL16: v_add_nc_u16 v5.l, s1, s2 ; encoding: [0x05,0x00,0x03,0xd7,0x01,0x04,0x00,0x00] +# W64-FAKE16: v_add_nc_u16 v5, s1, s2 ; encoding: [0x05,0x00,0x03,0xd7,0x01,0x04,0x00,0x00] 0x05,0x00,0x03,0xd7,0x01,0x04,0x00,0x00 -# GFX11: v_add_nc_u16 v5, s105, s105 ; encoding: [0x05,0x00,0x03,0xd7,0x69,0xd2,0x00,0x00] +# W32-REAL16: v_add_nc_u16 v5.l, s105, s105 ; encoding: [0x05,0x00,0x03,0xd7,0x69,0xd2,0x00,0x00] +# W32-FAKE16: v_add_nc_u16 v5, s105, s105 ; 
encoding: [0x05,0x00,0x03,0xd7,0x69,0xd2,0x00,0x00] +# W64-REAL16: v_add_nc_u16 v5.l, s105, s105 ; encoding: [0x05,0x00,0x03,0xd7,0x69,0xd2,0x00,0x00] +# W64-FAKE16: v_add_nc_u16 v5, s105, s105 ; encoding: [0x05,0x00,0x03,0xd7,0x69,0xd2,0x00,0x00] 0x05,0x00,0x03,0xd7,0x69,0xd2,0x00,0x00 -# GFX11: v_add_nc_u16 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x03,0xd7,0x6a,0xf6,0x00,0x00] +# W32-REAL16: v_add_nc_u16 v5.l, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x03,0xd7,0x6a,0xf6,0x00,0x00] +# W32-FAKE16: v_add_nc_u16 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x03,0xd7,0x6a,0xf6,0x00,0x00] +# W64-REAL16: v_add_nc_u16 v5.l, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x03,0xd7,0x6a,0xf6,0x00,0x00] +# W64-FAKE16: v_add_nc_u16 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x03,0xd7,0x6a,0xf6,0x00,0x00] 0x05,0x00,0x03,0xd7,0x6a,0xf6,0x00,0x00 -# GFX11: v_add_nc_u16 v5, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x03,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] +# W32-REAL16: v_add_nc_u16 v5.l, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x03,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] +# W32-FAKE16: v_add_nc_u16 v5, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x03,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] +# W64-REAL16: v_add_nc_u16 v5.l, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x03,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] +# W64-FAKE16: v_add_nc_u16 v5, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x03,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] 0x05,0x00,0x03,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00 -# GFX11: v_add_nc_u16 v5, ttmp15, src_scc ; encoding: [0x05,0x00,0x03,0xd7,0x7b,0xfa,0x01,0x00] +# W32-REAL16: v_add_nc_u16 v5.l, ttmp15, src_scc ; encoding: [0x05,0x00,0x03,0xd7,0x7b,0xfa,0x01,0x00] +# W32-FAKE16: v_add_nc_u16 v5, ttmp15, src_scc ; encoding: [0x05,0x00,0x03,0xd7,0x7b,0xfa,0x01,0x00] +# W64-REAL16: v_add_nc_u16 v5.l, ttmp15, src_scc ; encoding: [0x05,0x00,0x03,0xd7,0x7b,0xfa,0x01,0x00] +# W64-FAKE16: v_add_nc_u16 v5, ttmp15, src_scc ; encoding: [0x05,0x00,0x03,0xd7,0x7b,0xfa,0x01,0x00] 0x05,0x00,0x03,0xd7,0x7b,0xfa,0x01,0x00 -# GFX11: v_add_nc_u16 v5, m0, 0x3800 +# W32-REAL16: v_add_nc_u16 v5.l, m0, 0x3800 ; encoding: [0x05,0x00,0x03,0xd7,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00] +# W32-FAKE16: v_add_nc_u16 v5, m0, 0x3800 ; encoding: [0x05,0x00,0x03,0xd7,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00] +# W64-REAL16: v_add_nc_u16 v5.l, m0, 0x3800 ; encoding: [0x05,0x00,0x03,0xd7,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00] +# W64-FAKE16: v_add_nc_u16 v5, m0, 0x3800 ; encoding: [0x05,0x00,0x03,0xd7,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00] 0x05,0x00,0x03,0xd7,0x7d,0xe0,0x01,0x00 -# GFX11: v_add_nc_u16 v5, exec_lo, -1 ; encoding: [0x05,0x00,0x03,0xd7,0x7e,0x82,0x01,0x00] +# W32-REAL16: v_add_nc_u16 v5.l, exec_lo, -1 ; encoding: [0x05,0x00,0x03,0xd7,0x7e,0x82,0x01,0x00] +# W32-FAKE16: v_add_nc_u16 v5, exec_lo, -1 ; encoding: [0x05,0x00,0x03,0xd7,0x7e,0x82,0x01,0x00] +# W64-REAL16: v_add_nc_u16 v5.l, exec_lo, -1 ; encoding: [0x05,0x00,0x03,0xd7,0x7e,0x82,0x01,0x00] +# W64-FAKE16: v_add_nc_u16 v5, exec_lo, -1 ; encoding: [0x05,0x00,0x03,0xd7,0x7e,0x82,0x01,0x00] 0x05,0x00,0x03,0xd7,0x7e,0x82,0x01,0x00 -# GFX11: v_add_nc_u16 v5, exec_hi, null ; encoding: [0x05,0x00,0x03,0xd7,0x7f,0xf8,0x00,0x00] +# W32-REAL16: v_add_nc_u16 v5.l, exec_hi, null ; encoding: [0x05,0x00,0x03,0xd7,0x7f,0xf8,0x00,0x00] +# W32-FAKE16: v_add_nc_u16 v5, exec_hi, null ; encoding: [0x05,0x00,0x03,0xd7,0x7f,0xf8,0x00,0x00] +# W64-REAL16: v_add_nc_u16 v5.l, exec_hi, null ; encoding: [0x05,0x00,0x03,0xd7,0x7f,0xf8,0x00,0x00] +# W64-FAKE16: v_add_nc_u16 v5, 
exec_hi, null ; encoding: [0x05,0x00,0x03,0xd7,0x7f,0xf8,0x00,0x00] 0x05,0x00,0x03,0xd7,0x7f,0xf8,0x00,0x00 -# GFX11: v_add_nc_u16 v5, null, exec_lo ; encoding: [0x05,0x00,0x03,0xd7,0x7c,0xfc,0x00,0x00] +# W32-REAL16: v_add_nc_u16 v5.l, null, exec_lo ; encoding: [0x05,0x00,0x03,0xd7,0x7c,0xfc,0x00,0x00] +# W32-FAKE16: v_add_nc_u16 v5, null, exec_lo ; encoding: [0x05,0x00,0x03,0xd7,0x7c,0xfc,0x00,0x00] +# W64-REAL16: v_add_nc_u16 v5.l, null, exec_lo ; encoding: [0x05,0x00,0x03,0xd7,0x7c,0xfc,0x00,0x00] +# W64-FAKE16: v_add_nc_u16 v5, null, exec_lo ; encoding: [0x05,0x00,0x03,0xd7,0x7c,0xfc,0x00,0x00] 0x05,0x00,0x03,0xd7,0x7c,0xfc,0x00,0x00 -# GFX11: v_add_nc_u16 v5, -1, exec_hi op_sel:[1,1,1] ; encoding: [0x05,0x58,0x03,0xd7,0xc1,0xfe,0x00,0x00] -0x05,0x58,0x03,0xd7,0xc1,0xfe,0x00,0x00 - -# GFX11: v_add_nc_u16 v5, 0x3800, m0 op_sel:[1,0,0] -0x05,0x08,0x03,0xd7,0xf0,0xfa,0x00,0x00 - -# GFX11: v_add_nc_u16 v5, src_scc, vcc_lo op_sel:[0,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xfd,0xd4,0x00,0x00] -0x05,0x10,0x03,0xd7,0xfd,0xd4,0x00,0x00 +# W32-REAL16: v_add_nc_u16 v5.l, -1, exec_hi ; encoding: [0x05,0x00,0x03,0xd7,0xc1,0xfe,0x00,0x00] +# W32-FAKE16: v_add_nc_u16 v5, -1, exec_hi ; encoding: [0x05,0x00,0x03,0xd7,0xc1,0xfe,0x00,0x00] +# W64-REAL16: v_add_nc_u16 v5.l, -1, exec_hi ; encoding: [0x05,0x00,0x03,0xd7,0xc1,0xfe,0x00,0x00] +# W64-FAKE16: v_add_nc_u16 v5, -1, exec_hi ; encoding: [0x05,0x00,0x03,0xd7,0xc1,0xfe,0x00,0x00] +0x05,0x00,0x03,0xd7,0xc1,0xfe,0x00,0x00 + +# W32-REAL16: v_add_nc_u16 v5.l, 0x3800, m0 ; encoding: [0x05,0x00,0x03,0xd7,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00] +# W32-FAKE16: v_add_nc_u16 v5, 0x3800, m0 ; encoding: [0x05,0x00,0x03,0xd7,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00] +# W64-REAL16: v_add_nc_u16 v5.l, 0x3800, m0 ; encoding: [0x05,0x00,0x03,0xd7,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00] +# W64-FAKE16: v_add_nc_u16 v5, 0x3800, m0 ; encoding: [0x05,0x00,0x03,0xd7,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00] +0x05,0x00,0x03,0xd7,0xf0,0xfa,0x00,0x00 + +# W32-REAL16: v_add_nc_u16 v5.l, src_scc, vcc_lo ; encoding: [0x05,0x00,0x03,0xd7,0xfd,0xd4,0x00,0x00] +# W32-FAKE16: v_add_nc_u16 v5, src_scc, vcc_lo ; encoding: [0x05,0x00,0x03,0xd7,0xfd,0xd4,0x00,0x00] +# W64-REAL16: v_add_nc_u16 v5.l, src_scc, vcc_lo ; encoding: [0x05,0x00,0x03,0xd7,0xfd,0xd4,0x00,0x00] +# W64-FAKE16: v_add_nc_u16 v5, src_scc, vcc_lo ; encoding: [0x05,0x00,0x03,0xd7,0xfd,0xd4,0x00,0x00] +0x05,0x00,0x03,0xd7,0xfd,0xd4,0x00,0x00 + +# W32-REAL16: v_add_nc_u16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x03,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +# W32-FAKE16: v_add_nc_u16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x03,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +# W64-REAL16: v_add_nc_u16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x03,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +# W64-FAKE16: v_add_nc_u16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x03,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +0xff,0xc0,0x03,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00 -# GFX11: v_add_nc_u16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x03,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +# W32-REAL16: v_add_nc_u16 v5.l, v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x05,0x08,0x03,0xd7,0x01,0x05,0x02,0x00] +# W32-FAKE16: v_add_nc_u16 v5, v1, v2 op_sel:[1,0,0] ; encoding: [0x05,0x08,0x03,0xd7,0x01,0x05,0x02,0x00] +# W64-REAL16: v_add_nc_u16 v5.l, v1.h, v2.l op_sel:[1,0,0] ; encoding: 
[0x05,0x08,0x03,0xd7,0x01,0x05,0x02,0x00] +# W64-FAKE16: v_add_nc_u16 v5, v1, v2 op_sel:[1,0,0] ; encoding: [0x05,0x08,0x03,0xd7,0x01,0x05,0x02,0x00] +0x05,0x08,0x03,0xd7,0x01,0x05,0x02,0x00 + +# W32-REAL16: v_add_nc_u16 v5.l, v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xff,0xff,0x03,0x00] +# W32-FAKE16: v_add_nc_u16 v5, v255, v255 op_sel:[0,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xff,0xff,0x03,0x00] +# W64-REAL16: v_add_nc_u16 v5.l, v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xff,0xff,0x03,0x00] +# W64-FAKE16: v_add_nc_u16 v5, v255, v255 op_sel:[0,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xff,0xff,0x03,0x00] +0x05,0x10,0x03,0xd7,0xff,0xff,0x03,0x00 + +# W32-REAL16: v_add_nc_u16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x03,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +# W32-FAKE16: v_add_nc_u16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x03,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +# W64-REAL16: v_add_nc_u16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x03,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +# W64-FAKE16: v_add_nc_u16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x03,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] 0xff,0xc0,0x03,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00 # GFX11: v_alignbit_b32 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x16,0xd6,0x01,0x05,0x0e,0x00] @@ -5871,49 +5997,112 @@ # GFX11: v_sub_co_u32 v255, null, 0xaf123456, vcc_hi clamp ; encoding: [0xff,0xfc,0x01,0xd7,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf] 0xff,0xfc,0x01,0xd7,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf -# GFX11: v_sub_nc_i16 v5, v1, v2 ; encoding: [0x05,0x00,0x0e,0xd7,0x01,0x05,0x02,0x00] -0x05,0x00,0x0e,0xd7,0x01,0x05,0x02,0x00 - -# GFX11: v_sub_nc_i16 v5, v255, v255 ; encoding: [0x05,0x00,0x0e,0xd7,0xff,0xff,0x03,0x00] -0x05,0x00,0x0e,0xd7,0xff,0xff,0x03,0x00 - -# GFX11: v_sub_nc_i16 v5, s1, s2 ; encoding: [0x05,0x00,0x0e,0xd7,0x01,0x04,0x00,0x00] +# W32-REAL16: v_sub_nc_i16 v5.l, v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x05,0x08,0x0e,0xd7,0x01,0x05,0x02,0x00] +# W32-FAKE16: v_sub_nc_i16 v5, v1, v2 op_sel:[1,0,0] ; encoding: [0x05,0x08,0x0e,0xd7,0x01,0x05,0x02,0x00] +# W64-REAL16: v_sub_nc_i16 v5.l, v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x05,0x08,0x0e,0xd7,0x01,0x05,0x02,0x00] +# W64-FAKE16: v_sub_nc_i16 v5, v1, v2 op_sel:[1,0,0] ; encoding: [0x05,0x08,0x0e,0xd7,0x01,0x05,0x02,0x00] +0x05,0x08,0x0e,0xd7,0x01,0x05,0x02,0x00 + +# W32-REAL16: v_sub_nc_i16 v5.l, v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xff,0xff,0x03,0x00] +# W32-FAKE16: v_sub_nc_i16 v5, v255, v255 op_sel:[0,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xff,0xff,0x03,0x00] +# W64-REAL16: v_sub_nc_i16 v5.l, v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xff,0xff,0x03,0x00] +# W64-FAKE16: v_sub_nc_i16 v5, v255, v255 op_sel:[0,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xff,0xff,0x03,0x00] +0x05,0x10,0x0e,0xd7,0xff,0xff,0x03,0x00 + +# W32-REAL16: v_sub_nc_i16 v5.l, s1, s2 ; encoding: [0x05,0x00,0x0e,0xd7,0x01,0x04,0x00,0x00] +# W32-FAKE16: v_sub_nc_i16 v5, s1, s2 ; encoding: [0x05,0x00,0x0e,0xd7,0x01,0x04,0x00,0x00] +# W64-REAL16: v_sub_nc_i16 v5.l, s1, s2 ; encoding: [0x05,0x00,0x0e,0xd7,0x01,0x04,0x00,0x00] +# W64-FAKE16: v_sub_nc_i16 v5, s1, s2 ; encoding: [0x05,0x00,0x0e,0xd7,0x01,0x04,0x00,0x00] 0x05,0x00,0x0e,0xd7,0x01,0x04,0x00,0x00 -# GFX11: v_sub_nc_i16 v5, s105, s105 ; encoding: [0x05,0x00,0x0e,0xd7,0x69,0xd2,0x00,0x00] +# W32-REAL16: v_sub_nc_i16 v5.l, s105, s105 ; 
encoding: [0x05,0x00,0x0e,0xd7,0x69,0xd2,0x00,0x00] +# W32-FAKE16: v_sub_nc_i16 v5, s105, s105 ; encoding: [0x05,0x00,0x0e,0xd7,0x69,0xd2,0x00,0x00] +# W64-REAL16: v_sub_nc_i16 v5.l, s105, s105 ; encoding: [0x05,0x00,0x0e,0xd7,0x69,0xd2,0x00,0x00] +# W64-FAKE16: v_sub_nc_i16 v5, s105, s105 ; encoding: [0x05,0x00,0x0e,0xd7,0x69,0xd2,0x00,0x00] 0x05,0x00,0x0e,0xd7,0x69,0xd2,0x00,0x00 -# GFX11: v_sub_nc_i16 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x0e,0xd7,0x6a,0xf6,0x00,0x00] +# W32-REAL16: v_sub_nc_i16 v5.l, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x0e,0xd7,0x6a,0xf6,0x00,0x00] +# W32-FAKE16: v_sub_nc_i16 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x0e,0xd7,0x6a,0xf6,0x00,0x00] +# W64-REAL16: v_sub_nc_i16 v5.l, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x0e,0xd7,0x6a,0xf6,0x00,0x00] +# W64-FAKE16: v_sub_nc_i16 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x0e,0xd7,0x6a,0xf6,0x00,0x00] 0x05,0x00,0x0e,0xd7,0x6a,0xf6,0x00,0x00 -# GFX11: v_sub_nc_i16 v5, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x0e,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] +# W32-REAL16: v_sub_nc_i16 v5.l, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x0e,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] +# W32-FAKE16: v_sub_nc_i16 v5, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x0e,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] +# W64-REAL16: v_sub_nc_i16 v5.l, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x0e,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] +# W64-FAKE16: v_sub_nc_i16 v5, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x0e,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] 0x05,0x00,0x0e,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00 -# GFX11: v_sub_nc_i16 v5, ttmp15, src_scc ; encoding: [0x05,0x00,0x0e,0xd7,0x7b,0xfa,0x01,0x00] +# W32-REAL16: v_sub_nc_i16 v5.l, ttmp15, src_scc ; encoding: [0x05,0x00,0x0e,0xd7,0x7b,0xfa,0x01,0x00] +# W32-FAKE16: v_sub_nc_i16 v5, ttmp15, src_scc ; encoding: [0x05,0x00,0x0e,0xd7,0x7b,0xfa,0x01,0x00] +# W64-REAL16: v_sub_nc_i16 v5.l, ttmp15, src_scc ; encoding: [0x05,0x00,0x0e,0xd7,0x7b,0xfa,0x01,0x00] +# W64-FAKE16: v_sub_nc_i16 v5, ttmp15, src_scc ; encoding: [0x05,0x00,0x0e,0xd7,0x7b,0xfa,0x01,0x00] 0x05,0x00,0x0e,0xd7,0x7b,0xfa,0x01,0x00 -# GFX11: v_sub_nc_i16 v5, m0, 0x3800 +# W32-REAL16: v_sub_nc_i16 v5.l, m0, 0x3800 ; encoding: [0x05,0x00,0x0e,0xd7,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00] +# W32-FAKE16: v_sub_nc_i16 v5, m0, 0x3800 ; encoding: [0x05,0x00,0x0e,0xd7,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00] +# W64-REAL16: v_sub_nc_i16 v5.l, m0, 0x3800 ; encoding: [0x05,0x00,0x0e,0xd7,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00] +# W64-FAKE16: v_sub_nc_i16 v5, m0, 0x3800 ; encoding: [0x05,0x00,0x0e,0xd7,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00] 0x05,0x00,0x0e,0xd7,0x7d,0xe0,0x01,0x00 -# GFX11: v_sub_nc_i16 v5, exec_lo, -1 ; encoding: [0x05,0x00,0x0e,0xd7,0x7e,0x82,0x01,0x00] +# W32-REAL16: v_sub_nc_i16 v5.l, exec_lo, -1 ; encoding: [0x05,0x00,0x0e,0xd7,0x7e,0x82,0x01,0x00] +# W32-FAKE16: v_sub_nc_i16 v5, exec_lo, -1 ; encoding: [0x05,0x00,0x0e,0xd7,0x7e,0x82,0x01,0x00] +# W64-REAL16: v_sub_nc_i16 v5.l, exec_lo, -1 ; encoding: [0x05,0x00,0x0e,0xd7,0x7e,0x82,0x01,0x00] +# W64-FAKE16: v_sub_nc_i16 v5, exec_lo, -1 ; encoding: [0x05,0x00,0x0e,0xd7,0x7e,0x82,0x01,0x00] 0x05,0x00,0x0e,0xd7,0x7e,0x82,0x01,0x00 -# GFX11: v_sub_nc_i16 v5, exec_hi, null ; encoding: [0x05,0x00,0x0e,0xd7,0x7f,0xf8,0x00,0x00] +# W32-REAL16: v_sub_nc_i16 v5.l, exec_hi, null ; encoding: [0x05,0x00,0x0e,0xd7,0x7f,0xf8,0x00,0x00] +# W32-FAKE16: v_sub_nc_i16 v5, exec_hi, null ; encoding: [0x05,0x00,0x0e,0xd7,0x7f,0xf8,0x00,0x00] +# W64-REAL16: v_sub_nc_i16 v5.l, 
exec_hi, null ; encoding: [0x05,0x00,0x0e,0xd7,0x7f,0xf8,0x00,0x00] +# W64-FAKE16: v_sub_nc_i16 v5, exec_hi, null ; encoding: [0x05,0x00,0x0e,0xd7,0x7f,0xf8,0x00,0x00] 0x05,0x00,0x0e,0xd7,0x7f,0xf8,0x00,0x00 -# GFX11: v_sub_nc_i16 v5, null, exec_lo ; encoding: [0x05,0x00,0x0e,0xd7,0x7c,0xfc,0x00,0x00] +# W32-REAL16: v_sub_nc_i16 v5.l, null, exec_lo ; encoding: [0x05,0x00,0x0e,0xd7,0x7c,0xfc,0x00,0x00] +# W32-FAKE16: v_sub_nc_i16 v5, null, exec_lo ; encoding: [0x05,0x00,0x0e,0xd7,0x7c,0xfc,0x00,0x00] +# W64-REAL16: v_sub_nc_i16 v5.l, null, exec_lo ; encoding: [0x05,0x00,0x0e,0xd7,0x7c,0xfc,0x00,0x00] +# W64-FAKE16: v_sub_nc_i16 v5, null, exec_lo ; encoding: [0x05,0x00,0x0e,0xd7,0x7c,0xfc,0x00,0x00] 0x05,0x00,0x0e,0xd7,0x7c,0xfc,0x00,0x00 -# GFX11: v_sub_nc_i16 v5, -1, exec_hi op_sel:[1,1,1] ; encoding: [0x05,0x58,0x0e,0xd7,0xc1,0xfe,0x00,0x00] -0x05,0x58,0x0e,0xd7,0xc1,0xfe,0x00,0x00 - -# GFX11: v_sub_nc_i16 v5, 0x3800, m0 op_sel:[1,0,0] -0x05,0x08,0x0e,0xd7,0xf0,0xfa,0x00,0x00 - -# GFX11: v_sub_nc_i16 v5, src_scc, vcc_lo op_sel:[0,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xfd,0xd4,0x00,0x00] -0x05,0x10,0x0e,0xd7,0xfd,0xd4,0x00,0x00 +# W32-REAL16: v_sub_nc_i16 v5.l, -1, exec_hi ; encoding: [0x05,0x00,0x0e,0xd7,0xc1,0xfe,0x00,0x00] +# W32-FAKE16: v_sub_nc_i16 v5, -1, exec_hi ; encoding: [0x05,0x00,0x0e,0xd7,0xc1,0xfe,0x00,0x00] +# W64-REAL16: v_sub_nc_i16 v5.l, -1, exec_hi ; encoding: [0x05,0x00,0x0e,0xd7,0xc1,0xfe,0x00,0x00] +# W64-FAKE16: v_sub_nc_i16 v5, -1, exec_hi ; encoding: [0x05,0x00,0x0e,0xd7,0xc1,0xfe,0x00,0x00] +0x05,0x00,0x0e,0xd7,0xc1,0xfe,0x00,0x00 + +# W32-REAL16: v_sub_nc_i16 v5.l, 0x3800, m0 ; encoding: [0x05,0x00,0x0e,0xd7,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00] +# W32-FAKE16: v_sub_nc_i16 v5, 0x3800, m0 ; encoding: [0x05,0x00,0x0e,0xd7,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00] +# W64-REAL16: v_sub_nc_i16 v5.l, 0x3800, m0 ; encoding: [0x05,0x00,0x0e,0xd7,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00] +# W64-FAKE16: v_sub_nc_i16 v5, 0x3800, m0 ; encoding: [0x05,0x00,0x0e,0xd7,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00] +0x05,0x00,0x0e,0xd7,0xf0,0xfa,0x00,0x00 + +# W32-REAL16: v_sub_nc_i16 v5.l, src_scc, vcc_lo ; encoding: [0x05,0x00,0x0e,0xd7,0xfd,0xd4,0x00,0x00] +# W32-FAKE16: v_sub_nc_i16 v5, src_scc, vcc_lo ; encoding: [0x05,0x00,0x0e,0xd7,0xfd,0xd4,0x00,0x00] +# W64-REAL16: v_sub_nc_i16 v5.l, src_scc, vcc_lo ; encoding: [0x05,0x00,0x0e,0xd7,0xfd,0xd4,0x00,0x00] +# W64-FAKE16: v_sub_nc_i16 v5, src_scc, vcc_lo ; encoding: [0x05,0x00,0x0e,0xd7,0xfd,0xd4,0x00,0x00] +0x05,0x00,0x0e,0xd7,0xfd,0xd4,0x00,0x00 + +# W32-REAL16: v_sub_nc_i16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x0e,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +# W32-FAKE16: v_sub_nc_i16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x0e,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +# W64-REAL16: v_sub_nc_i16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x0e,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +# W64-FAKE16: v_sub_nc_i16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x0e,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +0xff,0xc0,0x0e,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00 -# GFX11: v_sub_nc_i16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x0e,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +# W32-REAL16: v_sub_nc_i16 v5.l, v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x05,0x08,0x0e,0xd7,0x01,0x05,0x02,0x00] +# W32-FAKE16: v_sub_nc_i16 v5, v1, v2 op_sel:[1,0,0] ; encoding: 
[0x05,0x08,0x0e,0xd7,0x01,0x05,0x02,0x00] +# W64-REAL16: v_sub_nc_i16 v5.l, v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x05,0x08,0x0e,0xd7,0x01,0x05,0x02,0x00] +# W64-FAKE16: v_sub_nc_i16 v5, v1, v2 op_sel:[1,0,0] ; encoding: [0x05,0x08,0x0e,0xd7,0x01,0x05,0x02,0x00] +0x05,0x08,0x0e,0xd7,0x01,0x05,0x02,0x00 + +# W32-REAL16: v_sub_nc_i16 v5.l, v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xff,0xff,0x03,0x00] +# W32-FAKE16: v_sub_nc_i16 v5, v255, v255 op_sel:[0,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xff,0xff,0x03,0x00] +# W64-REAL16: v_sub_nc_i16 v5.l, v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xff,0xff,0x03,0x00] +# W64-FAKE16: v_sub_nc_i16 v5, v255, v255 op_sel:[0,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xff,0xff,0x03,0x00] +0x05,0x10,0x0e,0xd7,0xff,0xff,0x03,0x00 + +# W32-REAL16: v_sub_nc_i16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x0e,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +# W32-FAKE16: v_sub_nc_i16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x0e,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +# W64-REAL16: v_sub_nc_i16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x0e,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +# W64-FAKE16: v_sub_nc_i16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x0e,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] 0xff,0xc0,0x0e,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00 # GFX11: v_sub_nc_i32 v5, v1, v2 ; encoding: [0x05,0x00,0x25,0xd7,0x01,0x05,0x02,0x00] @@ -5961,49 +6150,112 @@ # GFX11: v_sub_nc_i32 v255, 0xaf123456, vcc_hi clamp ; encoding: [0xff,0x80,0x25,0xd7,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf] 0xff,0x80,0x25,0xd7,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf -# GFX11: v_sub_nc_u16 v5, v1, v2 ; encoding: [0x05,0x00,0x04,0xd7,0x01,0x05,0x02,0x00] -0x05,0x00,0x04,0xd7,0x01,0x05,0x02,0x00 - -# GFX11: v_sub_nc_u16 v5, v255, v255 ; encoding: [0x05,0x00,0x04,0xd7,0xff,0xff,0x03,0x00] -0x05,0x00,0x04,0xd7,0xff,0xff,0x03,0x00 - -# GFX11: v_sub_nc_u16 v5, s1, s2 ; encoding: [0x05,0x00,0x04,0xd7,0x01,0x04,0x00,0x00] +# W32-REAL16: v_sub_nc_u16 v5.l, v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x05,0x08,0x04,0xd7,0x01,0x05,0x02,0x00] +# W32-FAKE16: v_sub_nc_u16 v5, v1, v2 op_sel:[1,0,0] ; encoding: [0x05,0x08,0x04,0xd7,0x01,0x05,0x02,0x00] +# W64-REAL16: v_sub_nc_u16 v5.l, v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x05,0x08,0x04,0xd7,0x01,0x05,0x02,0x00] +# W64-FAKE16: v_sub_nc_u16 v5, v1, v2 op_sel:[1,0,0] ; encoding: [0x05,0x08,0x04,0xd7,0x01,0x05,0x02,0x00] +0x05,0x08,0x04,0xd7,0x01,0x05,0x02,0x00 + +# W32-REAL16: v_sub_nc_u16 v5.l, v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xff,0xff,0x03,0x00] +# W32-FAKE16: v_sub_nc_u16 v5, v255, v255 op_sel:[0,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xff,0xff,0x03,0x00] +# W64-REAL16: v_sub_nc_u16 v5.l, v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xff,0xff,0x03,0x00] +# W64-FAKE16: v_sub_nc_u16 v5, v255, v255 op_sel:[0,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xff,0xff,0x03,0x00] +0x05,0x10,0x04,0xd7,0xff,0xff,0x03,0x00 + +# W32-REAL16: v_sub_nc_u16 v5.l, s1, s2 ; encoding: [0x05,0x00,0x04,0xd7,0x01,0x04,0x00,0x00] +# W32-FAKE16: v_sub_nc_u16 v5, s1, s2 ; encoding: [0x05,0x00,0x04,0xd7,0x01,0x04,0x00,0x00] +# W64-REAL16: v_sub_nc_u16 v5.l, s1, s2 ; encoding: [0x05,0x00,0x04,0xd7,0x01,0x04,0x00,0x00] +# W64-FAKE16: v_sub_nc_u16 v5, s1, s2 ; encoding: [0x05,0x00,0x04,0xd7,0x01,0x04,0x00,0x00] 0x05,0x00,0x04,0xd7,0x01,0x04,0x00,0x00 -# GFX11: v_sub_nc_u16 v5, s105, s105 ; 
encoding: [0x05,0x00,0x04,0xd7,0x69,0xd2,0x00,0x00] +# W32-REAL16: v_sub_nc_u16 v5.l, s105, s105 ; encoding: [0x05,0x00,0x04,0xd7,0x69,0xd2,0x00,0x00] +# W32-FAKE16: v_sub_nc_u16 v5, s105, s105 ; encoding: [0x05,0x00,0x04,0xd7,0x69,0xd2,0x00,0x00] +# W64-REAL16: v_sub_nc_u16 v5.l, s105, s105 ; encoding: [0x05,0x00,0x04,0xd7,0x69,0xd2,0x00,0x00] +# W64-FAKE16: v_sub_nc_u16 v5, s105, s105 ; encoding: [0x05,0x00,0x04,0xd7,0x69,0xd2,0x00,0x00] 0x05,0x00,0x04,0xd7,0x69,0xd2,0x00,0x00 -# GFX11: v_sub_nc_u16 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x04,0xd7,0x6a,0xf6,0x00,0x00] +# W32-REAL16: v_sub_nc_u16 v5.l, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x04,0xd7,0x6a,0xf6,0x00,0x00] +# W32-FAKE16: v_sub_nc_u16 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x04,0xd7,0x6a,0xf6,0x00,0x00] +# W64-REAL16: v_sub_nc_u16 v5.l, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x04,0xd7,0x6a,0xf6,0x00,0x00] +# W64-FAKE16: v_sub_nc_u16 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x04,0xd7,0x6a,0xf6,0x00,0x00] 0x05,0x00,0x04,0xd7,0x6a,0xf6,0x00,0x00 -# GFX11: v_sub_nc_u16 v5, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x04,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] +# W32-REAL16: v_sub_nc_u16 v5.l, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x04,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] +# W32-FAKE16: v_sub_nc_u16 v5, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x04,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] +# W64-REAL16: v_sub_nc_u16 v5.l, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x04,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] +# W64-FAKE16: v_sub_nc_u16 v5, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x04,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] 0x05,0x00,0x04,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00 -# GFX11: v_sub_nc_u16 v5, ttmp15, src_scc ; encoding: [0x05,0x00,0x04,0xd7,0x7b,0xfa,0x01,0x00] +# W32-REAL16: v_sub_nc_u16 v5.l, ttmp15, src_scc ; encoding: [0x05,0x00,0x04,0xd7,0x7b,0xfa,0x01,0x00] +# W32-FAKE16: v_sub_nc_u16 v5, ttmp15, src_scc ; encoding: [0x05,0x00,0x04,0xd7,0x7b,0xfa,0x01,0x00] +# W64-REAL16: v_sub_nc_u16 v5.l, ttmp15, src_scc ; encoding: [0x05,0x00,0x04,0xd7,0x7b,0xfa,0x01,0x00] +# W64-FAKE16: v_sub_nc_u16 v5, ttmp15, src_scc ; encoding: [0x05,0x00,0x04,0xd7,0x7b,0xfa,0x01,0x00] 0x05,0x00,0x04,0xd7,0x7b,0xfa,0x01,0x00 -# GFX11: v_sub_nc_u16 v5, m0, 0x3800 +# W32-REAL16: v_sub_nc_u16 v5.l, m0, 0x3800 ; encoding: [0x05,0x00,0x04,0xd7,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00] +# W32-FAKE16: v_sub_nc_u16 v5, m0, 0x3800 ; encoding: [0x05,0x00,0x04,0xd7,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00] +# W64-REAL16: v_sub_nc_u16 v5.l, m0, 0x3800 ; encoding: [0x05,0x00,0x04,0xd7,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00] +# W64-FAKE16: v_sub_nc_u16 v5, m0, 0x3800 ; encoding: [0x05,0x00,0x04,0xd7,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00] 0x05,0x00,0x04,0xd7,0x7d,0xe0,0x01,0x00 -# GFX11: v_sub_nc_u16 v5, exec_lo, -1 ; encoding: [0x05,0x00,0x04,0xd7,0x7e,0x82,0x01,0x00] +# W32-REAL16: v_sub_nc_u16 v5.l, exec_lo, -1 ; encoding: [0x05,0x00,0x04,0xd7,0x7e,0x82,0x01,0x00] +# W32-FAKE16: v_sub_nc_u16 v5, exec_lo, -1 ; encoding: [0x05,0x00,0x04,0xd7,0x7e,0x82,0x01,0x00] +# W64-REAL16: v_sub_nc_u16 v5.l, exec_lo, -1 ; encoding: [0x05,0x00,0x04,0xd7,0x7e,0x82,0x01,0x00] +# W64-FAKE16: v_sub_nc_u16 v5, exec_lo, -1 ; encoding: [0x05,0x00,0x04,0xd7,0x7e,0x82,0x01,0x00] 0x05,0x00,0x04,0xd7,0x7e,0x82,0x01,0x00 -# GFX11: v_sub_nc_u16 v5, exec_hi, null ; encoding: [0x05,0x00,0x04,0xd7,0x7f,0xf8,0x00,0x00] +# W32-REAL16: v_sub_nc_u16 v5.l, exec_hi, null ; encoding: [0x05,0x00,0x04,0xd7,0x7f,0xf8,0x00,0x00] +# W32-FAKE16: v_sub_nc_u16 v5, 
exec_hi, null ; encoding: [0x05,0x00,0x04,0xd7,0x7f,0xf8,0x00,0x00] +# W64-REAL16: v_sub_nc_u16 v5.l, exec_hi, null ; encoding: [0x05,0x00,0x04,0xd7,0x7f,0xf8,0x00,0x00] +# W64-FAKE16: v_sub_nc_u16 v5, exec_hi, null ; encoding: [0x05,0x00,0x04,0xd7,0x7f,0xf8,0x00,0x00] 0x05,0x00,0x04,0xd7,0x7f,0xf8,0x00,0x00 -# GFX11: v_sub_nc_u16 v5, null, exec_lo ; encoding: [0x05,0x00,0x04,0xd7,0x7c,0xfc,0x00,0x00] +# W32-REAL16: v_sub_nc_u16 v5.l, null, exec_lo ; encoding: [0x05,0x00,0x04,0xd7,0x7c,0xfc,0x00,0x00] +# W32-FAKE16: v_sub_nc_u16 v5, null, exec_lo ; encoding: [0x05,0x00,0x04,0xd7,0x7c,0xfc,0x00,0x00] +# W64-REAL16: v_sub_nc_u16 v5.l, null, exec_lo ; encoding: [0x05,0x00,0x04,0xd7,0x7c,0xfc,0x00,0x00] +# W64-FAKE16: v_sub_nc_u16 v5, null, exec_lo ; encoding: [0x05,0x00,0x04,0xd7,0x7c,0xfc,0x00,0x00] 0x05,0x00,0x04,0xd7,0x7c,0xfc,0x00,0x00 -# GFX11: v_sub_nc_u16 v5, -1, exec_hi op_sel:[1,1,1] ; encoding: [0x05,0x58,0x04,0xd7,0xc1,0xfe,0x00,0x00] -0x05,0x58,0x04,0xd7,0xc1,0xfe,0x00,0x00 - -# GFX11: v_sub_nc_u16 v5, 0x3800, m0 op_sel:[1,0,0] -0x05,0x08,0x04,0xd7,0xf0,0xfa,0x00,0x00 - -# GFX11: v_sub_nc_u16 v5, src_scc, vcc_lo op_sel:[0,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xfd,0xd4,0x00,0x00] -0x05,0x10,0x04,0xd7,0xfd,0xd4,0x00,0x00 +# W32-REAL16: v_sub_nc_u16 v5.l, -1, exec_hi ; encoding: [0x05,0x00,0x04,0xd7,0xc1,0xfe,0x00,0x00] +# W32-FAKE16: v_sub_nc_u16 v5, -1, exec_hi ; encoding: [0x05,0x00,0x04,0xd7,0xc1,0xfe,0x00,0x00] +# W64-REAL16: v_sub_nc_u16 v5.l, -1, exec_hi ; encoding: [0x05,0x00,0x04,0xd7,0xc1,0xfe,0x00,0x00] +# W64-FAKE16: v_sub_nc_u16 v5, -1, exec_hi ; encoding: [0x05,0x00,0x04,0xd7,0xc1,0xfe,0x00,0x00] +0x05,0x00,0x04,0xd7,0xc1,0xfe,0x00,0x00 + +# W32-REAL16: v_sub_nc_u16 v5.l, 0x3800, m0 ; encoding: [0x05,0x00,0x04,0xd7,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00] +# W32-FAKE16: v_sub_nc_u16 v5, 0x3800, m0 ; encoding: [0x05,0x00,0x04,0xd7,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00] +# W64-REAL16: v_sub_nc_u16 v5.l, 0x3800, m0 ; encoding: [0x05,0x00,0x04,0xd7,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00] +# W64-FAKE16: v_sub_nc_u16 v5, 0x3800, m0 ; encoding: [0x05,0x00,0x04,0xd7,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00] +0x05,0x00,0x04,0xd7,0xf0,0xfa,0x00,0x00 + +# W32-REAL16: v_sub_nc_u16 v5.l, src_scc, vcc_lo ; encoding: [0x05,0x00,0x04,0xd7,0xfd,0xd4,0x00,0x00] +# W32-FAKE16: v_sub_nc_u16 v5, src_scc, vcc_lo ; encoding: [0x05,0x00,0x04,0xd7,0xfd,0xd4,0x00,0x00] +# W64-REAL16: v_sub_nc_u16 v5.l, src_scc, vcc_lo ; encoding: [0x05,0x00,0x04,0xd7,0xfd,0xd4,0x00,0x00] +# W64-FAKE16: v_sub_nc_u16 v5, src_scc, vcc_lo ; encoding: [0x05,0x00,0x04,0xd7,0xfd,0xd4,0x00,0x00] +0x05,0x00,0x04,0xd7,0xfd,0xd4,0x00,0x00 + +# W32-REAL16: v_sub_nc_u16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x04,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +# W32-FAKE16: v_sub_nc_u16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x04,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +# W64-REAL16: v_sub_nc_u16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x04,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +# W64-FAKE16: v_sub_nc_u16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x04,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +0xff,0xc0,0x04,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00 -# GFX11: v_sub_nc_u16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x04,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +# W32-REAL16: v_sub_nc_u16 v5.l, v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x05,0x08,0x04,0xd7,0x01,0x05,0x02,0x00] +# 
W32-FAKE16: v_sub_nc_u16 v5, v1, v2 op_sel:[1,0,0] ; encoding: [0x05,0x08,0x04,0xd7,0x01,0x05,0x02,0x00] +# W64-REAL16: v_sub_nc_u16 v5.l, v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x05,0x08,0x04,0xd7,0x01,0x05,0x02,0x00] +# W64-FAKE16: v_sub_nc_u16 v5, v1, v2 op_sel:[1,0,0] ; encoding: [0x05,0x08,0x04,0xd7,0x01,0x05,0x02,0x00] +0x05,0x08,0x04,0xd7,0x01,0x05,0x02,0x00 + +# W32-REAL16: v_sub_nc_u16 v5.l, v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xff,0xff,0x03,0x00] +# W32-FAKE16: v_sub_nc_u16 v5, v255, v255 op_sel:[0,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xff,0xff,0x03,0x00] +# W64-REAL16: v_sub_nc_u16 v5.l, v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xff,0xff,0x03,0x00] +# W64-FAKE16: v_sub_nc_u16 v5, v255, v255 op_sel:[0,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xff,0xff,0x03,0x00] +0x05,0x10,0x04,0xd7,0xff,0xff,0x03,0x00 + +# W32-REAL16: v_sub_nc_u16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x04,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +# W32-FAKE16: v_sub_nc_u16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x04,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +# W64-REAL16: v_sub_nc_u16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x04,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +# W64-FAKE16: v_sub_nc_u16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x04,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] 0xff,0xc0,0x04,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00 # W32: v_subrev_co_u32 v5, s12, v1, v2 ; encoding: [0x05,0x0c,0x02,0xd7,0x01,0x05,0x02,0x00] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16.txt index 4ae8b053f0e0f3e7ec4e1c9f12fa3fd8c195a227..d0bd6398ad10a0132fb7ff1fdd06bcac93ab5a67 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16.txt @@ -3824,88 +3824,220 @@ # W64-FAKE16: v_xor_b16_e64_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0x64,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0xff,0x00,0x64,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 -# GFX11: v_add_nc_i16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] 0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff -# GFX11: v_add_nc_i16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf 
bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] 0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff -# GFX11: v_add_nc_i16_e64_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] 0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff -# GFX11: v_add_nc_i16_e64_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] 0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff -# GFX11: v_add_nc_i16_e64_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] 0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff -# GFX11: v_add_nc_i16_e64_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] 0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff -# GFX11: v_add_nc_i16_e64_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] 0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff -# GFX11: v_add_nc_i16_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] 0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff -# GFX11: v_add_nc_i16_e64_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] 0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff -# GFX11: v_add_nc_i16_e64_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l 
row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] 0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff -# GFX11: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# W32-REAL16: v_add_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# W64-REAL16: v_add_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] 0x05,0x58,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff -# GFX11: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] 0x05,0x08,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 -# GFX11: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] 0x05,0x10,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 -# GFX11: v_add_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# W32-REAL16: v_add_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: 
[0xff,0xc0,0x0d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# W32-FAKE16: v_add_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# W64-REAL16: v_add_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# W64-FAKE16: v_add_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0xff,0xc0,0x0d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 -# GFX11: v_add_nc_u16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# W32-REAL16: v_add_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# W64-REAL16: v_add_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +0x05,0x58,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff + +# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +0x05,0x08,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 + +# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +0x05,0x10,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 + +# W32-REAL16: v_add_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# W32-FAKE16: v_add_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 
bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# W64-REAL16: v_add_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# W64-FAKE16: v_add_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +0xff,0xc0,0x0d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 + +# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] 0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff -# GFX11: v_add_nc_u16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] 0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff -# GFX11: v_add_nc_u16_e64_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] 0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff -# GFX11: v_add_nc_u16_e64_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] 0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff -# GFX11: v_add_nc_u16_e64_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] 0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff -# GFX11: v_add_nc_u16_e64_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] 0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff -# GFX11: v_add_nc_u16_e64_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] 0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff -# GFX11: v_add_nc_u16_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, 
v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] 0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff -# GFX11: v_add_nc_u16_e64_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] 0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff -# GFX11: v_add_nc_u16_e64_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] 0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff -# GFX11: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# W32-REAL16: v_add_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# W64-REAL16: v_add_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +0x05,0x58,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff + +# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# 
W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +0x05,0x08,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 + +# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +0x05,0x10,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 + +# W32-REAL16: v_add_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# W32-FAKE16: v_add_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# W64-REAL16: v_add_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# W64-FAKE16: v_add_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +0xff,0xc0,0x03,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 + +# W32-REAL16: v_add_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# W64-REAL16: v_add_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] 0x05,0x58,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff -# GFX11: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: 
[0x05,0x08,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] 0x05,0x08,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 -# GFX11: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] 0x05,0x10,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 -# GFX11: v_add_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# W32-REAL16: v_add_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# W32-FAKE16: v_add_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# W64-REAL16: v_add_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# W64-FAKE16: v_add_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0xff,0xc0,0x03,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 # GFX11: v_cvt_pk_norm_i16_f16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x12,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] @@ -4664,88 +4796,220 @@ # GFX11: v_pack_b32_f16_e64_dpp v255, -|v255|, -|v255| op_sel:[0,1,0] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x13,0x11,0xd7,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] 0xff,0x13,0x11,0xd7,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30 -# GFX11: v_sub_nc_i16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] 0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff -# GFX11: 
v_sub_nc_i16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] 0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff -# GFX11: v_sub_nc_i16_e64_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] 0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff -# GFX11: v_sub_nc_i16_e64_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] 0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff -# GFX11: v_sub_nc_i16_e64_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] 0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff -# GFX11: v_sub_nc_i16_e64_dpp v5, v1, v2 
row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] 0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff -# GFX11: v_sub_nc_i16_e64_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] 0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff -# GFX11: v_sub_nc_i16_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] 0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff -# GFX11: v_sub_nc_i16_e64_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] 0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff -# GFX11: v_sub_nc_i16_e64_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# 
W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] 0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff -# GFX11: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# W32-REAL16: v_sub_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# W64-REAL16: v_sub_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] 0x05,0x58,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff -# GFX11: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] 0x05,0x08,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 -# GFX11: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] 0x05,0x10,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 -# GFX11: 
v_sub_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# W32-REAL16: v_sub_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# W32-FAKE16: v_sub_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# W64-REAL16: v_sub_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# W64-FAKE16: v_sub_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0xff,0xc0,0x0e,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 -# GFX11: v_sub_nc_u16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# W32-REAL16: v_sub_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# W64-REAL16: v_sub_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +0x05,0x58,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff + +# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +0x05,0x08,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 + +# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] 
+0x05,0x10,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13
+
+# W32-REAL16: v_sub_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+# W32-FAKE16: v_sub_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+# W64-REAL16: v_sub_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+# W64-FAKE16: v_sub_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+0xff,0xc0,0x0e,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30
+
+# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff

-# GFX11: v_sub_nc_u16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
+# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
+# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
+# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
+# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff

-# GFX11: v_sub_nc_u16_e64_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
+# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
+# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
+# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
+# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff

-# GFX11: v_sub_nc_u16_e64_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
+# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
+# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
+# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
+# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff

-# GFX11: v_sub_nc_u16_e64_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
+# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
+# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
+# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
+# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff

-# GFX11: v_sub_nc_u16_e64_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff

-# GFX11: v_sub_nc_u16_e64_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
+# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
+# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
+# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
+# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff

-# GFX11: v_sub_nc_u16_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
+# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
+# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
+# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
+# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff

-# GFX11: v_sub_nc_u16_e64_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
+# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
+# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
+# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
+# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff

-# GFX11: v_sub_nc_u16_e64_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
+# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
+# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
+# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
+# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff

-# GFX11: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# W32-REAL16: v_sub_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# W64-REAL16: v_sub_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+0x05,0x58,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+
+# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+0x05,0x08,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01
+
+# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+0x05,0x10,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13
+
+# W32-REAL16: v_sub_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+# W32-FAKE16: v_sub_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+# W64-REAL16: v_sub_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+# W64-FAKE16: v_sub_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+0xff,0xc0,0x04,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30
+
+# W32-REAL16: v_sub_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# W64-REAL16: v_sub_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
0x05,0x58,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff

-# GFX11: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
0x05,0x08,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01

-# GFX11: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
0x05,0x10,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13

-# GFX11: v_sub_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+# W32-REAL16: v_sub_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+# W32-FAKE16: v_sub_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+# W64-REAL16: v_sub_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+# W64-FAKE16: v_sub_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
0xff,0xc0,0x04,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30

# GFX11: v_dot2_f16_f16_e64_dpp v0, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 fi:1 ; encoding: [0x00,0x00,0x66,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x04,0x00]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8.txt
index b44dba748666c750d2d6ddaad60da00fc304e97c..cbf5a3d11e50b88d4f5846ce2e50e63aba9b9dc8 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8.txt
@@ -2168,34 +2168,112 @@
# W64-FAKE16: v_xor_b16_e64_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0x64,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
0xff,0x00,0x64,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00

-# GFX11: v_add_nc_i16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x05,0x00,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05

-# GFX11: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_add_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_add_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x05,0x58,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05

-# GFX11: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x05,0x08,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05

-# GFX11: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x05,0x10,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05

-# GFX11: v_add_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W32-REAL16: v_add_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W32-FAKE16: v_add_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W64-REAL16: v_add_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W64-FAKE16: v_add_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
0xff,0xc0,0x0d,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00

-# GFX11: v_add_nc_u16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_add_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_add_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+0x05,0x58,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+
+# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+0x05,0x08,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+
+# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+0x05,0x10,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+
+# W32-REAL16: v_add_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W32-FAKE16: v_add_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W64-REAL16: v_add_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W64-FAKE16: v_add_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+0xff,0xc0,0x0d,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
+
+# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x05,0x00,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05

-# GFX11: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_add_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_add_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+0x05,0x58,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+
+# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+0x05,0x08,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+
+# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+0x05,0x10,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+
+# W32-REAL16: v_add_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W32-FAKE16: v_add_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W64-REAL16: v_add_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W64-FAKE16: v_add_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+0xff,0xc0,0x03,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
+
+# W32-REAL16: v_add_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_add_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x05,0x58,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05

-# GFX11: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x05,0x08,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05

-# GFX11: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x05,0x10,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05

-# GFX11: v_add_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W32-REAL16: v_add_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W32-FAKE16: v_add_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W64-REAL16: v_add_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W64-FAKE16: v_add_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
0xff,0xc0,0x03,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00

# GFX11: v_cvt_pk_norm_i16_f16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x12,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
@@ -2840,34 +2918,112 @@
# GFX11: v_pack_b32_f16_e64_dpp v255, -|v255|, -|v255| op_sel:[0,1,0] dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x13,0x11,0xd7,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
0xff,0x13,0x11,0xd7,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00

-# GFX11: v_sub_nc_i16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x05,0x00,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05

-# GFX11: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_sub_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_sub_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x05,0x58,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05

-# GFX11: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x05,0x08,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05

-# GFX11: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x05,0x10,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05

-# GFX11: v_sub_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W32-REAL16: v_sub_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W32-FAKE16: v_sub_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W64-REAL16: v_sub_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W64-FAKE16: v_sub_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
0xff,0xc0,0x0e,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00

-# GFX11: v_sub_nc_u16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_sub_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_sub_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+0x05,0x58,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+
+# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+0x05,0x08,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+
+# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+0x05,0x10,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+
+# W32-REAL16: v_sub_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W32-FAKE16: v_sub_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W64-REAL16: v_sub_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W64-FAKE16: v_sub_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+0xff,0xc0,0x0e,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
+
+# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x05,0x00,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05

-# GFX11: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_sub_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_sub_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+0x05,0x58,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+
+# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+0x05,0x08,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+
+# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+0x05,0x10,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+
+# W32-REAL16: v_sub_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W32-FAKE16: v_sub_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W64-REAL16: v_sub_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+# W64-FAKE16: v_sub_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+0xff,0xc0,0x04,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00

# GFX11: v_dot2_f16_f16_e64_dpp v0, v1, v2, v3 dpp8:[0,1,2,3,4,4,4,4] ; encoding: [0x00,0x00,0x66,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x88,0x46,0x92]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3.txt
index af04a31423b6fea36674282a7f3c5d01e9a03474..c87c8855f5cdc0d0085424274933fedf2f272b79 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3.txt
@@ -153,49 +153,112 @@
# GFX12: v_add_lshl_u32 v255, 0xaf123456, vcc_hi, null ; encoding: [0xff,0x00,0x47,0xd6,0xff,0xd6,0xf0,0x01,0x56,0x34,0x12,0xaf]
0xff,0x00,0x47,0xd6,0xff,0xd6,0xf0,0x01,0x56,0x34,0x12,0xaf

-# GFX12: v_add_nc_i16 v5, v1, v2 ; encoding: [0x05,0x00,0x0d,0xd7,0x01,0x05,0x02,0x00]
+# W32-REAL16: v_add_nc_i16 v5.l, v1.l, v2.l ; encoding: [0x05,0x00,0x0d,0xd7,0x01,0x05,0x02,0x00]
+# W32-FAKE16: v_add_nc_i16 v5, v1, v2 ; encoding: [0x05,0x00,0x0d,0xd7,0x01,0x05,0x02,0x00]
+# W64-REAL16: v_add_nc_i16 v5.l, v1.l, v2.l ; encoding: [0x05,0x00,0x0d,0xd7,0x01,0x05,0x02,0x00]
+# W64-FAKE16: v_add_nc_i16 v5, v1, v2 ; encoding: [0x05,0x00,0x0d,0xd7,0x01,0x05,0x02,0x00]
0x05,0x00,0x0d,0xd7,0x01,0x05,0x02,0x00

-# GFX12: v_add_nc_i16 v5, v255, v255 ; encoding: [0x05,0x00,0x0d,0xd7,0xff,0xff,0x03,0x00]
+# W32-REAL16: v_add_nc_i16 v5.l, v255.l, v255.l ; encoding: [0x05,0x00,0x0d,0xd7,0xff,0xff,0x03,0x00]
+# W32-FAKE16: v_add_nc_i16 v5, v255, v255 ; encoding: [0x05,0x00,0x0d,0xd7,0xff,0xff,0x03,0x00]
+# W64-REAL16: v_add_nc_i16 v5.l, v255.l, v255.l ; encoding: [0x05,0x00,0x0d,0xd7,0xff,0xff,0x03,0x00]
+# W64-FAKE16: v_add_nc_i16 v5, v255, v255 ; encoding: [0x05,0x00,0x0d,0xd7,0xff,0xff,0x03,0x00]
0x05,0x00,0x0d,0xd7,0xff,0xff,0x03,0x00

-# GFX12: v_add_nc_i16 v5, s1, s2 ; encoding: [0x05,0x00,0x0d,0xd7,0x01,0x04,0x00,0x00]
+# W32-REAL16: v_add_nc_i16 v5.l, s1, s2 ; encoding: [0x05,0x00,0x0d,0xd7,0x01,0x04,0x00,0x00]
+# W32-FAKE16: v_add_nc_i16 v5, s1, s2 ; encoding: [0x05,0x00,0x0d,0xd7,0x01,0x04,0x00,0x00]
+# W64-REAL16: v_add_nc_i16 v5.l, s1, s2 ; encoding: [0x05,0x00,0x0d,0xd7,0x01,0x04,0x00,0x00]
+# W64-FAKE16: v_add_nc_i16 v5, s1, s2 ; encoding: [0x05,0x00,0x0d,0xd7,0x01,0x04,0x00,0x00]
0x05,0x00,0x0d,0xd7,0x01,0x04,0x00,0x00

-# GFX12: v_add_nc_i16 v5, s105, s105 ; encoding: [0x05,0x00,0x0d,0xd7,0x69,0xd2,0x00,0x00]
+# W32-REAL16: v_add_nc_i16 v5.l, s105, s105 ; encoding: [0x05,0x00,0x0d,0xd7,0x69,0xd2,0x00,0x00]
+# W32-FAKE16: v_add_nc_i16 v5, s105, s105 ; encoding: [0x05,0x00,0x0d,0xd7,0x69,0xd2,0x00,0x00]
+# W64-REAL16: v_add_nc_i16 v5.l, s105, s105 ; encoding: [0x05,0x00,0x0d,0xd7,0x69,0xd2,0x00,0x00]
+# W64-FAKE16: v_add_nc_i16 v5, s105, s105 ; encoding: [0x05,0x00,0x0d,0xd7,0x69,0xd2,0x00,0x00]
0x05,0x00,0x0d,0xd7,0x69,0xd2,0x00,0x00

-# GFX12: v_add_nc_i16 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x0d,0xd7,0x6a,0xf6,0x00,0x00]
+# W32-REAL16: v_add_nc_i16 v5.l, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x0d,0xd7,0x6a,0xf6,0x00,0x00]
+# W32-FAKE16: v_add_nc_i16 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x0d,0xd7,0x6a,0xf6,0x00,0x00]
+# W64-REAL16: v_add_nc_i16 v5.l, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x0d,0xd7,0x6a,0xf6,0x00,0x00]
+# W64-FAKE16: v_add_nc_i16 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x0d,0xd7,0x6a,0xf6,0x00,0x00]
0x05,0x00,0x0d,0xd7,0x6a,0xf6,0x00,0x00

-# GFX12: v_add_nc_i16 v5, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x0d,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
+# W32-REAL16: v_add_nc_i16 v5.l, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x0d,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
+# W32-FAKE16: v_add_nc_i16 v5, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x0d,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
+# W64-REAL16: v_add_nc_i16 v5.l, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x0d,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
+# W64-FAKE16: v_add_nc_i16 v5, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x0d,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
0x05,0x00,0x0d,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00

-# GFX12: v_add_nc_i16 v5, ttmp15, src_scc ; encoding: [0x05,0x00,0x0d,0xd7,0x7b,0xfa,0x01,0x00]
+# W32-REAL16: v_add_nc_i16 v5.l, ttmp15, src_scc ; encoding: [0x05,0x00,0x0d,0xd7,0x7b,0xfa,0x01,0x00]
+# W32-FAKE16: v_add_nc_i16 v5, ttmp15, src_scc ; encoding: [0x05,0x00,0x0d,0xd7,0x7b,0xfa,0x01,0x00]
+# W64-REAL16: v_add_nc_i16 v5.l, ttmp15, src_scc ; encoding: [0x05,0x00,0x0d,0xd7,0x7b,0xfa,0x01,0x00]
+# W64-FAKE16: v_add_nc_i16 v5, ttmp15, src_scc ; encoding: [0x05,0x00,0x0d,0xd7,0x7b,0xfa,0x01,0x00]
0x05,0x00,0x0d,0xd7,0x7b,0xfa,0x01,0x00

-# GFX12: v_add_nc_i16 v5, m0, 0x3800
+# W32-REAL16: v_add_nc_i16 v5.l, m0, 0x3800 ; encoding: [0x05,0x00,0x0d,0xd7,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00]
+# W32-FAKE16: v_add_nc_i16 v5, m0, 0x3800 ; encoding: [0x05,0x00,0x0d,0xd7,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00]
+# W64-REAL16: v_add_nc_i16 v5.l, m0, 0x3800 ; encoding: [0x05,0x00,0x0d,0xd7,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00]
+# W64-FAKE16: v_add_nc_i16 v5, m0, 0x3800 ; encoding: [0x05,0x00,0x0d,0xd7,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00]
0x05,0x00,0x0d,0xd7,0x7d,0xe0,0x01,0x00

-# GFX12: v_add_nc_i16 v5, exec_lo, -1 ; encoding: [0x05,0x00,0x0d,0xd7,0x7e,0x82,0x01,0x00]
+# W32-REAL16: v_add_nc_i16 v5.l, exec_lo, -1 ; encoding: [0x05,0x00,0x0d,0xd7,0x7e,0x82,0x01,0x00]
+# W32-FAKE16: v_add_nc_i16 v5, exec_lo, -1 ; encoding: [0x05,0x00,0x0d,0xd7,0x7e,0x82,0x01,0x00]
+# W64-REAL16: v_add_nc_i16 v5.l, exec_lo, -1 ; encoding: [0x05,0x00,0x0d,0xd7,0x7e,0x82,0x01,0x00]
+# W64-FAKE16: v_add_nc_i16 v5, exec_lo, -1 ; encoding: [0x05,0x00,0x0d,0xd7,0x7e,0x82,0x01,0x00]
0x05,0x00,0x0d,0xd7,0x7e,0x82,0x01,0x00

-# GFX12: v_add_nc_i16 v5, exec_hi, null ; encoding: [0x05,0x00,0x0d,0xd7,0x7f,0xf8,0x00,0x00]
+# W32-REAL16: v_add_nc_i16 v5.l, exec_hi, null ; encoding: [0x05,0x00,0x0d,0xd7,0x7f,0xf8,0x00,0x00]
+# W32-FAKE16: v_add_nc_i16 v5, exec_hi, null ; encoding: [0x05,0x00,0x0d,0xd7,0x7f,0xf8,0x00,0x00]
+# W64-REAL16: v_add_nc_i16 v5.l, exec_hi, null ; encoding: [0x05,0x00,0x0d,0xd7,0x7f,0xf8,0x00,0x00]
+# W64-FAKE16: v_add_nc_i16 v5, exec_hi, null ; encoding: [0x05,0x00,0x0d,0xd7,0x7f,0xf8,0x00,0x00]
0x05,0x00,0x0d,0xd7,0x7f,0xf8,0x00,0x00

-# GFX12: v_add_nc_i16 v5, null, exec_lo ; encoding: [0x05,0x00,0x0d,0xd7,0x7c,0xfc,0x00,0x00]
+# W32-REAL16: v_add_nc_i16 v5.l, null, exec_lo ; encoding: [0x05,0x00,0x0d,0xd7,0x7c,0xfc,0x00,0x00]
+# W32-FAKE16: v_add_nc_i16 v5, null, exec_lo ; encoding: [0x05,0x00,0x0d,0xd7,0x7c,0xfc,0x00,0x00]
+# W64-REAL16: v_add_nc_i16 v5.l, null, exec_lo ; encoding: [0x05,0x00,0x0d,0xd7,0x7c,0xfc,0x00,0x00]
+# W64-FAKE16: v_add_nc_i16 v5, null, exec_lo ; encoding: [0x05,0x00,0x0d,0xd7,0x7c,0xfc,0x00,0x00]
0x05,0x00,0x0d,0xd7,0x7c,0xfc,0x00,0x00

-# GFX12: v_add_nc_i16 v5, -1, exec_hi op_sel:[1,1,1] ; encoding: [0x05,0x58,0x0d,0xd7,0xc1,0xfe,0x00,0x00]
+# W32-REAL16: v_add_nc_i16 v5.h, -1, exec_hi op_sel:[1,1,1] ; encoding: [0x05,0x58,0x0d,0xd7,0xc1,0xfe,0x00,0x00]
+# W32-FAKE16: v_add_nc_i16 v5, -1, exec_hi op_sel:[1,1,1] ; encoding: [0x05,0x58,0x0d,0xd7,0xc1,0xfe,0x00,0x00]
+# W64-REAL16: v_add_nc_i16 v5.h, -1, exec_hi op_sel:[1,1,1] ; encoding: [0x05,0x58,0x0d,0xd7,0xc1,0xfe,0x00,0x00]
+# W64-FAKE16: v_add_nc_i16 v5, -1, exec_hi op_sel:[1,1,1] ; encoding: [0x05,0x58,0x0d,0xd7,0xc1,0xfe,0x00,0x00]
0x05,0x58,0x0d,0xd7,0xc1,0xfe,0x00,0x00

-# GFX12: v_add_nc_i16 v5, 0x3800, m0 op_sel:[1,0,0]
+# W32-REAL16: v_add_nc_i16 v5.l, 0x3800, m0 op_sel:[1,0,0] ; encoding: [0x05,0x08,0x0d,0xd7,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00]
+# W32-FAKE16: v_add_nc_i16 v5, 0x3800, m0 op_sel:[1,0,0] ; encoding: [0x05,0x08,0x0d,0xd7,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00]
+# W64-REAL16: v_add_nc_i16 v5.l, 0x3800, m0 op_sel:[1,0,0] ; encoding: [0x05,0x08,0x0d,0xd7,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00]
+# W64-FAKE16: v_add_nc_i16 v5, 0x3800, m0 op_sel:[1,0,0] ; encoding: [0x05,0x08,0x0d,0xd7,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00]
0x05,0x08,0x0d,0xd7,0xf0,0xfa,0x00,0x00

-# GFX12: v_add_nc_i16 v5, src_scc, vcc_lo op_sel:[0,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xfd,0xd4,0x00,0x00]
+# W32-REAL16: v_add_nc_i16 v5.l, src_scc, vcc_lo op_sel:[0,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xfd,0xd4,0x00,0x00]
+# W32-FAKE16: v_add_nc_i16 v5, src_scc, vcc_lo op_sel:[0,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xfd,0xd4,0x00,0x00]
+# W64-REAL16: v_add_nc_i16 v5.l, src_scc, vcc_lo op_sel:[0,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xfd,0xd4,0x00,0x00]
+# W64-FAKE16: v_add_nc_i16 v5, src_scc, vcc_lo op_sel:[0,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xfd,0xd4,0x00,0x00]
0x05,0x10,0x0d,0xd7,0xfd,0xd4,0x00,0x00

-# GFX12: v_add_nc_i16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x0d,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+# W32-REAL16: v_add_nc_i16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x0d,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+# W32-FAKE16: v_add_nc_i16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x0d,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+# W64-REAL16: v_add_nc_i16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x0d,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+# W64-FAKE16: v_add_nc_i16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x0d,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+0xff,0xc0,0x0d,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00
+
+# W32-REAL16: v_add_nc_i16 v5.l, v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x05,0x08,0x0d,0xd7,0x01,0x05,0x02,0x00]
+# W32-FAKE16: v_add_nc_i16 v5, v1, v2 op_sel:[1,0,0] ; encoding: [0x05,0x08,0x0d,0xd7,0x01,0x05,0x02,0x00]
+# W64-REAL16: v_add_nc_i16 v5.l, v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x05,0x08,0x0d,0xd7,0x01,0x05,0x02,0x00]
+# W64-FAKE16: v_add_nc_i16 v5, v1, v2 op_sel:[1,0,0] ; encoding: [0x05,0x08,0x0d,0xd7,0x01,0x05,0x02,0x00]
+0x05,0x08,0x0d,0xd7,0x01,0x05,0x02,0x00
+
+# W32-REAL16: v_add_nc_i16 v5.l, v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xff,0xff,0x03,0x00]
+# W32-FAKE16: v_add_nc_i16 v5, v255, v255 op_sel:[0,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xff,0xff,0x03,0x00]
+# W64-REAL16: v_add_nc_i16 v5.l, v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xff,0xff,0x03,0x00]
+# W64-FAKE16: v_add_nc_i16 v5, v255, v255 op_sel:[0,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xff,0xff,0x03,0x00]
+0x05,0x10,0x0d,0xd7,0xff,0xff,0x03,0x00
+
+# W32-REAL16: v_add_nc_i16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x0d,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+# W32-FAKE16: v_add_nc_i16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x0d,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+# W64-REAL16: v_add_nc_i16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x0d,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+# W64-FAKE16: v_add_nc_i16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x0d,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
0xff,0xc0,0x0d,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00

# GFX12: v_add_nc_i32 v5, v1, v2 ; encoding: [0x05,0x00,0x26,0xd7,0x01,0x05,0x02,0x00]
@@ -243,49 +306,112 @@
# GFX12: v_add_nc_i32 v255, 0xaf123456, vcc_hi clamp ; encoding: [0xff,0x80,0x26,0xd7,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf]
0xff,0x80,0x26,0xd7,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf

-# GFX12: v_add_nc_u16 v5, v1, v2 ; encoding: [0x05,0x00,0x03,0xd7,0x01,0x05,0x02,0x00]
+# W32-REAL16: v_add_nc_u16 v5.l, v1.l, v2.l ; encoding: [0x05,0x00,0x03,0xd7,0x01,0x05,0x02,0x00]
+# W32-FAKE16: v_add_nc_u16 v5, v1, v2 ; encoding: [0x05,0x00,0x03,0xd7,0x01,0x05,0x02,0x00]
+# W64-REAL16: v_add_nc_u16 v5.l, v1.l, v2.l ; encoding: [0x05,0x00,0x03,0xd7,0x01,0x05,0x02,0x00]
+# W64-FAKE16: v_add_nc_u16 v5, v1, v2 ; encoding: [0x05,0x00,0x03,0xd7,0x01,0x05,0x02,0x00]
0x05,0x00,0x03,0xd7,0x01,0x05,0x02,0x00

-# GFX12: v_add_nc_u16 v5, v255, v255 ; encoding: [0x05,0x00,0x03,0xd7,0xff,0xff,0x03,0x00]
+# W32-REAL16: v_add_nc_u16 v5.l, v255.l, v255.l ; encoding: [0x05,0x00,0x03,0xd7,0xff,0xff,0x03,0x00]
+# W32-FAKE16: v_add_nc_u16 v5, v255, v255 ; encoding: [0x05,0x00,0x03,0xd7,0xff,0xff,0x03,0x00]
+# W64-REAL16: v_add_nc_u16 v5.l, v255.l, v255.l ; encoding: [0x05,0x00,0x03,0xd7,0xff,0xff,0x03,0x00]
+# W64-FAKE16: v_add_nc_u16 v5, v255, v255 ; encoding: [0x05,0x00,0x03,0xd7,0xff,0xff,0x03,0x00]
0x05,0x00,0x03,0xd7,0xff,0xff,0x03,0x00

-# GFX12: v_add_nc_u16 v5, s1, s2 ; encoding: [0x05,0x00,0x03,0xd7,0x01,0x04,0x00,0x00]
+# W32-REAL16: v_add_nc_u16 v5.l, s1, s2 ; encoding: [0x05,0x00,0x03,0xd7,0x01,0x04,0x00,0x00]
+# W32-FAKE16: v_add_nc_u16 v5, s1, s2 ; encoding: [0x05,0x00,0x03,0xd7,0x01,0x04,0x00,0x00]
+# W64-REAL16: v_add_nc_u16 v5.l, s1, s2 ; encoding: [0x05,0x00,0x03,0xd7,0x01,0x04,0x00,0x00]
+# W64-FAKE16: v_add_nc_u16 v5, s1, s2 ; encoding: [0x05,0x00,0x03,0xd7,0x01,0x04,0x00,0x00]
0x05,0x00,0x03,0xd7,0x01,0x04,0x00,0x00

-# GFX12: v_add_nc_u16 v5, s105, s105 ; encoding: [0x05,0x00,0x03,0xd7,0x69,0xd2,0x00,0x00]
+# W32-REAL16: v_add_nc_u16 v5.l, s105, s105 ; encoding: [0x05,0x00,0x03,0xd7,0x69,0xd2,0x00,0x00]
+# W32-FAKE16: v_add_nc_u16 v5, s105, s105 ; encoding: [0x05,0x00,0x03,0xd7,0x69,0xd2,0x00,0x00]
+# W64-REAL16: v_add_nc_u16 v5.l, s105, s105 ; encoding: [0x05,0x00,0x03,0xd7,0x69,0xd2,0x00,0x00]
+# W64-FAKE16: v_add_nc_u16 v5, s105, s105 ; encoding: [0x05,0x00,0x03,0xd7,0x69,0xd2,0x00,0x00]
0x05,0x00,0x03,0xd7,0x69,0xd2,0x00,0x00

-# GFX12: v_add_nc_u16 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x03,0xd7,0x6a,0xf6,0x00,0x00]
+# W32-REAL16: v_add_nc_u16 v5.l, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x03,0xd7,0x6a,0xf6,0x00,0x00]
+# W32-FAKE16: v_add_nc_u16 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x03,0xd7,0x6a,0xf6,0x00,0x00]
+# W64-REAL16: v_add_nc_u16 v5.l, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x03,0xd7,0x6a,0xf6,0x00,0x00]
+# W64-FAKE16: v_add_nc_u16 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x03,0xd7,0x6a,0xf6,0x00,0x00]
0x05,0x00,0x03,0xd7,0x6a,0xf6,0x00,0x00

-# GFX12: v_add_nc_u16 v5, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x03,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
+# W32-REAL16: v_add_nc_u16 v5.l, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x03,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
+# W32-FAKE16: v_add_nc_u16 v5, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x03,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
+# W64-REAL16: v_add_nc_u16 v5.l, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x03,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
+# W64-FAKE16: v_add_nc_u16 v5, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x03,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
0x05,0x00,0x03,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00

-# GFX12: v_add_nc_u16 v5, ttmp15, src_scc ; encoding: [0x05,0x00,0x03,0xd7,0x7b,0xfa,0x01,0x00]
+# W32-REAL16: v_add_nc_u16 v5.l, ttmp15, src_scc ; encoding: [0x05,0x00,0x03,0xd7,0x7b,0xfa,0x01,0x00]
+# W32-FAKE16: v_add_nc_u16 v5, ttmp15, src_scc ; encoding: [0x05,0x00,0x03,0xd7,0x7b,0xfa,0x01,0x00]
+# W64-REAL16: v_add_nc_u16 v5.l, ttmp15, src_scc ; encoding: [0x05,0x00,0x03,0xd7,0x7b,0xfa,0x01,0x00]
+# W64-FAKE16: v_add_nc_u16 v5, ttmp15, src_scc ; encoding: [0x05,0x00,0x03,0xd7,0x7b,0xfa,0x01,0x00]
0x05,0x00,0x03,0xd7,0x7b,0xfa,0x01,0x00

-# GFX12: v_add_nc_u16 v5, m0, 0x3800
+# W32-REAL16: v_add_nc_u16 v5.l, m0, 0x3800 ; encoding: [0x05,0x00,0x03,0xd7,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00]
+# W32-FAKE16: v_add_nc_u16 v5, m0, 0x3800 ; encoding: [0x05,0x00,0x03,0xd7,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00]
+# W64-REAL16: v_add_nc_u16 v5.l, m0, 0x3800 ; encoding: [0x05,0x00,0x03,0xd7,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00]
+# W64-FAKE16: v_add_nc_u16 v5, m0, 0x3800 ; encoding: [0x05,0x00,0x03,0xd7,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00]
0x05,0x00,0x03,0xd7,0x7d,0xe0,0x01,0x00

-# GFX12: v_add_nc_u16 v5, exec_lo, -1 ; encoding: [0x05,0x00,0x03,0xd7,0x7e,0x82,0x01,0x00]
+# W32-REAL16: v_add_nc_u16 v5.l, exec_lo, -1 ; encoding: [0x05,0x00,0x03,0xd7,0x7e,0x82,0x01,0x00]
+# W32-FAKE16: v_add_nc_u16 v5, exec_lo, -1 ; encoding: [0x05,0x00,0x03,0xd7,0x7e,0x82,0x01,0x00]
+# W64-REAL16: v_add_nc_u16 v5.l, exec_lo, -1 ; encoding: [0x05,0x00,0x03,0xd7,0x7e,0x82,0x01,0x00]
+# W64-FAKE16: v_add_nc_u16 v5, exec_lo, -1 ; encoding: [0x05,0x00,0x03,0xd7,0x7e,0x82,0x01,0x00]
0x05,0x00,0x03,0xd7,0x7e,0x82,0x01,0x00

-# GFX12: v_add_nc_u16 v5, exec_hi, null ; encoding: [0x05,0x00,0x03,0xd7,0x7f,0xf8,0x00,0x00]
+# W32-REAL16: v_add_nc_u16 v5.l, exec_hi, null ; encoding: [0x05,0x00,0x03,0xd7,0x7f,0xf8,0x00,0x00]
+# W32-FAKE16: v_add_nc_u16 v5, exec_hi, null ; encoding: [0x05,0x00,0x03,0xd7,0x7f,0xf8,0x00,0x00]
+# W64-REAL16: v_add_nc_u16 v5.l, exec_hi, null ; encoding: [0x05,0x00,0x03,0xd7,0x7f,0xf8,0x00,0x00]
+# W64-FAKE16: v_add_nc_u16 v5, exec_hi, null ; encoding: [0x05,0x00,0x03,0xd7,0x7f,0xf8,0x00,0x00]
0x05,0x00,0x03,0xd7,0x7f,0xf8,0x00,0x00

-# GFX12: v_add_nc_u16 v5, null, exec_lo ; encoding: [0x05,0x00,0x03,0xd7,0x7c,0xfc,0x00,0x00]
+# W32-REAL16: v_add_nc_u16 v5.l, null, exec_lo ; encoding: [0x05,0x00,0x03,0xd7,0x7c,0xfc,0x00,0x00]
+# W32-FAKE16: v_add_nc_u16 v5, null, exec_lo ; encoding: [0x05,0x00,0x03,0xd7,0x7c,0xfc,0x00,0x00]
+# W64-REAL16: v_add_nc_u16 v5.l, null, exec_lo ; encoding: [0x05,0x00,0x03,0xd7,0x7c,0xfc,0x00,0x00]
+# W64-FAKE16: v_add_nc_u16 v5, null, exec_lo ; encoding: [0x05,0x00,0x03,0xd7,0x7c,0xfc,0x00,0x00]
0x05,0x00,0x03,0xd7,0x7c,0xfc,0x00,0x00

-# GFX12: v_add_nc_u16 v5, -1, exec_hi op_sel:[1,1,1] ; encoding: [0x05,0x58,0x03,0xd7,0xc1,0xfe,0x00,0x00]
+# W32-REAL16: v_add_nc_u16 v5.h, -1, exec_hi op_sel:[1,1,1] ; encoding: [0x05,0x58,0x03,0xd7,0xc1,0xfe,0x00,0x00]
+# W32-FAKE16: v_add_nc_u16 v5, -1, exec_hi op_sel:[1,1,1] ; encoding: [0x05,0x58,0x03,0xd7,0xc1,0xfe,0x00,0x00]
+# W64-REAL16: v_add_nc_u16 v5.h, -1, exec_hi op_sel:[1,1,1] ; encoding: [0x05,0x58,0x03,0xd7,0xc1,0xfe,0x00,0x00]
+# W64-FAKE16: v_add_nc_u16 v5, -1, exec_hi op_sel:[1,1,1] ; encoding: [0x05,0x58,0x03,0xd7,0xc1,0xfe,0x00,0x00]
0x05,0x58,0x03,0xd7,0xc1,0xfe,0x00,0x00

-# GFX12: v_add_nc_u16 v5, 0x3800, m0 op_sel:[1,0,0]
+# W32-REAL16: v_add_nc_u16 v5.l, 0x3800, m0 op_sel:[1,0,0] ; encoding: [0x05,0x08,0x03,0xd7,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00]
+# W32-FAKE16: v_add_nc_u16 v5, 0x3800, m0 op_sel:[1,0,0] ; encoding: [0x05,0x08,0x03,0xd7,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00]
+# W64-REAL16: v_add_nc_u16 v5.l, 0x3800, m0 op_sel:[1,0,0] ; encoding: [0x05,0x08,0x03,0xd7,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00]
+# W64-FAKE16: v_add_nc_u16 v5, 0x3800, m0 op_sel:[1,0,0] ; encoding: [0x05,0x08,0x03,0xd7,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00]
0x05,0x08,0x03,0xd7,0xf0,0xfa,0x00,0x00

-# GFX12: v_add_nc_u16 v5, src_scc, vcc_lo op_sel:[0,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xfd,0xd4,0x00,0x00]
+# W32-REAL16: v_add_nc_u16 v5.l, src_scc, vcc_lo op_sel:[0,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xfd,0xd4,0x00,0x00]
+# W32-FAKE16: v_add_nc_u16 v5, src_scc, vcc_lo op_sel:[0,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xfd,0xd4,0x00,0x00]
+# W64-REAL16: v_add_nc_u16 v5.l, src_scc, vcc_lo op_sel:[0,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xfd,0xd4,0x00,0x00]
+# W64-FAKE16: v_add_nc_u16 v5, src_scc, vcc_lo op_sel:[0,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xfd,0xd4,0x00,0x00]
0x05,0x10,0x03,0xd7,0xfd,0xd4,0x00,0x00

-# GFX12: v_add_nc_u16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x03,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+# W32-REAL16: v_add_nc_u16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x03,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+# W32-FAKE16: v_add_nc_u16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x03,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+# W64-REAL16: v_add_nc_u16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x03,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+# W64-FAKE16: v_add_nc_u16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x03,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+0xff,0xc0,0x03,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00
+
+# W32-REAL16: v_add_nc_u16 v5.l, v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x05,0x08,0x03,0xd7,0x01,0x05,0x02,0x00]
+# W32-FAKE16: v_add_nc_u16 v5, v1, v2 op_sel:[1,0,0] ; encoding: [0x05,0x08,0x03,0xd7,0x01,0x05,0x02,0x00]
+# W64-REAL16: v_add_nc_u16 v5.l, v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x05,0x08,0x03,0xd7,0x01,0x05,0x02,0x00]
+# W64-FAKE16: v_add_nc_u16 v5, v1, v2 op_sel:[1,0,0] ; encoding: [0x05,0x08,0x03,0xd7,0x01,0x05,0x02,0x00]
+0x05,0x08,0x03,0xd7,0x01,0x05,0x02,0x00
+
+# W32-REAL16: v_add_nc_u16 v5.l, v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xff,0xff,0x03,0x00]
+# W32-FAKE16: v_add_nc_u16 v5, v255, v255 op_sel:[0,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xff,0xff,0x03,0x00]
+# W64-REAL16: v_add_nc_u16 v5.l, v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xff,0xff,0x03,0x00] +# W64-FAKE16: v_add_nc_u16 v5, v255, v255 op_sel:[0,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xff,0xff,0x03,0x00] +0x05,0x10,0x03,0xd7,0xff,0xff,0x03,0x00 + +# W32-REAL16: v_add_nc_u16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x03,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +# W32-FAKE16: v_add_nc_u16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x03,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +# W64-REAL16: v_add_nc_u16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x03,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +# W64-FAKE16: v_add_nc_u16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x03,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] 0xff,0xc0,0x03,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00 # GFX12: v_alignbit_b32 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x16,0xd6,0x01,0x05,0x0e,0x00] @@ -5797,49 +5923,112 @@ # GFX12: v_sub_co_u32 v255, null, 0xaf123456, vcc_hi clamp ; encoding: [0xff,0xfc,0x01,0xd7,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf] 0xff,0xfc,0x01,0xd7,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf -# GFX12: v_sub_nc_i16 v5, v1, v2 ; encoding: [0x05,0x00,0x0e,0xd7,0x01,0x05,0x02,0x00] +# W32-REAL16: v_sub_nc_i16 v5.l, v1.l, v2.l ; encoding: [0x05,0x00,0x0e,0xd7,0x01,0x05,0x02,0x00] +# W32-FAKE16: v_sub_nc_i16 v5, v1, v2 ; encoding: [0x05,0x00,0x0e,0xd7,0x01,0x05,0x02,0x00] +# W64-REAL16: v_sub_nc_i16 v5.l, v1.l, v2.l ; encoding: [0x05,0x00,0x0e,0xd7,0x01,0x05,0x02,0x00] +# W64-FAKE16: v_sub_nc_i16 v5, v1, v2 ; encoding: [0x05,0x00,0x0e,0xd7,0x01,0x05,0x02,0x00] 0x05,0x00,0x0e,0xd7,0x01,0x05,0x02,0x00 -# GFX12: v_sub_nc_i16 v5, v255, v255 ; encoding: [0x05,0x00,0x0e,0xd7,0xff,0xff,0x03,0x00] +# W32-REAL16: v_sub_nc_i16 v5.l, v255.l, v255.l ; encoding: [0x05,0x00,0x0e,0xd7,0xff,0xff,0x03,0x00] +# W32-FAKE16: v_sub_nc_i16 v5, v255, v255 ; encoding: [0x05,0x00,0x0e,0xd7,0xff,0xff,0x03,0x00] +# W64-REAL16: v_sub_nc_i16 v5.l, v255.l, v255.l ; encoding: [0x05,0x00,0x0e,0xd7,0xff,0xff,0x03,0x00] +# W64-FAKE16: v_sub_nc_i16 v5, v255, v255 ; encoding: [0x05,0x00,0x0e,0xd7,0xff,0xff,0x03,0x00] 0x05,0x00,0x0e,0xd7,0xff,0xff,0x03,0x00 -# GFX12: v_sub_nc_i16 v5, s1, s2 ; encoding: [0x05,0x00,0x0e,0xd7,0x01,0x04,0x00,0x00] +# W32-REAL16: v_sub_nc_i16 v5.l, s1, s2 ; encoding: [0x05,0x00,0x0e,0xd7,0x01,0x04,0x00,0x00] +# W32-FAKE16: v_sub_nc_i16 v5, s1, s2 ; encoding: [0x05,0x00,0x0e,0xd7,0x01,0x04,0x00,0x00] +# W64-REAL16: v_sub_nc_i16 v5.l, s1, s2 ; encoding: [0x05,0x00,0x0e,0xd7,0x01,0x04,0x00,0x00] +# W64-FAKE16: v_sub_nc_i16 v5, s1, s2 ; encoding: [0x05,0x00,0x0e,0xd7,0x01,0x04,0x00,0x00] 0x05,0x00,0x0e,0xd7,0x01,0x04,0x00,0x00 -# GFX12: v_sub_nc_i16 v5, s105, s105 ; encoding: [0x05,0x00,0x0e,0xd7,0x69,0xd2,0x00,0x00] +# W32-REAL16: v_sub_nc_i16 v5.l, s105, s105 ; encoding: [0x05,0x00,0x0e,0xd7,0x69,0xd2,0x00,0x00] +# W32-FAKE16: v_sub_nc_i16 v5, s105, s105 ; encoding: [0x05,0x00,0x0e,0xd7,0x69,0xd2,0x00,0x00] +# W64-REAL16: v_sub_nc_i16 v5.l, s105, s105 ; encoding: [0x05,0x00,0x0e,0xd7,0x69,0xd2,0x00,0x00] +# W64-FAKE16: v_sub_nc_i16 v5, s105, s105 ; encoding: [0x05,0x00,0x0e,0xd7,0x69,0xd2,0x00,0x00] 0x05,0x00,0x0e,0xd7,0x69,0xd2,0x00,0x00 -# GFX12: v_sub_nc_i16 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x0e,0xd7,0x6a,0xf6,0x00,0x00] +# W32-REAL16: v_sub_nc_i16 v5.l, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x0e,0xd7,0x6a,0xf6,0x00,0x00] +# W32-FAKE16: v_sub_nc_i16 v5, vcc_lo, ttmp15 ; encoding: 
[0x05,0x00,0x0e,0xd7,0x6a,0xf6,0x00,0x00] +# W64-REAL16: v_sub_nc_i16 v5.l, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x0e,0xd7,0x6a,0xf6,0x00,0x00] +# W64-FAKE16: v_sub_nc_i16 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x0e,0xd7,0x6a,0xf6,0x00,0x00] 0x05,0x00,0x0e,0xd7,0x6a,0xf6,0x00,0x00 -# GFX12: v_sub_nc_i16 v5, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x0e,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] +# W32-REAL16: v_sub_nc_i16 v5.l, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x0e,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] +# W32-FAKE16: v_sub_nc_i16 v5, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x0e,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] +# W64-REAL16: v_sub_nc_i16 v5.l, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x0e,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] +# W64-FAKE16: v_sub_nc_i16 v5, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x0e,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] 0x05,0x00,0x0e,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00 -# GFX12: v_sub_nc_i16 v5, ttmp15, src_scc ; encoding: [0x05,0x00,0x0e,0xd7,0x7b,0xfa,0x01,0x00] +# W32-REAL16: v_sub_nc_i16 v5.l, ttmp15, src_scc ; encoding: [0x05,0x00,0x0e,0xd7,0x7b,0xfa,0x01,0x00] +# W32-FAKE16: v_sub_nc_i16 v5, ttmp15, src_scc ; encoding: [0x05,0x00,0x0e,0xd7,0x7b,0xfa,0x01,0x00] +# W64-REAL16: v_sub_nc_i16 v5.l, ttmp15, src_scc ; encoding: [0x05,0x00,0x0e,0xd7,0x7b,0xfa,0x01,0x00] +# W64-FAKE16: v_sub_nc_i16 v5, ttmp15, src_scc ; encoding: [0x05,0x00,0x0e,0xd7,0x7b,0xfa,0x01,0x00] 0x05,0x00,0x0e,0xd7,0x7b,0xfa,0x01,0x00 -# GFX12: v_sub_nc_i16 v5, m0, 0x3800 +# W32-REAL16: v_sub_nc_i16 v5.l, m0, 0x3800 ; encoding: [0x05,0x00,0x0e,0xd7,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00] +# W32-FAKE16: v_sub_nc_i16 v5, m0, 0x3800 ; encoding: [0x05,0x00,0x0e,0xd7,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00] +# W64-REAL16: v_sub_nc_i16 v5.l, m0, 0x3800 ; encoding: [0x05,0x00,0x0e,0xd7,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00] +# W64-FAKE16: v_sub_nc_i16 v5, m0, 0x3800 ; encoding: [0x05,0x00,0x0e,0xd7,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00] 0x05,0x00,0x0e,0xd7,0x7d,0xe0,0x01,0x00 -# GFX12: v_sub_nc_i16 v5, exec_lo, -1 ; encoding: [0x05,0x00,0x0e,0xd7,0x7e,0x82,0x01,0x00] +# W32-REAL16: v_sub_nc_i16 v5.l, exec_lo, -1 ; encoding: [0x05,0x00,0x0e,0xd7,0x7e,0x82,0x01,0x00] +# W32-FAKE16: v_sub_nc_i16 v5, exec_lo, -1 ; encoding: [0x05,0x00,0x0e,0xd7,0x7e,0x82,0x01,0x00] +# W64-REAL16: v_sub_nc_i16 v5.l, exec_lo, -1 ; encoding: [0x05,0x00,0x0e,0xd7,0x7e,0x82,0x01,0x00] +# W64-FAKE16: v_sub_nc_i16 v5, exec_lo, -1 ; encoding: [0x05,0x00,0x0e,0xd7,0x7e,0x82,0x01,0x00] 0x05,0x00,0x0e,0xd7,0x7e,0x82,0x01,0x00 -# GFX12: v_sub_nc_i16 v5, exec_hi, null ; encoding: [0x05,0x00,0x0e,0xd7,0x7f,0xf8,0x00,0x00] +# W32-REAL16: v_sub_nc_i16 v5.l, exec_hi, null ; encoding: [0x05,0x00,0x0e,0xd7,0x7f,0xf8,0x00,0x00] +# W32-FAKE16: v_sub_nc_i16 v5, exec_hi, null ; encoding: [0x05,0x00,0x0e,0xd7,0x7f,0xf8,0x00,0x00] +# W64-REAL16: v_sub_nc_i16 v5.l, exec_hi, null ; encoding: [0x05,0x00,0x0e,0xd7,0x7f,0xf8,0x00,0x00] +# W64-FAKE16: v_sub_nc_i16 v5, exec_hi, null ; encoding: [0x05,0x00,0x0e,0xd7,0x7f,0xf8,0x00,0x00] 0x05,0x00,0x0e,0xd7,0x7f,0xf8,0x00,0x00 -# GFX12: v_sub_nc_i16 v5, null, exec_lo ; encoding: [0x05,0x00,0x0e,0xd7,0x7c,0xfc,0x00,0x00] +# W32-REAL16: v_sub_nc_i16 v5.l, null, exec_lo ; encoding: [0x05,0x00,0x0e,0xd7,0x7c,0xfc,0x00,0x00] +# W32-FAKE16: v_sub_nc_i16 v5, null, exec_lo ; encoding: [0x05,0x00,0x0e,0xd7,0x7c,0xfc,0x00,0x00] +# W64-REAL16: v_sub_nc_i16 v5.l, null, exec_lo ; encoding: [0x05,0x00,0x0e,0xd7,0x7c,0xfc,0x00,0x00] +# W64-FAKE16: v_sub_nc_i16 v5, null, 
exec_lo ; encoding: [0x05,0x00,0x0e,0xd7,0x7c,0xfc,0x00,0x00] 0x05,0x00,0x0e,0xd7,0x7c,0xfc,0x00,0x00 -# GFX12: v_sub_nc_i16 v5, -1, exec_hi op_sel:[1,1,1] ; encoding: [0x05,0x58,0x0e,0xd7,0xc1,0xfe,0x00,0x00] +# W32-REAL16: v_sub_nc_i16 v5.h, -1, exec_hi op_sel:[1,1,1] ; encoding: [0x05,0x58,0x0e,0xd7,0xc1,0xfe,0x00,0x00] +# W32-FAKE16: v_sub_nc_i16 v5, -1, exec_hi op_sel:[1,1,1] ; encoding: [0x05,0x58,0x0e,0xd7,0xc1,0xfe,0x00,0x00] +# W64-REAL16: v_sub_nc_i16 v5.h, -1, exec_hi op_sel:[1,1,1] ; encoding: [0x05,0x58,0x0e,0xd7,0xc1,0xfe,0x00,0x00] +# W64-FAKE16: v_sub_nc_i16 v5, -1, exec_hi op_sel:[1,1,1] ; encoding: [0x05,0x58,0x0e,0xd7,0xc1,0xfe,0x00,0x00] 0x05,0x58,0x0e,0xd7,0xc1,0xfe,0x00,0x00 -# GFX12: v_sub_nc_i16 v5, 0x3800, m0 op_sel:[1,0,0] +# W32-REAL16: v_sub_nc_i16 v5.l, 0x3800, m0 op_sel:[1,0,0] ; encoding: [0x05,0x08,0x0e,0xd7,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00] +# W32-FAKE16: v_sub_nc_i16 v5, 0x3800, m0 op_sel:[1,0,0] ; encoding: [0x05,0x08,0x0e,0xd7,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00] +# W64-REAL16: v_sub_nc_i16 v5.l, 0x3800, m0 op_sel:[1,0,0] ; encoding: [0x05,0x08,0x0e,0xd7,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00] +# W64-FAKE16: v_sub_nc_i16 v5, 0x3800, m0 op_sel:[1,0,0] ; encoding: [0x05,0x08,0x0e,0xd7,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00] 0x05,0x08,0x0e,0xd7,0xf0,0xfa,0x00,0x00 -# GFX12: v_sub_nc_i16 v5, src_scc, vcc_lo op_sel:[0,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xfd,0xd4,0x00,0x00] +# W32-REAL16: v_sub_nc_i16 v5.l, src_scc, vcc_lo op_sel:[0,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xfd,0xd4,0x00,0x00] +# W32-FAKE16: v_sub_nc_i16 v5, src_scc, vcc_lo op_sel:[0,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xfd,0xd4,0x00,0x00] +# W64-REAL16: v_sub_nc_i16 v5.l, src_scc, vcc_lo op_sel:[0,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xfd,0xd4,0x00,0x00] +# W64-FAKE16: v_sub_nc_i16 v5, src_scc, vcc_lo op_sel:[0,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xfd,0xd4,0x00,0x00] 0x05,0x10,0x0e,0xd7,0xfd,0xd4,0x00,0x00 -# GFX12: v_sub_nc_i16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x0e,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +# W32-REAL16: v_sub_nc_i16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x0e,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +# W32-FAKE16: v_sub_nc_i16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x0e,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +# W64-REAL16: v_sub_nc_i16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x0e,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +# W64-FAKE16: v_sub_nc_i16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x0e,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +0xff,0xc0,0x0e,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00 + +# W32-REAL16: v_sub_nc_i16 v5.l, v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x05,0x08,0x0e,0xd7,0x01,0x05,0x02,0x00] +# W32-FAKE16: v_sub_nc_i16 v5, v1, v2 op_sel:[1,0,0] ; encoding: [0x05,0x08,0x0e,0xd7,0x01,0x05,0x02,0x00] +# W64-REAL16: v_sub_nc_i16 v5.l, v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x05,0x08,0x0e,0xd7,0x01,0x05,0x02,0x00] +# W64-FAKE16: v_sub_nc_i16 v5, v1, v2 op_sel:[1,0,0] ; encoding: [0x05,0x08,0x0e,0xd7,0x01,0x05,0x02,0x00] +0x05,0x08,0x0e,0xd7,0x01,0x05,0x02,0x00 + +# W32-REAL16: v_sub_nc_i16 v5.l, v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xff,0xff,0x03,0x00] +# W32-FAKE16: v_sub_nc_i16 v5, v255, v255 op_sel:[0,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xff,0xff,0x03,0x00] +# W64-REAL16: v_sub_nc_i16 v5.l, v255.l, v255.h op_sel:[0,1,0] ; encoding: 
[0x05,0x10,0x0e,0xd7,0xff,0xff,0x03,0x00] +# W64-FAKE16: v_sub_nc_i16 v5, v255, v255 op_sel:[0,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xff,0xff,0x03,0x00] +0x05,0x10,0x0e,0xd7,0xff,0xff,0x03,0x00 + +# W32-REAL16: v_sub_nc_i16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x0e,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +# W32-FAKE16: v_sub_nc_i16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x0e,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +# W64-REAL16: v_sub_nc_i16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x0e,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +# W64-FAKE16: v_sub_nc_i16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x0e,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] 0xff,0xc0,0x0e,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00 # GFX12: v_sub_nc_i32 v5, v1, v2 ; encoding: [0x05,0x00,0x25,0xd7,0x01,0x05,0x02,0x00] @@ -5887,49 +6076,112 @@ # GFX12: v_sub_nc_i32 v255, 0xaf123456, vcc_hi clamp ; encoding: [0xff,0x80,0x25,0xd7,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf] 0xff,0x80,0x25,0xd7,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf -# GFX12: v_sub_nc_u16 v5, v1, v2 ; encoding: [0x05,0x00,0x04,0xd7,0x01,0x05,0x02,0x00] +# W32-REAL16: v_sub_nc_u16 v5.l, v1.l, v2.l ; encoding: [0x05,0x00,0x04,0xd7,0x01,0x05,0x02,0x00] +# W32-FAKE16: v_sub_nc_u16 v5, v1, v2 ; encoding: [0x05,0x00,0x04,0xd7,0x01,0x05,0x02,0x00] +# W64-REAL16: v_sub_nc_u16 v5.l, v1.l, v2.l ; encoding: [0x05,0x00,0x04,0xd7,0x01,0x05,0x02,0x00] +# W64-FAKE16: v_sub_nc_u16 v5, v1, v2 ; encoding: [0x05,0x00,0x04,0xd7,0x01,0x05,0x02,0x00] 0x05,0x00,0x04,0xd7,0x01,0x05,0x02,0x00 -# GFX12: v_sub_nc_u16 v5, v255, v255 ; encoding: [0x05,0x00,0x04,0xd7,0xff,0xff,0x03,0x00] +# W32-REAL16: v_sub_nc_u16 v5.l, v255.l, v255.l ; encoding: [0x05,0x00,0x04,0xd7,0xff,0xff,0x03,0x00] +# W32-FAKE16: v_sub_nc_u16 v5, v255, v255 ; encoding: [0x05,0x00,0x04,0xd7,0xff,0xff,0x03,0x00] +# W64-REAL16: v_sub_nc_u16 v5.l, v255.l, v255.l ; encoding: [0x05,0x00,0x04,0xd7,0xff,0xff,0x03,0x00] +# W64-FAKE16: v_sub_nc_u16 v5, v255, v255 ; encoding: [0x05,0x00,0x04,0xd7,0xff,0xff,0x03,0x00] 0x05,0x00,0x04,0xd7,0xff,0xff,0x03,0x00 -# GFX12: v_sub_nc_u16 v5, s1, s2 ; encoding: [0x05,0x00,0x04,0xd7,0x01,0x04,0x00,0x00] +# W32-REAL16: v_sub_nc_u16 v5.l, s1, s2 ; encoding: [0x05,0x00,0x04,0xd7,0x01,0x04,0x00,0x00] +# W32-FAKE16: v_sub_nc_u16 v5, s1, s2 ; encoding: [0x05,0x00,0x04,0xd7,0x01,0x04,0x00,0x00] +# W64-REAL16: v_sub_nc_u16 v5.l, s1, s2 ; encoding: [0x05,0x00,0x04,0xd7,0x01,0x04,0x00,0x00] +# W64-FAKE16: v_sub_nc_u16 v5, s1, s2 ; encoding: [0x05,0x00,0x04,0xd7,0x01,0x04,0x00,0x00] 0x05,0x00,0x04,0xd7,0x01,0x04,0x00,0x00 -# GFX12: v_sub_nc_u16 v5, s105, s105 ; encoding: [0x05,0x00,0x04,0xd7,0x69,0xd2,0x00,0x00] +# W32-REAL16: v_sub_nc_u16 v5.l, s105, s105 ; encoding: [0x05,0x00,0x04,0xd7,0x69,0xd2,0x00,0x00] +# W32-FAKE16: v_sub_nc_u16 v5, s105, s105 ; encoding: [0x05,0x00,0x04,0xd7,0x69,0xd2,0x00,0x00] +# W64-REAL16: v_sub_nc_u16 v5.l, s105, s105 ; encoding: [0x05,0x00,0x04,0xd7,0x69,0xd2,0x00,0x00] +# W64-FAKE16: v_sub_nc_u16 v5, s105, s105 ; encoding: [0x05,0x00,0x04,0xd7,0x69,0xd2,0x00,0x00] 0x05,0x00,0x04,0xd7,0x69,0xd2,0x00,0x00 -# GFX12: v_sub_nc_u16 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x04,0xd7,0x6a,0xf6,0x00,0x00] +# W32-REAL16: v_sub_nc_u16 v5.l, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x04,0xd7,0x6a,0xf6,0x00,0x00] +# W32-FAKE16: v_sub_nc_u16 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x04,0xd7,0x6a,0xf6,0x00,0x00] +# W64-REAL16: v_sub_nc_u16 v5.l, vcc_lo, 
ttmp15 ; encoding: [0x05,0x00,0x04,0xd7,0x6a,0xf6,0x00,0x00] +# W64-FAKE16: v_sub_nc_u16 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x04,0xd7,0x6a,0xf6,0x00,0x00] 0x05,0x00,0x04,0xd7,0x6a,0xf6,0x00,0x00 -# GFX12: v_sub_nc_u16 v5, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x04,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] +# W32-REAL16: v_sub_nc_u16 v5.l, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x04,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] +# W32-FAKE16: v_sub_nc_u16 v5, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x04,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] +# W64-REAL16: v_sub_nc_u16 v5.l, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x04,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] +# W64-FAKE16: v_sub_nc_u16 v5, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x04,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] 0x05,0x00,0x04,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00 -# GFX12: v_sub_nc_u16 v5, ttmp15, src_scc ; encoding: [0x05,0x00,0x04,0xd7,0x7b,0xfa,0x01,0x00] +# W32-REAL16: v_sub_nc_u16 v5.l, ttmp15, src_scc ; encoding: [0x05,0x00,0x04,0xd7,0x7b,0xfa,0x01,0x00] +# W32-FAKE16: v_sub_nc_u16 v5, ttmp15, src_scc ; encoding: [0x05,0x00,0x04,0xd7,0x7b,0xfa,0x01,0x00] +# W64-REAL16: v_sub_nc_u16 v5.l, ttmp15, src_scc ; encoding: [0x05,0x00,0x04,0xd7,0x7b,0xfa,0x01,0x00] +# W64-FAKE16: v_sub_nc_u16 v5, ttmp15, src_scc ; encoding: [0x05,0x00,0x04,0xd7,0x7b,0xfa,0x01,0x00] 0x05,0x00,0x04,0xd7,0x7b,0xfa,0x01,0x00 -# GFX12: v_sub_nc_u16 v5, m0, 0x3800 +# W32-REAL16: v_sub_nc_u16 v5.l, m0, 0x3800 ; encoding: [0x05,0x00,0x04,0xd7,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00] +# W32-FAKE16: v_sub_nc_u16 v5, m0, 0x3800 ; encoding: [0x05,0x00,0x04,0xd7,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00] +# W64-REAL16: v_sub_nc_u16 v5.l, m0, 0x3800 ; encoding: [0x05,0x00,0x04,0xd7,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00] +# W64-FAKE16: v_sub_nc_u16 v5, m0, 0x3800 ; encoding: [0x05,0x00,0x04,0xd7,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00] 0x05,0x00,0x04,0xd7,0x7d,0xe0,0x01,0x00 -# GFX12: v_sub_nc_u16 v5, exec_lo, -1 ; encoding: [0x05,0x00,0x04,0xd7,0x7e,0x82,0x01,0x00] +# W32-REAL16: v_sub_nc_u16 v5.l, exec_lo, -1 ; encoding: [0x05,0x00,0x04,0xd7,0x7e,0x82,0x01,0x00] +# W32-FAKE16: v_sub_nc_u16 v5, exec_lo, -1 ; encoding: [0x05,0x00,0x04,0xd7,0x7e,0x82,0x01,0x00] +# W64-REAL16: v_sub_nc_u16 v5.l, exec_lo, -1 ; encoding: [0x05,0x00,0x04,0xd7,0x7e,0x82,0x01,0x00] +# W64-FAKE16: v_sub_nc_u16 v5, exec_lo, -1 ; encoding: [0x05,0x00,0x04,0xd7,0x7e,0x82,0x01,0x00] 0x05,0x00,0x04,0xd7,0x7e,0x82,0x01,0x00 -# GFX12: v_sub_nc_u16 v5, exec_hi, null ; encoding: [0x05,0x00,0x04,0xd7,0x7f,0xf8,0x00,0x00] +# W32-REAL16: v_sub_nc_u16 v5.l, exec_hi, null ; encoding: [0x05,0x00,0x04,0xd7,0x7f,0xf8,0x00,0x00] +# W32-FAKE16: v_sub_nc_u16 v5, exec_hi, null ; encoding: [0x05,0x00,0x04,0xd7,0x7f,0xf8,0x00,0x00] +# W64-REAL16: v_sub_nc_u16 v5.l, exec_hi, null ; encoding: [0x05,0x00,0x04,0xd7,0x7f,0xf8,0x00,0x00] +# W64-FAKE16: v_sub_nc_u16 v5, exec_hi, null ; encoding: [0x05,0x00,0x04,0xd7,0x7f,0xf8,0x00,0x00] 0x05,0x00,0x04,0xd7,0x7f,0xf8,0x00,0x00 -# GFX12: v_sub_nc_u16 v5, null, exec_lo ; encoding: [0x05,0x00,0x04,0xd7,0x7c,0xfc,0x00,0x00] +# W32-REAL16: v_sub_nc_u16 v5.l, null, exec_lo ; encoding: [0x05,0x00,0x04,0xd7,0x7c,0xfc,0x00,0x00] +# W32-FAKE16: v_sub_nc_u16 v5, null, exec_lo ; encoding: [0x05,0x00,0x04,0xd7,0x7c,0xfc,0x00,0x00] +# W64-REAL16: v_sub_nc_u16 v5.l, null, exec_lo ; encoding: [0x05,0x00,0x04,0xd7,0x7c,0xfc,0x00,0x00] +# W64-FAKE16: v_sub_nc_u16 v5, null, exec_lo ; encoding: [0x05,0x00,0x04,0xd7,0x7c,0xfc,0x00,0x00] 
0x05,0x00,0x04,0xd7,0x7c,0xfc,0x00,0x00 -# GFX12: v_sub_nc_u16 v5, -1, exec_hi op_sel:[1,1,1] ; encoding: [0x05,0x58,0x04,0xd7,0xc1,0xfe,0x00,0x00] +# W32-REAL16: v_sub_nc_u16 v5.h, -1, exec_hi op_sel:[1,1,1] ; encoding: [0x05,0x58,0x04,0xd7,0xc1,0xfe,0x00,0x00] +# W32-FAKE16: v_sub_nc_u16 v5, -1, exec_hi op_sel:[1,1,1] ; encoding: [0x05,0x58,0x04,0xd7,0xc1,0xfe,0x00,0x00] +# W64-REAL16: v_sub_nc_u16 v5.h, -1, exec_hi op_sel:[1,1,1] ; encoding: [0x05,0x58,0x04,0xd7,0xc1,0xfe,0x00,0x00] +# W64-FAKE16: v_sub_nc_u16 v5, -1, exec_hi op_sel:[1,1,1] ; encoding: [0x05,0x58,0x04,0xd7,0xc1,0xfe,0x00,0x00] 0x05,0x58,0x04,0xd7,0xc1,0xfe,0x00,0x00 -# GFX12: v_sub_nc_u16 v5, 0x3800, m0 op_sel:[1,0,0] +# W32-REAL16: v_sub_nc_u16 v5.l, 0x3800, m0 op_sel:[1,0,0] ; encoding: [0x05,0x08,0x04,0xd7,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00] +# W32-FAKE16: v_sub_nc_u16 v5, 0x3800, m0 op_sel:[1,0,0] ; encoding: [0x05,0x08,0x04,0xd7,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00] +# W64-REAL16: v_sub_nc_u16 v5.l, 0x3800, m0 op_sel:[1,0,0] ; encoding: [0x05,0x08,0x04,0xd7,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00] +# W64-FAKE16: v_sub_nc_u16 v5, 0x3800, m0 op_sel:[1,0,0] ; encoding: [0x05,0x08,0x04,0xd7,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00] 0x05,0x08,0x04,0xd7,0xf0,0xfa,0x00,0x00 -# GFX12: v_sub_nc_u16 v5, src_scc, vcc_lo op_sel:[0,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xfd,0xd4,0x00,0x00] +# W32-REAL16: v_sub_nc_u16 v5.l, src_scc, vcc_lo op_sel:[0,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xfd,0xd4,0x00,0x00] +# W32-FAKE16: v_sub_nc_u16 v5, src_scc, vcc_lo op_sel:[0,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xfd,0xd4,0x00,0x00] +# W64-REAL16: v_sub_nc_u16 v5.l, src_scc, vcc_lo op_sel:[0,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xfd,0xd4,0x00,0x00] +# W64-FAKE16: v_sub_nc_u16 v5, src_scc, vcc_lo op_sel:[0,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xfd,0xd4,0x00,0x00] 0x05,0x10,0x04,0xd7,0xfd,0xd4,0x00,0x00 -# GFX12: v_sub_nc_u16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x04,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +# W32-REAL16: v_sub_nc_u16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x04,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +# W32-FAKE16: v_sub_nc_u16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x04,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +# W64-REAL16: v_sub_nc_u16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x04,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +# W64-FAKE16: v_sub_nc_u16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x04,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +0xff,0xc0,0x04,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00 + +# W32-REAL16: v_sub_nc_u16 v5.l, v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x05,0x08,0x04,0xd7,0x01,0x05,0x02,0x00] +# W32-FAKE16: v_sub_nc_u16 v5, v1, v2 op_sel:[1,0,0] ; encoding: [0x05,0x08,0x04,0xd7,0x01,0x05,0x02,0x00] +# W64-REAL16: v_sub_nc_u16 v5.l, v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x05,0x08,0x04,0xd7,0x01,0x05,0x02,0x00] +# W64-FAKE16: v_sub_nc_u16 v5, v1, v2 op_sel:[1,0,0] ; encoding: [0x05,0x08,0x04,0xd7,0x01,0x05,0x02,0x00] +0x05,0x08,0x04,0xd7,0x01,0x05,0x02,0x00 + +# W32-REAL16: v_sub_nc_u16 v5.l, v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xff,0xff,0x03,0x00] +# W32-FAKE16: v_sub_nc_u16 v5, v255, v255 op_sel:[0,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xff,0xff,0x03,0x00] +# W64-REAL16: v_sub_nc_u16 v5.l, v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xff,0xff,0x03,0x00] +# W64-FAKE16: v_sub_nc_u16 v5, v255, v255 
op_sel:[0,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xff,0xff,0x03,0x00] +0x05,0x10,0x04,0xd7,0xff,0xff,0x03,0x00 + +# W32-REAL16: v_sub_nc_u16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x04,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +# W32-FAKE16: v_sub_nc_u16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x04,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +# W64-REAL16: v_sub_nc_u16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x04,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +# W64-FAKE16: v_sub_nc_u16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x04,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] 0xff,0xc0,0x04,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00 # W32: v_subrev_co_u32 v5, s12, v1, v2 ; encoding: [0x05,0x0c,0x02,0xd7,0x01,0x05,0x02,0x00] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp16.txt index 65cfdd5ef7de0343d77b9926cf85ba940910a8c7..5081b9811e43efcf6e58f87d01d65f6f17d168f0 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp16.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp16.txt @@ -4115,88 +4115,268 @@ # W64-FAKE16: v_xor_b16_e64_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0x64,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0xff,0x00,0x64,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 -# GFX12: v_add_nc_i16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] 0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff -# GFX12: v_add_nc_i16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] 0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff -# GFX12: v_add_nc_i16_e64_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] 0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff -# GFX12: v_add_nc_i16_e64_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] 0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff -# GFX12: v_add_nc_i16_e64_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] 0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff -# GFX12: v_add_nc_i16_e64_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] 0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff -# GFX12: v_add_nc_i16_e64_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# W32-FAKE16: 
v_add_nc_i16_e64_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] 0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff -# GFX12: v_add_nc_i16_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] 0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff -# GFX12: v_add_nc_i16_e64_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] 0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff -# GFX12: v_add_nc_i16_e64_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] 0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff -# GFX12: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# W32-REAL16: v_add_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf 
bank_mask:0xf ; encoding: [0x05,0x58,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# W64-REAL16: v_add_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] 0x05,0x58,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff -# GFX12: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] 0x05,0x08,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 -# GFX12: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] 0x05,0x10,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 -# GFX12: v_add_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# W32-REAL16: v_add_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# W32-FAKE16: v_add_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# W64-REAL16: v_add_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# W64-FAKE16: v_add_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0xff,0xc0,0x0d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 -# GFX12: v_add_nc_u16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] 
row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# W32-REAL16: v_add_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# W64-REAL16: v_add_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +0x05,0x58,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff + +# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +0x05,0x08,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 + +# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +0x05,0x10,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 + +# W32-REAL16: v_add_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# W32-FAKE16: v_add_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# W64-REAL16: v_add_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# W64-FAKE16: v_add_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +0xff,0xc0,0x0d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 + +# W32-REAL16: v_add_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, 
v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# W64-REAL16: v_add_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +0x05,0x58,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff + +# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +0x05,0x08,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 + +# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +0x05,0x10,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 + +# W32-REAL16: v_add_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# W32-FAKE16: v_add_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# W64-REAL16: v_add_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# W64-FAKE16: v_add_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +0xff,0xc0,0x0d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 + +# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# W64-FAKE16: v_add_nc_u16_e64_dpp 
v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] 0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff -# GFX12: v_add_nc_u16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] 0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff -# GFX12: v_add_nc_u16_e64_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] 0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff -# GFX12: v_add_nc_u16_e64_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] 0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff -# GFX12: v_add_nc_u16_e64_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 row_shl:1 
row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] 0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff -# GFX12: v_add_nc_u16_e64_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] 0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff -# GFX12: v_add_nc_u16_e64_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] 0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff -# GFX12: v_add_nc_u16_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] 0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff -# GFX12: v_add_nc_u16_e64_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] 
0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff -# GFX12: v_add_nc_u16_e64_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] 0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff -# GFX12: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# W32-REAL16: v_add_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# W64-REAL16: v_add_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +0x05,0x58,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff + +# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +0x05,0x08,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 + +# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +0x05,0x10,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 + +# W32-REAL16: v_add_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 
fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# W32-FAKE16: v_add_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# W64-REAL16: v_add_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# W64-FAKE16: v_add_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +0xff,0xc0,0x03,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 + +# W32-REAL16: v_add_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# W64-REAL16: v_add_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +0x05,0x58,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff + +# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +0x05,0x08,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 + +# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +0x05,0x10,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 + +# W32-REAL16: v_add_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# W32-FAKE16: v_add_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# W64-REAL16: v_add_nc_u16_e64_dpp 
v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# W64-FAKE16: v_add_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +0xff,0xc0,0x03,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 + +# W32-REAL16: v_add_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# W64-REAL16: v_add_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] 0x05,0x58,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff -# GFX12: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] 0x05,0x08,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 -# GFX12: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] 0x05,0x10,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 -# GFX12: v_add_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# W32-REAL16: v_add_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# W32-FAKE16: v_add_nc_u16_e64_dpp 
v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# W64-REAL16: v_add_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# W64-FAKE16: v_add_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0xff,0xc0,0x03,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 # GFX12: v_cvt_pk_norm_i16_f16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x12,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] @@ -5000,88 +5180,268 @@ # GFX12: v_pack_b32_f16_e64_dpp v255, -|v255|, -|v255| op_sel:[0,1,0] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x13,0x11,0xd7,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] 0xff,0x13,0x11,0xd7,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30 -# GFX12: v_sub_nc_i16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] 0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff -# GFX12: v_sub_nc_i16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] 0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff -# GFX12: v_sub_nc_i16_e64_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] 0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff -# GFX12: v_sub_nc_i16_e64_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] 0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff -# GFX12: v_sub_nc_i16_e64_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] 0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff -# GFX12: v_sub_nc_i16_e64_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] 0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff -# GFX12: v_sub_nc_i16_e64_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# W64-FAKE16: 
v_sub_nc_i16_e64_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] 0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff -# GFX12: v_sub_nc_i16_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] 0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff -# GFX12: v_sub_nc_i16_e64_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] 0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff -# GFX12: v_sub_nc_i16_e64_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] 0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff -# GFX12: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# W32-REAL16: v_sub_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# W64-REAL16: v_sub_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] 
row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] 0x05,0x58,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff -# GFX12: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] 0x05,0x08,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 -# GFX12: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] 0x05,0x10,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 -# GFX12: v_sub_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# W32-REAL16: v_sub_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# W32-FAKE16: v_sub_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# W64-REAL16: v_sub_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# W64-FAKE16: v_sub_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0xff,0xc0,0x0e,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 -# GFX12: v_sub_nc_u16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# W32-REAL16: v_sub_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 
op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# W64-REAL16: v_sub_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +0x05,0x58,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff + +# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +0x05,0x08,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 + +# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +0x05,0x10,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 + +# W32-REAL16: v_sub_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# W32-FAKE16: v_sub_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# W64-REAL16: v_sub_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# W64-FAKE16: v_sub_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +0xff,0xc0,0x0e,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 + +# W32-REAL16: v_sub_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# W64-REAL16: v_sub_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# W64-FAKE16: 
v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +0x05,0x58,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff + +# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +0x05,0x08,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 + +# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +0x05,0x10,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 + +# W32-REAL16: v_sub_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# W32-FAKE16: v_sub_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# W64-REAL16: v_sub_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# W64-FAKE16: v_sub_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +0xff,0xc0,0x0e,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 + +# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] 0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff -# GFX12: v_sub_nc_u16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] 0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff -# GFX12: v_sub_nc_u16_e64_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] 0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff -# GFX12: v_sub_nc_u16_e64_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] 0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff -# GFX12: v_sub_nc_u16_e64_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] 0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff -# GFX12: v_sub_nc_u16_e64_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] 0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff -# GFX12: v_sub_nc_u16_e64_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] 0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff -# GFX12: v_sub_nc_u16_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] 0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff -# GFX12: v_sub_nc_u16_e64_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] 0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff -# GFX12: v_sub_nc_u16_e64_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l 
row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] 0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff -# GFX12: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# W32-REAL16: v_sub_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# W64-REAL16: v_sub_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +0x05,0x58,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff + +# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +0x05,0x08,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 + +# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +0x05,0x10,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 + +# W32-REAL16: v_sub_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# W32-FAKE16: v_sub_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: 
[0xff,0xc0,0x04,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# W64-REAL16: v_sub_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# W64-FAKE16: v_sub_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +0xff,0xc0,0x04,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 + +# W32-REAL16: v_sub_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# W64-REAL16: v_sub_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +0x05,0x58,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff + +# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +0x05,0x08,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 + +# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +0x05,0x10,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 + +# W32-REAL16: v_sub_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# W32-FAKE16: v_sub_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# W64-REAL16: v_sub_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# W64-FAKE16: v_sub_nc_u16_e64_dpp v255, v255, v255 
op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +0xff,0xc0,0x04,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 + +# W32-REAL16: v_sub_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# W64-REAL16: v_sub_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] 0x05,0x58,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff -# GFX12: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] 0x05,0x08,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 -# GFX12: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] 0x05,0x10,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 -# GFX12: v_sub_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# W32-REAL16: v_sub_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# W32-FAKE16: v_sub_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# W64-REAL16: v_sub_nc_u16_e64_dpp v255.h, v255.l, v255.l 
op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# W64-FAKE16: v_sub_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0xff,0xc0,0x04,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 # GFX12: v_dot2_f16_f16_e64_dpp v0, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 fi:1 ; encoding: [0x00,0x00,0x66,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x04,0x00] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp8.txt index 4640b967cbc07bf7db8b159e2215a73f40d0bb5a..77f05027d1cfd2be1964cc5bdf4f4379e3d56ab0 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp8.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp8.txt @@ -2393,34 +2393,160 @@ # W64-FAKE16: v_xor_b16_e64_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0x64,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0xff,0x00,0x64,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 -# GFX12: v_add_nc_i16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x05,0x00,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W32-REAL16: v_add_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-REAL16: v_add_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x05,0x58,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] 
dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x05,0x08,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x05,0x10,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_add_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# W32-REAL16: v_add_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# W32-FAKE16: v_add_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# W64-REAL16: v_add_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# W64-FAKE16: v_add_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0xff,0xc0,0x0d,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 -# GFX12: v_add_nc_u16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W32-REAL16: v_add_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-REAL16: v_add_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +0x05,0x58,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 + +# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +0x05,0x08,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 + +# W32-REAL16: 
v_add_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +0x05,0x10,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 + +# W32-REAL16: v_add_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# W32-FAKE16: v_add_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# W64-REAL16: v_add_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# W64-FAKE16: v_add_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +0xff,0xc0,0x0d,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 + +# W32-REAL16: v_add_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-REAL16: v_add_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +0x05,0x58,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 + +# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +0x05,0x08,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 + +# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 
+0x05,0x10,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 + +# W32-REAL16: v_add_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# W32-FAKE16: v_add_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# W64-REAL16: v_add_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# W64-FAKE16: v_add_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +0xff,0xc0,0x0d,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 + +# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x05,0x00,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W32-REAL16: v_add_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-REAL16: v_add_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +0x05,0x58,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 + +# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +0x05,0x08,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 + +# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 
+# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +0x05,0x10,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 + +# W32-REAL16: v_add_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# W32-FAKE16: v_add_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# W64-REAL16: v_add_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# W64-FAKE16: v_add_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +0xff,0xc0,0x03,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 + +# W32-REAL16: v_add_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-REAL16: v_add_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +0x05,0x58,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 + +# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +0x05,0x08,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 + +# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +0x05,0x10,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 + +# W32-REAL16: v_add_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# W32-FAKE16: v_add_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# W64-REAL16: v_add_nc_u16_e64_dpp v255.h, v255.l, v255.l 
op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# W64-FAKE16: v_add_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +0xff,0xc0,0x03,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 + +# W32-REAL16: v_add_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-REAL16: v_add_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x05,0x58,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x05,0x08,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x05,0x10,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_add_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# W32-REAL16: v_add_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# W32-FAKE16: v_add_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# W64-REAL16: v_add_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# W64-FAKE16: v_add_nc_u16_e64_dpp v255, v255, 
v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0xff,0xc0,0x03,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 # GFX12: v_cvt_pk_norm_i16_f16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x12,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] @@ -3113,34 +3239,160 @@ # GFX12: v_pack_b32_f16_e64_dpp v255, -|v255|, -|v255| op_sel:[0,1,0] dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x13,0x11,0xd7,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] 0xff,0x13,0x11,0xd7,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 -# GFX12: v_sub_nc_i16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x05,0x00,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W32-REAL16: v_sub_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-REAL16: v_sub_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x05,0x58,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x05,0x08,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x05,0x10,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x05,0x10,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_sub_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# W32-REAL16: v_sub_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# W32-FAKE16: v_sub_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# W64-REAL16: v_sub_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# W64-FAKE16: v_sub_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0xff,0xc0,0x0e,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 -# GFX12: v_sub_nc_u16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W32-REAL16: v_sub_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-REAL16: v_sub_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +0x05,0x58,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 + +# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +0x05,0x08,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 + +# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x05,0x10,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +0x05,0x10,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 + +# W32-REAL16: v_sub_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# W32-FAKE16: v_sub_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# W64-REAL16: v_sub_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# W64-FAKE16: v_sub_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +0xff,0xc0,0x0e,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 + +# W32-REAL16: v_sub_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-REAL16: v_sub_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +0x05,0x58,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 + +# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +0x05,0x08,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 + +# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +0x05,0x10,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 + +# W32-REAL16: v_sub_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# W32-FAKE16: v_sub_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# W64-REAL16: v_sub_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: 
[0xff,0xc0,0x0e,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# W64-FAKE16: v_sub_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +0xff,0xc0,0x0e,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 + +# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x05,0x00,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W32-REAL16: v_sub_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-REAL16: v_sub_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +0x05,0x58,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 + +# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +0x05,0x08,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 + +# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +0x05,0x10,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 + +# W32-REAL16: v_sub_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# W32-FAKE16: v_sub_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: 
[0xff,0xc0,0x04,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# W64-REAL16: v_sub_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# W64-FAKE16: v_sub_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +0xff,0xc0,0x04,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 + +# W32-REAL16: v_sub_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-REAL16: v_sub_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +0x05,0x58,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 + +# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +0x05,0x08,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 + +# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +0x05,0x10,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 + +# W32-REAL16: v_sub_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# W32-FAKE16: v_sub_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# W64-REAL16: v_sub_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# W64-FAKE16: v_sub_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +0xff,0xc0,0x04,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 + +# W32-REAL16: v_sub_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x05,0x58,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-REAL16: v_sub_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x05,0x58,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x05,0x08,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x05,0x10,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_sub_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# W32-REAL16: v_sub_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# W32-FAKE16: v_sub_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# W64-REAL16: v_sub_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# W64-FAKE16: v_sub_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0xff,0xc0,0x04,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 # GFX12: v_dot2_f16_f16_e64_dpp v0, v1, v2, v3 dpp8:[0,1,2,3,4,4,4,4] ; encoding: [0x00,0x00,0x66,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x88,0x46,0x92] diff --git a/llvm/test/MC/Disassembler/ARM/vscclrm.txt b/llvm/test/MC/Disassembler/ARM/vscclrm.txt index 
8a89cfb76e4a4576d87612a5b72e8ba7e645c2a6..ef3868eb1569febbe724f482cc20193c0216ce86 100644 --- a/llvm/test/MC/Disassembler/ARM/vscclrm.txt +++ b/llvm/test/MC/Disassembler/ARM/vscclrm.txt @@ -1,5 +1,7 @@ -# RUN: llvm-mc -disassemble -triple=thumbv8.1m.main-none-eabi -mattr=+8msecext -show-encoding %s 2>&1 | FileCheck %s -# RUN: llvm-mc -disassemble -triple=thumbv8.1m.main-none-eabi -mattr=+mve.fp,+8msecext -show-encoding %s 2>&1 | FileCheck %s +# RUN: llvm-mc -disassemble -triple=thumbv8.1m.main-none-eabi -mattr=+8msecext -show-encoding %s 2> %t | FileCheck %s +# RUN: FileCheck --check-prefix=WARN < %t %s +# RUN: llvm-mc -disassemble -triple=thumbv8.1m.main-none-eabi -mattr=+mve.fp,+8msecext -show-encoding %s 2> %t | FileCheck %s +# RUN: FileCheck --check-prefix=WARN < %t %s [0x9f 0xec 0x04 0x0a] # CHECK: vscclrm {s0, s1, s2, s3, vpr} @@ -27,3 +29,50 @@ [0xdf 0xec 0x1d 0x1a] # CHECK: vscclrmhi {s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15, s16, s17, s18, s19, s20, s21, s22, s23, s24, s25, s26, s27, s28, s29, s30, s31, vpr} + +[0xdf,0xec,0x03,0xfa] +# CHECK: vscclrm {s31, d16, vpr} @ encoding: [0xdf,0xec,0x03,0xfa] + +[0x9f,0xec,0x40,0x0a] +# CHECK: vscclrm {s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15, s16, s17, s18, s19, s20, s21, s22, s23, s24, s25, s26, s27, s28, s29, s30, s31, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31, vpr} @ encoding: [0x9f,0xec,0x40,0x0a] + +# If the list size is zero then we get a list of only vpr, and the Vd register +# doesn't matter. + +[0x9f,0xec,0x00,0x0b] +# CHECK: vscclrm {vpr} @ encoding: [0x9f,0xec,0x00,0x0b] + +[0xdf,0xec,0x00,0xfb] +# CHECK: vscclrm {vpr} @ encoding: [0x9f,0xec,0x00,0x0b] + +[0x9f,0xec,0x00,0x0a] +# CHECK: vscclrm {vpr} @ encoding: [0x9f,0xec,0x00,0x0a] + +[0xdf,0xec,0x00,0xfa] +# CHECK: vscclrm {vpr} @ encoding: [0x9f,0xec,0x00,0x0a] + +# In double-precision if Vd+size goes past 31 the excess registers are ignored +# and we get a warning. + +[0x9f,0xec,0xfe,0x0b] +# WARN: [[@LINE-1]]:2: warning: potentially undefined instruction encoding +# CHECK: vscclrm {d0, d1, d2, d3, d4, d5, d6, d7, d8, d9, d10, d11, d12, d13, d14, d15, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31, vpr} @ encoding: [0x9f,0xec,0x40,0x0b] + +[0xdf,0xec,0x04,0xfb] +# WARN: [[@LINE-1]]:2: warning: potentially undefined instruction encoding +# CHECK: vscclrm {d31, vpr} @ encoding: [0xdf,0xec,0x02,0xfb] + +# In single-precision if Vd+size goes past 63, or if the encoding suggests half +# a d register, then the excess registers are ignored and we get a warning.
+ +[0x9f,0xec,0xff,0x0a] +# WARN: [[@LINE-1]]:2: warning: potentially undefined instruction encoding +# CHECK: vscclrm {s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15, s16, s17, s18, s19, s20, s21, s22, s23, s24, s25, s26, s27, s28, s29, s30, s31, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31, vpr} @ encoding: [0x9f,0xec,0x40,0x0a] + +[0xdf,0xec,0x02,0xfa] +# WARN: [[@LINE-1]]:2: warning: potentially undefined instruction encoding +# CHECK: vscclrm {s31, vpr} @ encoding: [0xdf,0xec,0x01,0xfa] + +[0xdf,0xec,0x23,0xfa] +# WARN: [[@LINE-1]]:2: warning: potentially undefined instruction encoding +# CHECK: vscclrm {s31, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31, vpr} @ encoding: [0xdf,0xec,0x21,0xfa] diff --git a/llvm/test/MC/RISCV/rv32c-valid.s b/llvm/test/MC/RISCV/rv32c-valid.s index bcdf27a2ba783be65adfd41774d8b88f3ee76e7f..9b0ca80a7adc20b4ee76bc8f9988ded9c6a84135 100644 --- a/llvm/test/MC/RISCV/rv32c-valid.s +++ b/llvm/test/MC/RISCV/rv32c-valid.s @@ -147,8 +147,7 @@ c.sub a4, a5 # CHECK-ASM: encoding: [0x01,0x00] # CHECK-NO-EXT: error: instruction requires the following: 'C' (Compressed Instructions) or 'Zca' (part of the C extension, excluding compressed floating point loads/stores){{$}} c.nop -# CHECK-ASM: c.addi zero, 0 -# CHECK-OBJ: c.nop +# CHECK-ASM-AND-OBJ: c.nop # CHECK-ASM: encoding: [0x01,0x00] # CHECK-NO-EXT: error: instruction requires the following: 'C' (Compressed Instructions) or 'Zca' (part of the C extension, excluding compressed floating point loads/stores){{$}} c.addi x0, 0 diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomicrmw-flat-noalias-addrspace.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomicrmw-flat-noalias-addrspace.ll new file mode 100644 index 0000000000000000000000000000000000000000..3de502874d3237d691886c21c77b89fc77ee3cad --- /dev/null +++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomicrmw-flat-noalias-addrspace.ll @@ -0,0 +1,1523 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -passes=atomic-expand %s | FileCheck -check-prefixes=ALL,GFX7 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -passes=atomic-expand %s | FileCheck -check-prefixes=ALL,GFX900 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -passes=atomic-expand %s | FileCheck -check-prefixes=ALL,GFX908 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -passes=atomic-expand %s | FileCheck -check-prefixes=ALL,GFX90A %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -passes=atomic-expand %s | FileCheck -check-prefixes=ALL,GFX940 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -passes=atomic-expand %s | FileCheck -check-prefixes=ALL,GFX12 %s + +; -------------------------------------------------------------------- +; Idempotent expansion cases without noalias.addrspace +; -------------------------------------------------------------------- + +define i64 @test_flat_atomicrmw_add_0_i64_agent(ptr %ptr) { +; ALL-LABEL: define i64 @test_flat_atomicrmw_add_0_i64_agent( +; ALL-SAME: ptr [[PTR:%.*]]) #[[ATTR0:[0-9]+]] { +; ALL-NEXT: [[RES:%.*]] = atomicrmw add ptr [[PTR]], i64 0 syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]] +; ALL-NEXT: ret i64 [[RES]] +; + %res = atomicrmw add ptr %ptr,
i64 0 syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret i64 %res +} + +define i64 @test_flat_atomicrmw_sub_0_i64_agent(ptr %ptr) { +; ALL-LABEL: define i64 @test_flat_atomicrmw_sub_0_i64_agent( +; ALL-SAME: ptr [[PTR:%.*]]) #[[ATTR0]] { +; ALL-NEXT: [[RES:%.*]] = atomicrmw sub ptr [[PTR]], i64 0 syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]] +; ALL-NEXT: ret i64 [[RES]] +; + %res = atomicrmw sub ptr %ptr, i64 0 syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret i64 %res +} + +define i64 @test_flat_atomicrmw_or_0_i64_agent(ptr %ptr) { +; ALL-LABEL: define i64 @test_flat_atomicrmw_or_0_i64_agent( +; ALL-SAME: ptr [[PTR:%.*]]) #[[ATTR0]] { +; ALL-NEXT: [[RES:%.*]] = atomicrmw or ptr [[PTR]], i64 0 syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]] +; ALL-NEXT: ret i64 [[RES]] +; + %res = atomicrmw or ptr %ptr, i64 0 syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret i64 %res +} + +define i64 @test_flat_atomicrmw_xor_0_i64_agent(ptr %ptr) { +; ALL-LABEL: define i64 @test_flat_atomicrmw_xor_0_i64_agent( +; ALL-SAME: ptr [[PTR:%.*]]) #[[ATTR0]] { +; ALL-NEXT: [[RES:%.*]] = atomicrmw xor ptr [[PTR]], i64 0 syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]] +; ALL-NEXT: ret i64 [[RES]] +; + %res = atomicrmw xor ptr %ptr, i64 0 syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret i64 %res +} + +; -------------------------------------------------------------------- +; Idempotent expansion cases with noalias.addrspace +; -------------------------------------------------------------------- + +define i64 @test_flat_atomicrmw_add_0_i64_agent__noalias_addrspace_5(ptr %ptr) { +; ALL-LABEL: define i64 @test_flat_atomicrmw_add_0_i64_agent__noalias_addrspace_5( +; ALL-SAME: ptr [[PTR:%.*]]) #[[ATTR0]] { +; ALL-NEXT: [[RES:%.*]] = atomicrmw add ptr [[PTR]], i64 0 syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META1:![0-9]+]], !amdgpu.no.fine.grained.memory [[META0]] +; ALL-NEXT: ret i64 [[RES]] +; + %res = atomicrmw add ptr %ptr, i64 0 syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 + ret i64 %res +} + +define i64 @test_flat_atomicrmw_sub_0_i64_agent__noalias_addrspace_5(ptr %ptr) { +; ALL-LABEL: define i64 @test_flat_atomicrmw_sub_0_i64_agent__noalias_addrspace_5( +; ALL-SAME: ptr [[PTR:%.*]]) #[[ATTR0]] { +; ALL-NEXT: [[RES:%.*]] = atomicrmw sub ptr [[PTR]], i64 0 syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]] +; ALL-NEXT: ret i64 [[RES]] +; + %res = atomicrmw sub ptr %ptr, i64 0 syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 + ret i64 %res +} + +define i64 @test_flat_atomicrmw_or_0_i64_agent__noalias_addrspace_5(ptr %ptr) { +; ALL-LABEL: define i64 @test_flat_atomicrmw_or_0_i64_agent__noalias_addrspace_5( +; ALL-SAME: ptr [[PTR:%.*]]) #[[ATTR0]] { +; ALL-NEXT: [[RES:%.*]] = atomicrmw or ptr [[PTR]], i64 0 syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]] +; ALL-NEXT: ret i64 [[RES]] +; + %res = atomicrmw or ptr %ptr, i64 0 syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 + ret i64 %res +} + +define i64 @test_flat_atomicrmw_xor_0_i64_agent__noalias_addrspace_5(ptr %ptr) { +; ALL-LABEL: define i64 @test_flat_atomicrmw_xor_0_i64_agent__noalias_addrspace_5( +; ALL-SAME: ptr [[PTR:%.*]]) #[[ATTR0]] { +; ALL-NEXT: [[RES:%.*]] 
= atomicrmw xor ptr [[PTR]], i64 0 syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]] +; ALL-NEXT: ret i64 [[RES]] +; + %res = atomicrmw xor ptr %ptr, i64 0 syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 + ret i64 %res +} + +; -------------------------------------------------------------------- +; General expansion for add +; -------------------------------------------------------------------- + +define i64 @test_flat_atomicrmw_add_i64_agent(ptr %ptr, i64 %value) { +; ALL-LABEL: define i64 @test_flat_atomicrmw_add_i64_agent( +; ALL-SAME: ptr [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; ALL-NEXT: [[RES:%.*]] = atomicrmw add ptr [[PTR]], i64 [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]] +; ALL-NEXT: ret i64 [[RES]] +; + %res = atomicrmw add ptr %ptr, i64 %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret i64 %res +} + +define i64 @test_flat_atomicrmw_add_i64_agent__noalias_addrspace_5(ptr %ptr, i64 %value) { +; ALL-LABEL: define i64 @test_flat_atomicrmw_add_i64_agent__noalias_addrspace_5( +; ALL-SAME: ptr [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; ALL-NEXT: [[RES:%.*]] = atomicrmw add ptr [[PTR]], i64 [[VALUE]] syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]] +; ALL-NEXT: ret i64 [[RES]] +; + %res = atomicrmw add ptr %ptr, i64 %value syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 + ret i64 %res +} + +define i32 @test_flat_atomicrmw_add_i32_agent__noalias_addrspace_5(ptr %ptr, i32 %value) { +; ALL-LABEL: define i32 @test_flat_atomicrmw_add_i32_agent__noalias_addrspace_5( +; ALL-SAME: ptr [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; ALL-NEXT: [[RES:%.*]] = atomicrmw add ptr [[PTR]], i32 [[VALUE]] syncscope("agent") seq_cst, align 4, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]] +; ALL-NEXT: ret i32 [[RES]] +; + %res = atomicrmw add ptr %ptr, i32 %value syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 + ret i32 %res +} + +; -------------------------------------------------------------------- +; General expansion for xchg +; -------------------------------------------------------------------- + +define i64 @test_flat_atomicrmw_xchg_i64_agent(ptr %ptr, i64 %value) { +; ALL-LABEL: define i64 @test_flat_atomicrmw_xchg_i64_agent( +; ALL-SAME: ptr [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; ALL-NEXT: [[RES:%.*]] = atomicrmw xchg ptr [[PTR]], i64 [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]] +; ALL-NEXT: ret i64 [[RES]] +; + %res = atomicrmw xchg ptr %ptr, i64 %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret i64 %res +} + +define i64 @test_flat_atomicrmw_xchg_i64_agent__noalias_xchgrspace_5(ptr %ptr, i64 %value) { +; ALL-LABEL: define i64 @test_flat_atomicrmw_xchg_i64_agent__noalias_xchgrspace_5( +; ALL-SAME: ptr [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; ALL-NEXT: [[RES:%.*]] = atomicrmw xchg ptr [[PTR]], i64 [[VALUE]] syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]] +; ALL-NEXT: ret i64 [[RES]] +; + %res = atomicrmw xchg ptr %ptr, i64 %value syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 + ret i64 %res +} + +define i32 @test_flat_atomicrmw_xchg_i32_agent__noalias_xchgrspace_5(ptr %ptr, i32 %value) { +; 
ALL-LABEL: define i32 @test_flat_atomicrmw_xchg_i32_agent__noalias_xchgrspace_5( +; ALL-SAME: ptr [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; ALL-NEXT: [[RES:%.*]] = atomicrmw xchg ptr [[PTR]], i32 [[VALUE]] syncscope("agent") seq_cst, align 4, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]] +; ALL-NEXT: ret i32 [[RES]] +; + %res = atomicrmw xchg ptr %ptr, i32 %value syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 + ret i32 %res +} + +; -------------------------------------------------------------------- +; General expansion for xchg (pointer type) +; -------------------------------------------------------------------- + +define ptr addrspace(1) @test_flat_atomicrmw_xchg_p1_agent(ptr %ptr, ptr addrspace(1) %value) { +; ALL-LABEL: define ptr addrspace(1) @test_flat_atomicrmw_xchg_p1_agent( +; ALL-SAME: ptr [[PTR:%.*]], ptr addrspace(1) [[VALUE:%.*]]) #[[ATTR0]] { +; ALL-NEXT: [[RES:%.*]] = atomicrmw xchg ptr [[PTR]], ptr addrspace(1) [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]] +; ALL-NEXT: ret ptr addrspace(1) [[RES]] +; + %res = atomicrmw xchg ptr %ptr, ptr addrspace(1) %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret ptr addrspace(1) %res +} + +define ptr addrspace(1) @test_flat_atomicrmw_xchg_p1_agent__noalias_xchgrspace_5(ptr %ptr, ptr addrspace(1) %value) { +; ALL-LABEL: define ptr addrspace(1) @test_flat_atomicrmw_xchg_p1_agent__noalias_xchgrspace_5( +; ALL-SAME: ptr [[PTR:%.*]], ptr addrspace(1) [[VALUE:%.*]]) #[[ATTR0]] { +; ALL-NEXT: [[RES:%.*]] = atomicrmw xchg ptr [[PTR]], ptr addrspace(1) [[VALUE]] syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]] +; ALL-NEXT: ret ptr addrspace(1) [[RES]] +; + %res = atomicrmw xchg ptr %ptr, ptr addrspace(1) %value syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 + ret ptr addrspace(1) %res +} + +define ptr addrspace(3) @test_flat_atomicrmw_xchg_p3_agent__noalias_xchgrspace_5(ptr %ptr, ptr addrspace(3) %value) { +; ALL-LABEL: define ptr addrspace(3) @test_flat_atomicrmw_xchg_p3_agent__noalias_xchgrspace_5( +; ALL-SAME: ptr [[PTR:%.*]], ptr addrspace(3) [[VALUE:%.*]]) #[[ATTR0]] { +; ALL-NEXT: [[RES:%.*]] = atomicrmw xchg ptr [[PTR]], ptr addrspace(3) [[VALUE]] syncscope("agent") seq_cst, align 4, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]] +; ALL-NEXT: ret ptr addrspace(3) [[RES]] +; + %res = atomicrmw xchg ptr %ptr, ptr addrspace(3) %value syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 + ret ptr addrspace(3) %res +} + +; -------------------------------------------------------------------- +; General expansion for and +; -------------------------------------------------------------------- + +define i64 @test_flat_atomicrmw_and_i64_agent(ptr %ptr, i64 %value) { +; ALL-LABEL: define i64 @test_flat_atomicrmw_and_i64_agent( +; ALL-SAME: ptr [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; ALL-NEXT: [[RES:%.*]] = atomicrmw and ptr [[PTR]], i64 [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]] +; ALL-NEXT: ret i64 [[RES]] +; + %res = atomicrmw and ptr %ptr, i64 %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret i64 %res +} + +define i64 @test_flat_atomicrmw_and_i64_agent__noalias_addrspace_5(ptr %ptr, i64 %value) { +; ALL-LABEL: define i64 @test_flat_atomicrmw_and_i64_agent__noalias_addrspace_5( +; 
ALL-SAME: ptr [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; ALL-NEXT: [[RES:%.*]] = atomicrmw and ptr [[PTR]], i64 [[VALUE]] syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]] +; ALL-NEXT: ret i64 [[RES]] +; + %res = atomicrmw and ptr %ptr, i64 %value syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 + ret i64 %res +} + +define i64 @test_flat_atomicrmw_and_i64_agent__noalias_addrspace_5__maybe_fine_grained(ptr %ptr, i64 %value) { +; ALL-LABEL: define i64 @test_flat_atomicrmw_and_i64_agent__noalias_addrspace_5__maybe_fine_grained( +; ALL-SAME: ptr [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; ALL-NEXT: [[RES:%.*]] = atomicrmw and ptr [[PTR]], i64 [[VALUE]] syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META1]] +; ALL-NEXT: ret i64 [[RES]] +; + %res = atomicrmw and ptr %ptr, i64 %value syncscope("agent") seq_cst, !noalias.addrspace !1 + ret i64 %res +} + + +define i32 @test_flat_atomicrmw_and_i32_agent__noalias_addrspace_5(ptr %ptr, i32 %value) { +; ALL-LABEL: define i32 @test_flat_atomicrmw_and_i32_agent__noalias_addrspace_5( +; ALL-SAME: ptr [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; ALL-NEXT: [[RES:%.*]] = atomicrmw and ptr [[PTR]], i32 [[VALUE]] syncscope("agent") seq_cst, align 4, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]] +; ALL-NEXT: ret i32 [[RES]] +; + %res = atomicrmw and ptr %ptr, i32 %value syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 + ret i32 %res +} + +; -------------------------------------------------------------------- +; General expansion for fadd +; -------------------------------------------------------------------- + +define double @test_flat_atomicrmw_fadd_f64_agent(ptr %ptr, double %value) { +; GFX7-LABEL: define double @test_flat_atomicrmw_fadd_f64_agent( +; GFX7-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX7-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR]], align 8 +; GFX7-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX7: [[ATOMICRMW_START]]: +; GFX7-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], %[[ATOMICRMW_START]] ] +; GFX7-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] +; GFX7-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX7-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX7-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX7-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX7-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX7-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX7-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX7: [[ATOMICRMW_END]]: +; GFX7-NEXT: ret double [[TMP5]] +; +; GFX900-LABEL: define double @test_flat_atomicrmw_fadd_f64_agent( +; GFX900-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX900-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR]], align 8 +; GFX900-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX900: [[ATOMICRMW_START]]: +; GFX900-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], %[[ATOMICRMW_START]] ] +; GFX900-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] +; GFX900-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX900-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX900-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst 
seq_cst, align 8 +; GFX900-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX900-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX900-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX900-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX900: [[ATOMICRMW_END]]: +; GFX900-NEXT: ret double [[TMP5]] +; +; GFX908-LABEL: define double @test_flat_atomicrmw_fadd_f64_agent( +; GFX908-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR]], align 8 +; GFX908-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX908: [[ATOMICRMW_START]]: +; GFX908-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], %[[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[RES]] = bitcast i64 [[NEWLOADED]] to double +; GFX908-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX908: [[ATOMICRMW_END]]: +; GFX908-NEXT: ret double [[RES]] +; +; GFX90A-LABEL: define double @test_flat_atomicrmw_fadd_f64_agent( +; GFX90A-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]] +; GFX90A-NEXT: ret double [[RES]] +; +; GFX940-LABEL: define double @test_flat_atomicrmw_fadd_f64_agent( +; GFX940-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]] +; GFX940-NEXT: ret double [[RES]] +; +; GFX12-LABEL: define double @test_flat_atomicrmw_fadd_f64_agent( +; GFX12-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR]], align 8 +; GFX12-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX12: [[ATOMICRMW_START]]: +; GFX12-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], %[[ATOMICRMW_START]] ] +; GFX12-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] +; GFX12-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX12-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX12-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX12: [[ATOMICRMW_END]]: +; GFX12-NEXT: ret double [[TMP5]] +; + %res = atomicrmw fadd ptr %ptr, double %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret double %res +} + +define double @test_flat_atomicrmw_fadd_f64_agent__noalias_addrspace_5(ptr %ptr, double %value) { +; GFX7-LABEL: define double @test_flat_atomicrmw_fadd_f64_agent__noalias_addrspace_5( +; GFX7-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; 
GFX7-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR]], align 8 +; GFX7-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX7: [[ATOMICRMW_START]]: +; GFX7-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], %[[ATOMICRMW_START]] ] +; GFX7-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] +; GFX7-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX7-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX7-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX7-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX7-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX7-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX7-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX7: [[ATOMICRMW_END]]: +; GFX7-NEXT: ret double [[TMP5]] +; +; GFX900-LABEL: define double @test_flat_atomicrmw_fadd_f64_agent__noalias_addrspace_5( +; GFX900-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX900-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR]], align 8 +; GFX900-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX900: [[ATOMICRMW_START]]: +; GFX900-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], %[[ATOMICRMW_START]] ] +; GFX900-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] +; GFX900-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX900-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX900-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX900-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX900-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX900-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX900-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX900: [[ATOMICRMW_END]]: +; GFX900-NEXT: ret double [[TMP5]] +; +; GFX908-LABEL: define double @test_flat_atomicrmw_fadd_f64_agent__noalias_addrspace_5( +; GFX908-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR]], align 8 +; GFX908-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX908: [[ATOMICRMW_START]]: +; GFX908-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], %[[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[RES]] = bitcast i64 [[NEWLOADED]] to double +; GFX908-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX908: [[ATOMICRMW_END]]: +; GFX908-NEXT: ret double [[RES]] +; +; GFX90A-LABEL: define double @test_flat_atomicrmw_fadd_f64_agent__noalias_addrspace_5( +; GFX90A-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]] +; GFX90A-NEXT: ret double [[RES]] +; +; GFX940-LABEL: define double @test_flat_atomicrmw_fadd_f64_agent__noalias_addrspace_5( 
+; GFX940-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]] +; GFX940-NEXT: ret double [[RES]] +; +; GFX12-LABEL: define double @test_flat_atomicrmw_fadd_f64_agent__noalias_addrspace_5( +; GFX12-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR]], align 8 +; GFX12-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX12: [[ATOMICRMW_START]]: +; GFX12-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], %[[ATOMICRMW_START]] ] +; GFX12-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] +; GFX12-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX12-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX12-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX12: [[ATOMICRMW_END]]: +; GFX12-NEXT: ret double [[TMP5]] +; + %res = atomicrmw fadd ptr %ptr, double %value syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 + ret double %res +} + +define double @test_flat_atomicrmw_fadd_f64_agent__noalias_addrspace_5__maybe_fine_grained(ptr %ptr, double %value) { +; GFX7-LABEL: define double @test_flat_atomicrmw_fadd_f64_agent__noalias_addrspace_5__maybe_fine_grained( +; GFX7-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX7-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR]], align 8 +; GFX7-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX7: [[ATOMICRMW_START]]: +; GFX7-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], %[[ATOMICRMW_START]] ] +; GFX7-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] +; GFX7-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX7-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX7-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX7-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX7-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX7-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX7-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX7: [[ATOMICRMW_END]]: +; GFX7-NEXT: ret double [[TMP5]] +; +; GFX900-LABEL: define double @test_flat_atomicrmw_fadd_f64_agent__noalias_addrspace_5__maybe_fine_grained( +; GFX900-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX900-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR]], align 8 +; GFX900-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX900: [[ATOMICRMW_START]]: +; GFX900-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], %[[ATOMICRMW_START]] ] +; GFX900-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] +; GFX900-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX900-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX900-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX900-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX900-NEXT: 
[[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX900-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX900-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX900: [[ATOMICRMW_END]]: +; GFX900-NEXT: ret double [[TMP5]] +; +; GFX908-LABEL: define double @test_flat_atomicrmw_fadd_f64_agent__noalias_addrspace_5__maybe_fine_grained( +; GFX908-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR]], align 8 +; GFX908-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX908: [[ATOMICRMW_START]]: +; GFX908-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], %[[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX908-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX908: [[ATOMICRMW_END]]: +; GFX908-NEXT: ret double [[TMP5]] +; +; GFX90A-LABEL: define double @test_flat_atomicrmw_fadd_f64_agent__noalias_addrspace_5__maybe_fine_grained( +; GFX90A-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR]], align 8 +; GFX90A-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX90A: [[ATOMICRMW_START]]: +; GFX90A-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], %[[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX90A-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX90A: [[ATOMICRMW_END]]: +; GFX90A-NEXT: ret double [[TMP5]] +; +; GFX940-LABEL: define double @test_flat_atomicrmw_fadd_f64_agent__noalias_addrspace_5__maybe_fine_grained( +; GFX940-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META1]] +; GFX940-NEXT: ret double [[RES]] +; +; GFX12-LABEL: define double @test_flat_atomicrmw_fadd_f64_agent__noalias_addrspace_5__maybe_fine_grained( +; GFX12-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR]], align 8 +; GFX12-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX12: [[ATOMICRMW_START]]: +; GFX12-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], %[[ATOMICRMW_START]] ] +; GFX12-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] +; GFX12-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] 
syncscope("agent") seq_cst seq_cst, align 8 +; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX12-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX12-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX12: [[ATOMICRMW_END]]: +; GFX12-NEXT: ret double [[TMP5]] +; + %res = atomicrmw fadd ptr %ptr, double %value syncscope("agent") seq_cst, !noalias.addrspace !1 + ret double %res +} + +define float @test_flat_atomicrmw_fadd_f32_agent__noalias_addrspace_5(ptr %ptr, float %value) { +; GFX7-LABEL: define float @test_flat_atomicrmw_fadd_f32_agent__noalias_addrspace_5( +; GFX7-SAME: ptr [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX7-NEXT: [[TMP1:%.*]] = load float, ptr [[PTR]], align 4 +; GFX7-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX7: [[ATOMICRMW_START]]: +; GFX7-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], %[[ATOMICRMW_START]] ] +; GFX7-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX7-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX7-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX7-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX7-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX7-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX7-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX7-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX7: [[ATOMICRMW_END]]: +; GFX7-NEXT: ret float [[TMP5]] +; +; GFX900-LABEL: define float @test_flat_atomicrmw_fadd_f32_agent__noalias_addrspace_5( +; GFX900-SAME: ptr [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX900-NEXT: [[TMP1:%.*]] = load float, ptr [[PTR]], align 4 +; GFX900-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX900: [[ATOMICRMW_START]]: +; GFX900-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], %[[ATOMICRMW_START]] ] +; GFX900-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX900-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX900-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX900-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX900-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX900-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX900-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX900-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX900: [[ATOMICRMW_END]]: +; GFX900-NEXT: ret float [[TMP5]] +; +; GFX908-LABEL: define float @test_flat_atomicrmw_fadd_f32_agent__noalias_addrspace_5( +; GFX908-SAME: ptr [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr [[PTR]], align 4 +; GFX908-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX908: [[ATOMICRMW_START]]: +; GFX908-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], %[[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; 
GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX908-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX908: [[ATOMICRMW_END]]: +; GFX908-NEXT: ret float [[TMP5]] +; +; GFX90A-LABEL: define float @test_flat_atomicrmw_fadd_f32_agent__noalias_addrspace_5( +; GFX90A-SAME: ptr [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[IS_SHARED:%.*]] = call i1 @llvm.amdgcn.is.shared(ptr [[PTR]]) +; GFX90A-NEXT: br i1 [[IS_SHARED]], label %[[ATOMICRMW_SHARED:.*]], label %[[ATOMICRMW_CHECK_PRIVATE:.*]] +; GFX90A: [[ATOMICRMW_SHARED]]: +; GFX90A-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(3) +; GFX90A-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr addrspace(3) [[TMP1]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX90A-NEXT: br label %[[ATOMICRMW_PHI:.*]] +; GFX90A: [[ATOMICRMW_CHECK_PRIVATE]]: +; GFX90A-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[PTR]]) +; GFX90A-NEXT: br i1 [[IS_PRIVATE]], label %[[ATOMICRMW_PRIVATE:.*]], label %[[ATOMICRMW_GLOBAL:.*]] +; GFX90A: [[ATOMICRMW_PRIVATE]]: +; GFX90A-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(5) +; GFX90A-NEXT: [[LOADED_PRIVATE:%.*]] = load float, ptr addrspace(5) [[TMP3]], align 4 +; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED_PRIVATE]], [[VALUE]] +; GFX90A-NEXT: store float [[NEW]], ptr addrspace(5) [[TMP3]], align 4 +; GFX90A-NEXT: br label %[[ATOMICRMW_PHI]] +; GFX90A: [[ATOMICRMW_GLOBAL]]: +; GFX90A-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(1) +; GFX90A-NEXT: [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[TMP4]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX90A-NEXT: br label %[[ATOMICRMW_PHI]] +; GFX90A: [[ATOMICRMW_PHI]]: +; GFX90A-NEXT: [[LOADED_PHI:%.*]] = phi float [ [[TMP2]], %[[ATOMICRMW_SHARED]] ], [ [[LOADED_PRIVATE]], %[[ATOMICRMW_PRIVATE]] ], [ [[TMP5]], %[[ATOMICRMW_GLOBAL]] ] +; GFX90A-NEXT: br label %[[ATOMICRMW_END:.*]] +; GFX90A: [[ATOMICRMW_END]]: +; GFX90A-NEXT: ret float [[LOADED_PHI]] +; +; GFX940-LABEL: define float @test_flat_atomicrmw_fadd_f32_agent__noalias_addrspace_5( +; GFX940-SAME: ptr [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX940-NEXT: ret float [[RES]] +; +; GFX12-LABEL: define float @test_flat_atomicrmw_fadd_f32_agent__noalias_addrspace_5( +; GFX12-SAME: ptr [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX12-NEXT: ret float [[RES]] +; + %res = atomicrmw fadd ptr %ptr, float %value syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 + ret float %res +} + +define <2 x half> @test_flat_atomicrmw_fadd_v2f16_agent__noalias_addrspace_5(ptr %ptr, <2 x half> %value) { +; GFX7-LABEL: define <2 x half> 
@test_flat_atomicrmw_fadd_v2f16_agent__noalias_addrspace_5( +; GFX7-SAME: ptr [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX7-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr [[PTR]], align 4 +; GFX7-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX7: [[ATOMICRMW_START]]: +; GFX7-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], %[[ATOMICRMW_START]] ] +; GFX7-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX7-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX7-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX7-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX7-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX7-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX7-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX7-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX7: [[ATOMICRMW_END]]: +; GFX7-NEXT: ret <2 x half> [[TMP5]] +; +; GFX900-LABEL: define <2 x half> @test_flat_atomicrmw_fadd_v2f16_agent__noalias_addrspace_5( +; GFX900-SAME: ptr [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX900-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr [[PTR]], align 4 +; GFX900-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX900: [[ATOMICRMW_START]]: +; GFX900-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], %[[ATOMICRMW_START]] ] +; GFX900-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX900-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX900-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX900-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX900-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX900-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX900-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX900-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX900: [[ATOMICRMW_END]]: +; GFX900-NEXT: ret <2 x half> [[TMP5]] +; +; GFX908-LABEL: define <2 x half> @test_flat_atomicrmw_fadd_v2f16_agent__noalias_addrspace_5( +; GFX908-SAME: ptr [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr [[PTR]], align 4 +; GFX908-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX908: [[ATOMICRMW_START]]: +; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], %[[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX908-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX908: [[ATOMICRMW_END]]: +; GFX908-NEXT: ret <2 x half> [[TMP5]] +; +; GFX90A-LABEL: define <2 x half> @test_flat_atomicrmw_fadd_v2f16_agent__noalias_addrspace_5( +; GFX90A-SAME: ptr [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr 
[[PTR]], align 4 +; GFX90A-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX90A: [[ATOMICRMW_START]]: +; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], %[[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX90A-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX90A: [[ATOMICRMW_END]]: +; GFX90A-NEXT: ret <2 x half> [[TMP5]] +; +; GFX940-LABEL: define <2 x half> @test_flat_atomicrmw_fadd_v2f16_agent__noalias_addrspace_5( +; GFX940-SAME: ptr [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]] +; GFX940-NEXT: ret <2 x half> [[RES]] +; +; GFX12-LABEL: define <2 x half> @test_flat_atomicrmw_fadd_v2f16_agent__noalias_addrspace_5( +; GFX12-SAME: ptr [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]] +; GFX12-NEXT: ret <2 x half> [[RES]] +; + %res = atomicrmw fadd ptr %ptr, <2 x half> %value syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 + ret <2 x half> %res +} + +define <2 x bfloat> @test_flat_atomicrmw_fadd_v2bf16_agent__noalias_addrspace_5(ptr %ptr, <2 x bfloat> %value) { +; GFX7-LABEL: define <2 x bfloat> @test_flat_atomicrmw_fadd_v2bf16_agent__noalias_addrspace_5( +; GFX7-SAME: ptr [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX7-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr [[PTR]], align 4 +; GFX7-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX7: [[ATOMICRMW_START]]: +; GFX7-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], %[[ATOMICRMW_START]] ] +; GFX7-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX7-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX7-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX7-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX7-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX7-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX7-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX7-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX7: [[ATOMICRMW_END]]: +; GFX7-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX900-LABEL: define <2 x bfloat> @test_flat_atomicrmw_fadd_v2bf16_agent__noalias_addrspace_5( +; GFX900-SAME: ptr [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX900-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr [[PTR]], align 4 +; GFX900-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX900: [[ATOMICRMW_START]]: +; GFX900-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], %[[ATOMICRMW_START]] ] +; GFX900-NEXT: 
[[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX900-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX900-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX900-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX900-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX900-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX900-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX900-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX900: [[ATOMICRMW_END]]: +; GFX900-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX908-LABEL: define <2 x bfloat> @test_flat_atomicrmw_fadd_v2bf16_agent__noalias_addrspace_5( +; GFX908-SAME: ptr [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr [[PTR]], align 4 +; GFX908-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX908: [[ATOMICRMW_START]]: +; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], %[[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX908-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX908: [[ATOMICRMW_END]]: +; GFX908-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX90A-LABEL: define <2 x bfloat> @test_flat_atomicrmw_fadd_v2bf16_agent__noalias_addrspace_5( +; GFX90A-SAME: ptr [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr [[PTR]], align 4 +; GFX90A-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX90A: [[ATOMICRMW_START]]: +; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], %[[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX90A-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX90A: [[ATOMICRMW_END]]: +; GFX90A-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX940-LABEL: define <2 x bfloat> @test_flat_atomicrmw_fadd_v2bf16_agent__noalias_addrspace_5( +; GFX940-SAME: ptr [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr [[PTR]], <2 x bfloat> [[VALUE]] syncscope("agent") seq_cst, align 4, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]] +; GFX940-NEXT: ret <2 x bfloat> [[RES]] +; +; GFX12-LABEL: define <2 x bfloat> @test_flat_atomicrmw_fadd_v2bf16_agent__noalias_addrspace_5( +; GFX12-SAME: ptr [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) 
#[[ATTR0]] { +; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr [[PTR]], <2 x bfloat> [[VALUE]] syncscope("agent") seq_cst, align 4, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]] +; GFX12-NEXT: ret <2 x bfloat> [[RES]] +; + %res = atomicrmw fadd ptr %ptr, <2 x bfloat> %value syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 + ret <2 x bfloat> %res +} + +; -------------------------------------------------------------------- +; General expansion for fmin +; -------------------------------------------------------------------- + +define double @test_flat_atomicrmw_fmin_f64_agent(ptr %ptr, double %value) { +; GFX7-LABEL: define double @test_flat_atomicrmw_fmin_f64_agent( +; GFX7-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX7-NEXT: [[RES:%.*]] = atomicrmw fmin ptr [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]] +; GFX7-NEXT: ret double [[RES]] +; +; GFX900-LABEL: define double @test_flat_atomicrmw_fmin_f64_agent( +; GFX900-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX900-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR]], align 8 +; GFX900-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX900: [[ATOMICRMW_START]]: +; GFX900-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], %[[ATOMICRMW_START]] ] +; GFX900-NEXT: [[NEW:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX900-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX900-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX900-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX900-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX900-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX900-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX900-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX900: [[ATOMICRMW_END]]: +; GFX900-NEXT: ret double [[TMP5]] +; +; GFX908-LABEL: define double @test_flat_atomicrmw_fmin_f64_agent( +; GFX908-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR]], align 8 +; GFX908-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX908: [[ATOMICRMW_START]]: +; GFX908-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], %[[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX908-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[RES]] = bitcast i64 [[NEWLOADED]] to double +; GFX908-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX908: [[ATOMICRMW_END]]: +; GFX908-NEXT: ret double [[RES]] +; +; GFX90A-LABEL: define double @test_flat_atomicrmw_fmin_f64_agent( +; GFX90A-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fmin ptr [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]] +; GFX90A-NEXT: ret double [[RES]] +; +; GFX940-LABEL: define double 
@test_flat_atomicrmw_fmin_f64_agent( +; GFX940-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fmin ptr [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]] +; GFX940-NEXT: ret double [[RES]] +; +; GFX12-LABEL: define double @test_flat_atomicrmw_fmin_f64_agent( +; GFX12-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR]], align 8 +; GFX12-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX12: [[ATOMICRMW_START]]: +; GFX12-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], %[[ATOMICRMW_START]] ] +; GFX12-NEXT: [[NEW:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX12-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX12-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX12-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX12: [[ATOMICRMW_END]]: +; GFX12-NEXT: ret double [[TMP5]] +; + %res = atomicrmw fmin ptr %ptr, double %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret double %res +} + +define double @test_flat_atomicrmw_fmin_f64_agent__noalias_addrspace_5(ptr %ptr, double %value) { +; GFX7-LABEL: define double @test_flat_atomicrmw_fmin_f64_agent__noalias_addrspace_5( +; GFX7-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX7-NEXT: [[RES:%.*]] = atomicrmw fmin ptr [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]] +; GFX7-NEXT: ret double [[RES]] +; +; GFX900-LABEL: define double @test_flat_atomicrmw_fmin_f64_agent__noalias_addrspace_5( +; GFX900-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX900-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR]], align 8 +; GFX900-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX900: [[ATOMICRMW_START]]: +; GFX900-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], %[[ATOMICRMW_START]] ] +; GFX900-NEXT: [[NEW:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX900-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX900-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX900-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX900-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX900-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX900-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX900-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX900: [[ATOMICRMW_END]]: +; GFX900-NEXT: ret double [[TMP5]] +; +; GFX908-LABEL: define double @test_flat_atomicrmw_fmin_f64_agent__noalias_addrspace_5( +; GFX908-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR]], align 8 +; GFX908-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX908: [[ATOMICRMW_START]]: +; GFX908-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], %[[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = 
call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX908-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[RES]] = bitcast i64 [[NEWLOADED]] to double +; GFX908-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX908: [[ATOMICRMW_END]]: +; GFX908-NEXT: ret double [[RES]] +; +; GFX90A-LABEL: define double @test_flat_atomicrmw_fmin_f64_agent__noalias_addrspace_5( +; GFX90A-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fmin ptr [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]] +; GFX90A-NEXT: ret double [[RES]] +; +; GFX940-LABEL: define double @test_flat_atomicrmw_fmin_f64_agent__noalias_addrspace_5( +; GFX940-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fmin ptr [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]] +; GFX940-NEXT: ret double [[RES]] +; +; GFX12-LABEL: define double @test_flat_atomicrmw_fmin_f64_agent__noalias_addrspace_5( +; GFX12-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR]], align 8 +; GFX12-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX12: [[ATOMICRMW_START]]: +; GFX12-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], %[[ATOMICRMW_START]] ] +; GFX12-NEXT: [[NEW:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX12-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX12-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX12-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX12: [[ATOMICRMW_END]]: +; GFX12-NEXT: ret double [[TMP5]] +; + %res = atomicrmw fmin ptr %ptr, double %value syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 + ret double %res +} + +define double @test_flat_atomicrmw_fmin_f64_agent__noalias_addrspace_5__maybe_fine_grained(ptr %ptr, double %value) { +; GFX7-LABEL: define double @test_flat_atomicrmw_fmin_f64_agent__noalias_addrspace_5__maybe_fine_grained( +; GFX7-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX7-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR]], align 8 +; GFX7-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX7: [[ATOMICRMW_START]]: +; GFX7-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], %[[ATOMICRMW_START]] ] +; GFX7-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX7-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 +; GFX7-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 +; GFX7-NEXT: [[TMP5:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") 
seq_cst seq_cst, align 8 +; GFX7-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 +; GFX7-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 +; GFX7-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double +; GFX7-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX7: [[ATOMICRMW_END]]: +; GFX7-NEXT: ret double [[TMP6]] +; +; GFX900-LABEL: define double @test_flat_atomicrmw_fmin_f64_agent__noalias_addrspace_5__maybe_fine_grained( +; GFX900-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX900-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR]], align 8 +; GFX900-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX900: [[ATOMICRMW_START]]: +; GFX900-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], %[[ATOMICRMW_START]] ] +; GFX900-NEXT: [[NEW:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX900-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX900-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX900-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX900-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX900-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX900-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX900-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX900: [[ATOMICRMW_END]]: +; GFX900-NEXT: ret double [[TMP5]] +; +; GFX908-LABEL: define double @test_flat_atomicrmw_fmin_f64_agent__noalias_addrspace_5__maybe_fine_grained( +; GFX908-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR]], align 8 +; GFX908-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX908: [[ATOMICRMW_START]]: +; GFX908-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], %[[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX908-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX908-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX908: [[ATOMICRMW_END]]: +; GFX908-NEXT: ret double [[TMP5]] +; +; GFX90A-LABEL: define double @test_flat_atomicrmw_fmin_f64_agent__noalias_addrspace_5__maybe_fine_grained( +; GFX90A-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR]], align 8 +; GFX90A-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX90A: [[ATOMICRMW_START]]: +; GFX90A-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], %[[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = 
extractvalue { i64, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX90A-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX90A: [[ATOMICRMW_END]]: +; GFX90A-NEXT: ret double [[TMP5]] +; +; GFX940-LABEL: define double @test_flat_atomicrmw_fmin_f64_agent__noalias_addrspace_5__maybe_fine_grained( +; GFX940-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fmin ptr [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META1]] +; GFX940-NEXT: ret double [[RES]] +; +; GFX12-LABEL: define double @test_flat_atomicrmw_fmin_f64_agent__noalias_addrspace_5__maybe_fine_grained( +; GFX12-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR]], align 8 +; GFX12-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX12: [[ATOMICRMW_START]]: +; GFX12-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], %[[ATOMICRMW_START]] ] +; GFX12-NEXT: [[NEW:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX12-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX12-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX12-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX12: [[ATOMICRMW_END]]: +; GFX12-NEXT: ret double [[TMP5]] +; + %res = atomicrmw fmin ptr %ptr, double %value syncscope("agent") seq_cst, !noalias.addrspace !1 + ret double %res +} + +define float @test_flat_atomicrmw_fmin_f32_agent__noalias_addrspace_5(ptr %ptr, float %value) { +; GFX7-LABEL: define float @test_flat_atomicrmw_fmin_f32_agent__noalias_addrspace_5( +; GFX7-SAME: ptr [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX7-NEXT: [[RES:%.*]] = atomicrmw fmin ptr [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]] +; GFX7-NEXT: ret float [[RES]] +; +; GFX900-LABEL: define float @test_flat_atomicrmw_fmin_f32_agent__noalias_addrspace_5( +; GFX900-SAME: ptr [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX900-NEXT: [[TMP1:%.*]] = load float, ptr [[PTR]], align 4 +; GFX900-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX900: [[ATOMICRMW_START]]: +; GFX900-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], %[[ATOMICRMW_START]] ] +; GFX900-NEXT: [[NEW:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX900-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX900-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX900-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX900-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX900-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX900-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX900-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX900: [[ATOMICRMW_END]]: +; GFX900-NEXT: ret float [[TMP5]] +; +; GFX908-LABEL: define float @test_flat_atomicrmw_fmin_f32_agent__noalias_addrspace_5( +; GFX908-SAME: ptr 
[[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr [[PTR]], align 4 +; GFX908-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX908: [[ATOMICRMW_START]]: +; GFX908-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], %[[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX908-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX908: [[ATOMICRMW_END]]: +; GFX908-NEXT: ret float [[TMP5]] +; +; GFX90A-LABEL: define float @test_flat_atomicrmw_fmin_f32_agent__noalias_addrspace_5( +; GFX90A-SAME: ptr [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr [[PTR]], align 4 +; GFX90A-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX90A: [[ATOMICRMW_START]]: +; GFX90A-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[LOADED_PHI:%.*]], %[[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX90A-NEXT: [[LOADED_PHI]] = bitcast i32 [[NEWLOADED]] to float +; GFX90A-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX90A: [[ATOMICRMW_END]]: +; GFX90A-NEXT: ret float [[LOADED_PHI]] +; +; GFX940-LABEL: define float @test_flat_atomicrmw_fmin_f32_agent__noalias_addrspace_5( +; GFX940-SAME: ptr [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[TMP1:%.*]] = load float, ptr [[PTR]], align 4 +; GFX940-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX940: [[ATOMICRMW_START]]: +; GFX940-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], %[[ATOMICRMW_START]] ] +; GFX940-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX940-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX940-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float +; GFX940-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX940: [[ATOMICRMW_END]]: +; GFX940-NEXT: ret float [[RES]] +; +; GFX12-LABEL: define float @test_flat_atomicrmw_fmin_f32_agent__noalias_addrspace_5( +; GFX12-SAME: ptr [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[RES:%.*]] = atomicrmw fmin ptr [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !noalias.addrspace [[META1]], 
!amdgpu.no.fine.grained.memory [[META0]] +; GFX12-NEXT: ret float [[RES]] +; + %res = atomicrmw fmin ptr %ptr, float %value syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 + ret float %res +} + +; -------------------------------------------------------------------- +; General expansion for fmax +; -------------------------------------------------------------------- + +define double @test_flat_atomicrmw_fmax_f64_agent(ptr %ptr, double %value) { +; GFX7-LABEL: define double @test_flat_atomicrmw_fmax_f64_agent( +; GFX7-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX7-NEXT: [[RES:%.*]] = atomicrmw fmax ptr [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]] +; GFX7-NEXT: ret double [[RES]] +; +; GFX900-LABEL: define double @test_flat_atomicrmw_fmax_f64_agent( +; GFX900-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX900-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR]], align 8 +; GFX900-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX900: [[ATOMICRMW_START]]: +; GFX900-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], %[[ATOMICRMW_START]] ] +; GFX900-NEXT: [[NEW:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX900-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX900-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX900-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX900-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX900-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX900-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX900-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX900: [[ATOMICRMW_END]]: +; GFX900-NEXT: ret double [[TMP5]] +; +; GFX908-LABEL: define double @test_flat_atomicrmw_fmax_f64_agent( +; GFX908-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR]], align 8 +; GFX908-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX908: [[ATOMICRMW_START]]: +; GFX908-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], %[[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX908-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[RES]] = bitcast i64 [[NEWLOADED]] to double +; GFX908-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX908: [[ATOMICRMW_END]]: +; GFX908-NEXT: ret double [[RES]] +; +; GFX90A-LABEL: define double @test_flat_atomicrmw_fmax_f64_agent( +; GFX90A-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fmax ptr [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]] +; GFX90A-NEXT: ret double [[RES]] +; +; GFX940-LABEL: define double @test_flat_atomicrmw_fmax_f64_agent( +; GFX940-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fmax ptr [[PTR]], double [[VALUE]] 
syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]] +; GFX940-NEXT: ret double [[RES]] +; +; GFX12-LABEL: define double @test_flat_atomicrmw_fmax_f64_agent( +; GFX12-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR]], align 8 +; GFX12-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX12: [[ATOMICRMW_START]]: +; GFX12-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], %[[ATOMICRMW_START]] ] +; GFX12-NEXT: [[NEW:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX12-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX12-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX12-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX12: [[ATOMICRMW_END]]: +; GFX12-NEXT: ret double [[TMP5]] +; + %res = atomicrmw fmax ptr %ptr, double %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret double %res +} + +define double @test_flat_atomicrmw_fmax_f64_agent__noalias_addrspace_5(ptr %ptr, double %value) { +; GFX7-LABEL: define double @test_flat_atomicrmw_fmax_f64_agent__noalias_addrspace_5( +; GFX7-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX7-NEXT: [[RES:%.*]] = atomicrmw fmax ptr [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]] +; GFX7-NEXT: ret double [[RES]] +; +; GFX900-LABEL: define double @test_flat_atomicrmw_fmax_f64_agent__noalias_addrspace_5( +; GFX900-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX900-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR]], align 8 +; GFX900-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX900: [[ATOMICRMW_START]]: +; GFX900-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], %[[ATOMICRMW_START]] ] +; GFX900-NEXT: [[NEW:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX900-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX900-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX900-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX900-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX900-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX900-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX900-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX900: [[ATOMICRMW_END]]: +; GFX900-NEXT: ret double [[TMP5]] +; +; GFX908-LABEL: define double @test_flat_atomicrmw_fmax_f64_agent__noalias_addrspace_5( +; GFX908-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR]], align 8 +; GFX908-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX908: [[ATOMICRMW_START]]: +; GFX908-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], %[[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX908-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast double 
[[LOADED]] to i64 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[RES]] = bitcast i64 [[NEWLOADED]] to double +; GFX908-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX908: [[ATOMICRMW_END]]: +; GFX908-NEXT: ret double [[RES]] +; +; GFX90A-LABEL: define double @test_flat_atomicrmw_fmax_f64_agent__noalias_addrspace_5( +; GFX90A-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fmax ptr [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]] +; GFX90A-NEXT: ret double [[RES]] +; +; GFX940-LABEL: define double @test_flat_atomicrmw_fmax_f64_agent__noalias_addrspace_5( +; GFX940-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fmax ptr [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]] +; GFX940-NEXT: ret double [[RES]] +; +; GFX12-LABEL: define double @test_flat_atomicrmw_fmax_f64_agent__noalias_addrspace_5( +; GFX12-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR]], align 8 +; GFX12-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX12: [[ATOMICRMW_START]]: +; GFX12-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], %[[ATOMICRMW_START]] ] +; GFX12-NEXT: [[NEW:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX12-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX12-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX12-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX12: [[ATOMICRMW_END]]: +; GFX12-NEXT: ret double [[TMP5]] +; + %res = atomicrmw fmax ptr %ptr, double %value syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 + ret double %res +} + +define double @test_flat_atomicrmw_fmax_f64_agent__noalias_addrspace_5__maybe_fine_grained(ptr %ptr, double %value) { +; GFX7-LABEL: define double @test_flat_atomicrmw_fmax_f64_agent__noalias_addrspace_5__maybe_fine_grained( +; GFX7-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX7-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR]], align 8 +; GFX7-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX7: [[ATOMICRMW_START]]: +; GFX7-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], %[[ATOMICRMW_START]] ] +; GFX7-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX7-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 +; GFX7-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 +; GFX7-NEXT: [[TMP5:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX7-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 +; GFX7-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 +; 
GFX7-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double +; GFX7-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX7: [[ATOMICRMW_END]]: +; GFX7-NEXT: ret double [[TMP6]] +; +; GFX900-LABEL: define double @test_flat_atomicrmw_fmax_f64_agent__noalias_addrspace_5__maybe_fine_grained( +; GFX900-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX900-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR]], align 8 +; GFX900-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX900: [[ATOMICRMW_START]]: +; GFX900-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], %[[ATOMICRMW_START]] ] +; GFX900-NEXT: [[NEW:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX900-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX900-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX900-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX900-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX900-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX900-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX900-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX900: [[ATOMICRMW_END]]: +; GFX900-NEXT: ret double [[TMP5]] +; +; GFX908-LABEL: define double @test_flat_atomicrmw_fmax_f64_agent__noalias_addrspace_5__maybe_fine_grained( +; GFX908-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR]], align 8 +; GFX908-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX908: [[ATOMICRMW_START]]: +; GFX908-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], %[[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX908-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX908-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX908: [[ATOMICRMW_END]]: +; GFX908-NEXT: ret double [[TMP5]] +; +; GFX90A-LABEL: define double @test_flat_atomicrmw_fmax_f64_agent__noalias_addrspace_5__maybe_fine_grained( +; GFX90A-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR]], align 8 +; GFX90A-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX90A: [[ATOMICRMW_START]]: +; GFX90A-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], %[[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX90A-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], 
label %[[ATOMICRMW_START]] +; GFX90A: [[ATOMICRMW_END]]: +; GFX90A-NEXT: ret double [[TMP5]] +; +; GFX940-LABEL: define double @test_flat_atomicrmw_fmax_f64_agent__noalias_addrspace_5__maybe_fine_grained( +; GFX940-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fmax ptr [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META1]] +; GFX940-NEXT: ret double [[RES]] +; +; GFX12-LABEL: define double @test_flat_atomicrmw_fmax_f64_agent__noalias_addrspace_5__maybe_fine_grained( +; GFX12-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR]], align 8 +; GFX12-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX12: [[ATOMICRMW_START]]: +; GFX12-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], %[[ATOMICRMW_START]] ] +; GFX12-NEXT: [[NEW:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX12-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX12-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX12-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX12: [[ATOMICRMW_END]]: +; GFX12-NEXT: ret double [[TMP5]] +; + %res = atomicrmw fmax ptr %ptr, double %value syncscope("agent") seq_cst, !noalias.addrspace !1 + ret double %res +} + +define float @test_flat_atomicrmw_fmax_f32_agent__noalias_addrspace_5(ptr %ptr, float %value) { +; GFX7-LABEL: define float @test_flat_atomicrmw_fmax_f32_agent__noalias_addrspace_5( +; GFX7-SAME: ptr [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX7-NEXT: [[RES:%.*]] = atomicrmw fmax ptr [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]] +; GFX7-NEXT: ret float [[RES]] +; +; GFX900-LABEL: define float @test_flat_atomicrmw_fmax_f32_agent__noalias_addrspace_5( +; GFX900-SAME: ptr [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX900-NEXT: [[TMP1:%.*]] = load float, ptr [[PTR]], align 4 +; GFX900-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX900: [[ATOMICRMW_START]]: +; GFX900-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], %[[ATOMICRMW_START]] ] +; GFX900-NEXT: [[NEW:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX900-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX900-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX900-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX900-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX900-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX900-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX900-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX900: [[ATOMICRMW_END]]: +; GFX900-NEXT: ret float [[TMP5]] +; +; GFX908-LABEL: define float @test_flat_atomicrmw_fmax_f32_agent__noalias_addrspace_5( +; GFX908-SAME: ptr [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr [[PTR]], align 4 +; GFX908-NEXT: br label %[[ATOMICRMW_START:.*]] +; 
GFX908: [[ATOMICRMW_START]]: +; GFX908-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], %[[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX908-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX908: [[ATOMICRMW_END]]: +; GFX908-NEXT: ret float [[TMP5]] +; +; GFX90A-LABEL: define float @test_flat_atomicrmw_fmax_f32_agent__noalias_addrspace_5( +; GFX90A-SAME: ptr [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr [[PTR]], align 4 +; GFX90A-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX90A: [[ATOMICRMW_START]]: +; GFX90A-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[LOADED_PHI:%.*]], %[[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX90A-NEXT: [[LOADED_PHI]] = bitcast i32 [[NEWLOADED]] to float +; GFX90A-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX90A: [[ATOMICRMW_END]]: +; GFX90A-NEXT: ret float [[LOADED_PHI]] +; +; GFX940-LABEL: define float @test_flat_atomicrmw_fmax_f32_agent__noalias_addrspace_5( +; GFX940-SAME: ptr [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[TMP1:%.*]] = load float, ptr [[PTR]], align 4 +; GFX940-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX940: [[ATOMICRMW_START]]: +; GFX940-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], %[[ATOMICRMW_START]] ] +; GFX940-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX940-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX940-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float +; GFX940-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX940: [[ATOMICRMW_END]]: +; GFX940-NEXT: ret float [[RES]] +; +; GFX12-LABEL: define float @test_flat_atomicrmw_fmax_f32_agent__noalias_addrspace_5( +; GFX12-SAME: ptr [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[RES:%.*]] = atomicrmw fmax ptr [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]] +; GFX12-NEXT: ret float [[RES]] +; + %res = atomicrmw fmax ptr %ptr, float %value syncscope("agent") seq_cst, !noalias.addrspace 
!1, !amdgpu.no.fine.grained.memory !0 + ret float %res +} + +; -------------------------------------------------------------------- +; General expansion for nand +; -------------------------------------------------------------------- + +define i64 @test_flat_atomicrmw_nand_i64_agent(ptr %ptr, i64 %value) { +; ALL-LABEL: define i64 @test_flat_atomicrmw_nand_i64_agent( +; ALL-SAME: ptr [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; ALL-NEXT: [[TMP1:%.*]] = load i64, ptr [[PTR]], align 8 +; ALL-NEXT: br label %[[ATOMICRMW_START:.*]] +; ALL: [[ATOMICRMW_START]]: +; ALL-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], %[[ATOMICRMW_START]] ] +; ALL-NEXT: [[TMP2:%.*]] = and i64 [[LOADED]], [[VALUE]] +; ALL-NEXT: [[NEW:%.*]] = xor i64 [[TMP2]], -1 +; ALL-NEXT: [[TMP3:%.*]] = cmpxchg ptr [[PTR]], i64 [[LOADED]], i64 [[NEW]] syncscope("agent") seq_cst seq_cst, align 8 +; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP3]], 1 +; ALL-NEXT: [[RES]] = extractvalue { i64, i1 } [[TMP3]], 0 +; ALL-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; ALL: [[ATOMICRMW_END]]: +; ALL-NEXT: ret i64 [[RES]] +; + %res = atomicrmw nand ptr %ptr, i64 %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret i64 %res +} + +define i64 @test_flat_atomicrmw_nand_i64_agent__noalias_addrspace_5(ptr %ptr, i64 %value) { +; ALL-LABEL: define i64 @test_flat_atomicrmw_nand_i64_agent__noalias_addrspace_5( +; ALL-SAME: ptr [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; ALL-NEXT: [[TMP1:%.*]] = load i64, ptr [[PTR]], align 8 +; ALL-NEXT: br label %[[ATOMICRMW_START:.*]] +; ALL: [[ATOMICRMW_START]]: +; ALL-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], %[[ATOMICRMW_START]] ] +; ALL-NEXT: [[TMP2:%.*]] = and i64 [[LOADED]], [[VALUE]] +; ALL-NEXT: [[NEW:%.*]] = xor i64 [[TMP2]], -1 +; ALL-NEXT: [[TMP3:%.*]] = cmpxchg ptr [[PTR]], i64 [[LOADED]], i64 [[NEW]] syncscope("agent") seq_cst seq_cst, align 8 +; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP3]], 1 +; ALL-NEXT: [[RES]] = extractvalue { i64, i1 } [[TMP3]], 0 +; ALL-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; ALL: [[ATOMICRMW_END]]: +; ALL-NEXT: ret i64 [[RES]] +; + %res = atomicrmw nand ptr %ptr, i64 %value syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 + ret i64 %res +} + +define i64 @test_flat_atomicrmw_nand_i64_agent__noalias_addrspace_5__maybe_fine_grained(ptr %ptr, i64 %value) { +; ALL-LABEL: define i64 @test_flat_atomicrmw_nand_i64_agent__noalias_addrspace_5__maybe_fine_grained( +; ALL-SAME: ptr [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; ALL-NEXT: [[TMP1:%.*]] = load i64, ptr [[PTR]], align 8 +; ALL-NEXT: br label %[[ATOMICRMW_START:.*]] +; ALL: [[ATOMICRMW_START]]: +; ALL-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], %[[ATOMICRMW_START]] ] +; ALL-NEXT: [[TMP2:%.*]] = and i64 [[LOADED]], [[VALUE]] +; ALL-NEXT: [[NEW:%.*]] = xor i64 [[TMP2]], -1 +; ALL-NEXT: [[TMP3:%.*]] = cmpxchg ptr [[PTR]], i64 [[LOADED]], i64 [[NEW]] syncscope("agent") seq_cst seq_cst, align 8 +; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP3]], 1 +; ALL-NEXT: [[RES]] = extractvalue { i64, i1 } [[TMP3]], 0 +; ALL-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; ALL: [[ATOMICRMW_END]]: +; ALL-NEXT: ret i64 [[RES]] +; + %res = atomicrmw nand ptr %ptr, i64 %value syncscope("agent") seq_cst, !noalias.addrspace !1 + ret i64 %res 
+} + + +define i32 @test_flat_atomicrmw_nand_i32_agent__noalias_addrspace_5(ptr %ptr, i32 %value) { +; ALL-LABEL: define i32 @test_flat_atomicrmw_nand_i32_agent__noalias_addrspace_5( +; ALL-SAME: ptr [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; ALL-NEXT: [[TMP1:%.*]] = load i32, ptr [[PTR]], align 4 +; ALL-NEXT: br label %[[ATOMICRMW_START:.*]] +; ALL: [[ATOMICRMW_START]]: +; ALL-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], %[[ATOMICRMW_START]] ] +; ALL-NEXT: [[TMP2:%.*]] = and i32 [[LOADED]], [[VALUE]] +; ALL-NEXT: [[NEW:%.*]] = xor i32 [[TMP2]], -1 +; ALL-NEXT: [[TMP3:%.*]] = cmpxchg ptr [[PTR]], i32 [[LOADED]], i32 [[NEW]] syncscope("agent") seq_cst seq_cst, align 4 +; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1 +; ALL-NEXT: [[RES]] = extractvalue { i32, i1 } [[TMP3]], 0 +; ALL-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; ALL: [[ATOMICRMW_END]]: +; ALL-NEXT: ret i32 [[RES]] +; + %res = atomicrmw nand ptr %ptr, i32 %value syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 + ret i32 %res +} + +!0 = !{} +!1 = !{i32 5, i32 6} + +;. +; GFX7: [[META0]] = !{} +; GFX7: [[META1]] = !{i32 5, i32 6} +;. +; GFX900: [[META0]] = !{} +; GFX900: [[META1]] = !{i32 5, i32 6} +;. +; GFX908: [[META0]] = !{} +; GFX908: [[META1]] = !{i32 5, i32 6} +;. +; GFX90A: [[META0]] = !{} +; GFX90A: [[META1]] = !{i32 5, i32 6} +;. +; GFX940: [[META0]] = !{} +; GFX940: [[META1]] = !{i32 5, i32 6} +;. +; GFX12: [[META0]] = !{} +; GFX12: [[META1]] = !{i32 5, i32 6} +;. diff --git a/llvm/test/Transforms/GCOVProfiling/kcfi-normalize.ll b/llvm/test/Transforms/GCOVProfiling/kcfi-normalize.ll index 19122b920d1ca4abeadb8d09def41049b22c598b..9ad0418025e56cd6da85b406e22a13bca849d3eb 100644 --- a/llvm/test/Transforms/GCOVProfiling/kcfi-normalize.ll +++ b/llvm/test/Transforms/GCOVProfiling/kcfi-normalize.ll @@ -1,9 +1,16 @@ ;; Ensure __llvm_gcov_(writeout|reset|init) have the correct !kcfi_type ;; with integer normalization. ; RUN: mkdir -p %t && cd %t -; RUN: opt < %s -S -passes=insert-gcov-profiling | FileCheck %s +; RUN: opt < %s -S -passes=insert-gcov-profiling \ +; RUN: -mtriple=x86_64-unknown-linux-gnu | FileCheck \ +; RUN: --check-prefixes=CHECK,CHECK-CTOR-INIT %s +; RUN: opt < %s -S -passes=insert-gcov-profiling \ +; RUN: -mtriple=powerpc64-ibm-aix | FileCheck \ +; RUN: --check-prefixes=CHECK,CHECK-RT-INIT %s -target triple = "x86_64-unknown-linux-gnu" +; Check for gcov initialization function pointers when we initialize +; the writeout and reset functions in the runtime. 
+; CHECK-RT-INIT: @__llvm_covinit_functions = private constant { ptr, ptr } { ptr @__llvm_gcov_writeout, ptr @__llvm_gcov_reset }, section "__llvm_covinit" define dso_local void @empty() !dbg !5 { entry: @@ -29,7 +36,7 @@ entry: ; CHECK-SAME: !kcfi_type ![[#TYPE:]] ; CHECK: define internal void @__llvm_gcov_reset() ; CHECK-SAME: !kcfi_type ![[#TYPE]] -; CHECK: define internal void @__llvm_gcov_init() -; CHECK-SAME: !kcfi_type ![[#TYPE]] +; CHECK-CTOR-INIT: define internal void @__llvm_gcov_init() +; CHECK-CTOR-INIT-SAME: !kcfi_type ![[#TYPE]] ; CHECK: ![[#TYPE]] = !{i32 -440107680} diff --git a/llvm/test/Transforms/GCOVProfiling/kcfi.ll b/llvm/test/Transforms/GCOVProfiling/kcfi.ll index 1b97d25294cd6573d162ccdb3ac9579af340570f..5e0e91fc92f5f71be0c54706aac3b1389eebdd92 100644 --- a/llvm/test/Transforms/GCOVProfiling/kcfi.ll +++ b/llvm/test/Transforms/GCOVProfiling/kcfi.ll @@ -1,8 +1,15 @@ ;; Ensure __llvm_gcov_(writeout|reset|init) have !kcfi_type with KCFI. ; RUN: mkdir -p %t && cd %t -; RUN: opt < %s -S -passes=insert-gcov-profiling | FileCheck %s +; RUN: opt < %s -S -passes=insert-gcov-profiling \ +; RUN: -mtriple=x86_64-unknown-linux-gnu | FileCheck \ +; RUN: --check-prefixes=CHECK,CHECK-CTOR-INIT %s +; RUN: opt < %s -S -passes=insert-gcov-profiling \ +; RUN: -mtriple=powerpc64-ibm-aix | FileCheck \ +; RUN: --check-prefixes=CHECK,CHECK-RT-INIT %s -target triple = "x86_64-unknown-linux-gnu" +; Check for gcov initialization function pointers when we initialize +; the writeout and reset functions in the runtime. +; CHECK-RT-INIT: @__llvm_covinit_functions = private constant { ptr, ptr } { ptr @__llvm_gcov_writeout, ptr @__llvm_gcov_reset }, section "__llvm_covinit" define dso_local void @empty() !dbg !5 { entry: @@ -27,7 +34,7 @@ entry: ; CHECK-SAME: !kcfi_type ![[#TYPE:]] ; CHECK: define internal void @__llvm_gcov_reset() ; CHECK-SAME: !kcfi_type ![[#TYPE]] -; CHECK: define internal void @__llvm_gcov_init() -; CHECK-SAME: !kcfi_type ![[#TYPE]] +; CHECK-CTOR-INIT: define internal void @__llvm_gcov_init() +; CHECK-CTOR-INIT-SAME: !kcfi_type ![[#TYPE]] ; CHECK: ![[#TYPE]] = !{i32 -1522505972} diff --git a/llvm/test/Transforms/GCOVProfiling/module-flags.ll b/llvm/test/Transforms/GCOVProfiling/module-flags.ll index 919dd41ea2034885dd65b9ad02050ab20c716e9d..59f116d0d7e655fa43fcf42054e35374d2f2f4f3 100644 --- a/llvm/test/Transforms/GCOVProfiling/module-flags.ll +++ b/llvm/test/Transforms/GCOVProfiling/module-flags.ll @@ -1,7 +1,14 @@ ; RUN: mkdir -p %t && cd %t -; RUN: opt < %s -S -passes=insert-gcov-profiling | FileCheck %s +; RUN: opt < %s -S -passes=insert-gcov-profiling \ +; RUN: -mtriple=x86_64-unknown-linux-gnu | FileCheck \ +; RUN: --check-prefixes=CHECK,CHECK-CTOR-INIT %s +; RUN: opt < %s -S -passes=insert-gcov-profiling \ +; RUN: -mtriple=powerpc64-ibm-aix | FileCheck \ +; RUN: --check-prefixes=CHECK,CHECK-RT-INIT %s -target triple = "x86_64-unknown-linux-gnu" +; Check for gcov initialization function pointers when we initialize +; the writeout and reset functions in the runtime. +; CHECK-RT-INIT: @__llvm_covinit_functions = private constant { ptr, ptr } { ptr @__llvm_gcov_writeout, ptr @__llvm_gcov_reset }, section "__llvm_covinit" define dso_local void @empty() !dbg !5 { entry: @@ -30,5 +37,5 @@ entry: ;; Infer uwtable and "frame-pointer" from the module flags. 
; CHECK: define internal void @__llvm_gcov_writeout() unnamed_addr #[[#ATTR:]] ; CHECK: define internal void @__llvm_gcov_reset() unnamed_addr #[[#ATTR]] -; CHECK: define internal void @__llvm_gcov_init() unnamed_addr #[[#ATTR]] +; CHECK-CTOR-INIT: define internal void @__llvm_gcov_init() unnamed_addr #[[#ATTR]] ; CHECK: attributes #[[#ATTR]] = { noinline nounwind uwtable "frame-pointer"="all" } diff --git a/llvm/test/Transforms/GlobalOpt/ctor-list-preserve-addrspace.ll b/llvm/test/Transforms/GlobalOpt/ctor-list-preserve-addrspace.ll new file mode 100644 index 0000000000000000000000000000000000000000..3f2f041b90e74de984cf5725831a57037d8fc0c6 --- /dev/null +++ b/llvm/test/Transforms/GlobalOpt/ctor-list-preserve-addrspace.ll @@ -0,0 +1,19 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 5 +; RUN: opt -S -passes=globalopt < %s | FileCheck %s + +; Make sure the address space of global_ctors is preserved + +%ini = type { i32, ptr, ptr } + +@llvm.global_ctors = appending addrspace(1) global [1 x %ini] [%ini { i32 65534, ptr @ctor1, ptr null }] + +;. +; CHECK: @llvm.global_ctors = appending addrspace(1) global [0 x %ini] zeroinitializer +;. +define void @ctor1() { +; CHECK-LABEL: define void @ctor1() local_unnamed_addr { +; CHECK-NEXT: ret void +; + ret void +} + diff --git a/llvm/test/Transforms/Inline/access-attributes-prop.ll b/llvm/test/Transforms/Inline/access-attributes-prop.ll index 5051c92345ec75080c41deb8af64c6062ca68453..5bf845d5ba94b335bb18d0f7713e2fbe7f3e3773 100644 --- a/llvm/test/Transforms/Inline/access-attributes-prop.ll +++ b/llvm/test/Transforms/Inline/access-attributes-prop.ll @@ -47,7 +47,6 @@ define dso_local void @foo3_writable(ptr %p) { ret void } - define dso_local void @foo1_bar_aligned64_deref512(ptr %p) { ; CHECK-LABEL: define {{[^@]+}}@foo1_bar_aligned64_deref512 ; CHECK-SAME: (ptr [[P:%.*]]) { @@ -306,7 +305,7 @@ define void @prop_param_callbase_def_1x_partial_3(ptr %p, ptr %p2) { define void @prop_deref(ptr %p) { ; CHECK-LABEL: define {{[^@]+}}@prop_deref ; CHECK-SAME: (ptr [[P:%.*]]) { -; CHECK-NEXT: call void @bar1(ptr [[P]]) +; CHECK-NEXT: call void @bar1(ptr dereferenceable(16) [[P]]) ; CHECK-NEXT: ret void ; call void @foo1(ptr dereferenceable(16) %p) @@ -316,7 +315,7 @@ define void @prop_deref(ptr %p) { define void @prop_deref_or_null(ptr %p) { ; CHECK-LABEL: define {{[^@]+}}@prop_deref_or_null ; CHECK-SAME: (ptr [[P:%.*]]) { -; CHECK-NEXT: call void @bar1(ptr [[P]]) +; CHECK-NEXT: call void @bar1(ptr dereferenceable_or_null(256) [[P]]) ; CHECK-NEXT: ret void ; call void @foo1(ptr dereferenceable_or_null(256) %p) @@ -326,13 +325,23 @@ define void @prop_deref_or_null(ptr %p) { define void @prop_param_nonnull_and_align(ptr %p) { ; CHECK-LABEL: define {{[^@]+}}@prop_param_nonnull_and_align ; CHECK-SAME: (ptr [[P:%.*]]) { -; CHECK-NEXT: call void @bar1(ptr [[P]]) +; CHECK-NEXT: call void @bar1(ptr nonnull align 32 [[P]]) ; CHECK-NEXT: ret void ; call void @foo1(ptr nonnull align 32 %p) ret void } +define void @prop_param_nofree_and_align(ptr %p) { +; CHECK-LABEL: define {{[^@]+}}@prop_param_nofree_and_align +; CHECK-SAME: (ptr [[P:%.*]]) { +; CHECK-NEXT: call void @bar1(ptr align 32 [[P]]) +; CHECK-NEXT: ret void +; + call void @foo1(ptr nofree align 32 %p) + ret void +} + define void @prop_param_deref_align_no_update(ptr %p) { ; CHECK-LABEL: define {{[^@]+}}@prop_param_deref_align_no_update ; CHECK-SAME: (ptr [[P:%.*]]) { @@ -346,7 +355,7 @@ define void @prop_param_deref_align_no_update(ptr %p) { 
define void @prop_param_deref_align_update(ptr %p) { ; CHECK-LABEL: define {{[^@]+}}@prop_param_deref_align_update ; CHECK-SAME: (ptr [[P:%.*]]) { -; CHECK-NEXT: call void @bar1(ptr align 64 dereferenceable(512) [[P]]) +; CHECK-NEXT: call void @bar1(ptr align 128 dereferenceable(1024) [[P]]) ; CHECK-NEXT: ret void ; call void @foo1_bar_aligned64_deref512(ptr align 128 dereferenceable(1024) %p) @@ -356,7 +365,7 @@ define void @prop_param_deref_align_update(ptr %p) { define void @prop_param_deref_or_null_update(ptr %p) { ; CHECK-LABEL: define {{[^@]+}}@prop_param_deref_or_null_update ; CHECK-SAME: (ptr [[P:%.*]]) { -; CHECK-NEXT: call void @bar1(ptr align 512 dereferenceable_or_null(512) [[P]]) +; CHECK-NEXT: call void @bar1(ptr align 512 dereferenceable_or_null(1024) [[P]]) ; CHECK-NEXT: ret void ; call void @foo1_bar_aligned512_deref_or_null512(ptr dereferenceable_or_null(1024) %p) @@ -539,7 +548,6 @@ define void @prop_no_conflict_writable(ptr %p) { ret void } - define void @prop_no_conflict_writable2(ptr %p) { ; CHECK-LABEL: define {{[^@]+}}@prop_no_conflict_writable2 ; CHECK-SAME: (ptr [[P:%.*]]) { @@ -600,3 +608,145 @@ define void @prop_byval_readonly2(ptr %p) { call void @foo_byval_readonly2(ptr %p) ret void } + +declare void @bar5(i32) + +define dso_local void @foo4_range_0_10(i32 %v) { +; CHECK-LABEL: define {{[^@]+}}@foo4_range_0_10 +; CHECK-SAME: (i32 [[V:%.*]]) { +; CHECK-NEXT: call void @bar5(i32 range(i32 0, 10) [[V]]) +; CHECK-NEXT: ret void +; + call void @bar5(i32 range(i32 0, 10) %v) + ret void +} + +define dso_local void @foo4_range_10_40(i32 %v) { +; CHECK-LABEL: define {{[^@]+}}@foo4_range_10_40 +; CHECK-SAME: (i32 [[V:%.*]]) { +; CHECK-NEXT: call void @bar5(i32 range(i32 10, 40) [[V]]) +; CHECK-NEXT: ret void +; + call void @bar5(i32 range(i32 10, 40) %v) + ret void +} + +define dso_local void @foo4_2_range_0_10(i32 range(i32 0, 10) %v) { +; CHECK-LABEL: define {{[^@]+}}@foo4_2_range_0_10 +; CHECK-SAME: (i32 range(i32 0, 10) [[V:%.*]]) { +; CHECK-NEXT: call void @bar5(i32 [[V]]) +; CHECK-NEXT: ret void +; + call void @bar5(i32 %v) + ret void +} + +define dso_local void @foo4(i32 %v) { +; CHECK-LABEL: define {{[^@]+}}@foo4 +; CHECK-SAME: (i32 [[V:%.*]]) { +; CHECK-NEXT: call void @bar5(i32 [[V]]) +; CHECK-NEXT: ret void +; + call void @bar5(i32 %v) + ret void +} + +define void @prop_range_empty_intersect(i32 %v) { +; CHECK-LABEL: define {{[^@]+}}@prop_range_empty_intersect +; CHECK-SAME: (i32 [[V:%.*]]) { +; CHECK-NEXT: call void @bar5(i32 range(i32 0, 0) [[V]]) +; CHECK-NEXT: ret void +; + call void @foo4_range_0_10(i32 range(i32 11, 50) %v) + ret void +} + +define void @prop_range_empty(i32 %v) { +; CHECK-LABEL: define {{[^@]+}}@prop_range_empty +; CHECK-SAME: (i32 [[V:%.*]]) { +; CHECK-NEXT: call void @bar5(i32 range(i32 1, 0) [[V]]) +; CHECK-NEXT: ret void +; + call void @foo4(i32 range(i32 1, 0) %v) + ret void +} + +define void @prop_range_empty_with_intersect(i32 %v) { +; CHECK-LABEL: define {{[^@]+}}@prop_range_empty_with_intersect +; CHECK-SAME: (i32 [[V:%.*]]) { +; CHECK-NEXT: call void @bar5(i32 range(i32 1, 10) [[V]]) +; CHECK-NEXT: ret void +; + call void @foo4_range_0_10(i32 range(i32 1, 0) %v) + ret void +} + +define void @prop_range_intersect1(i32 %v) { +; CHECK-LABEL: define {{[^@]+}}@prop_range_intersect1 +; CHECK-SAME: (i32 [[V:%.*]]) { +; CHECK-NEXT: call void @bar5(i32 range(i32 0, 9) [[V]]) +; CHECK-NEXT: ret void +; + call void @foo4_range_0_10(i32 range(i32 0, 9) %v) + ret void +} + +define void @prop_range_intersect2(i32 %v) { +; CHECK-LABEL: 
define {{[^@]+}}@prop_range_intersect2 +; CHECK-SAME: (i32 [[V:%.*]]) { +; CHECK-NEXT: call void @bar5(i32 range(i32 1, 9) [[V]]) +; CHECK-NEXT: ret void +; + call void @foo4_range_0_10(i32 range(i32 1, 9) %v) + ret void +} + +define void @prop_range_intersect3(i32 %v) { +; CHECK-LABEL: define {{[^@]+}}@prop_range_intersect3 +; CHECK-SAME: (i32 [[V:%.*]]) { +; CHECK-NEXT: call void @bar5(i32 range(i32 0, 11) [[V]]) +; CHECK-NEXT: ret void +; + call void @foo4_2_range_0_10(i32 range(i32 0, 11) %v) + ret void +} + +define void @prop_range_intersect4(i32 %v) { +; CHECK-LABEL: define {{[^@]+}}@prop_range_intersect4 +; CHECK-SAME: (i32 [[V:%.*]]) { +; CHECK-NEXT: call void @bar5(i32 range(i32 0, 5) [[V]]) +; CHECK-NEXT: ret void +; + call void @foo4_range_0_10(i32 range(i32 40, 5) %v) + ret void +} + +define void @prop_range_intersect5(i32 %v) { +; CHECK-LABEL: define {{[^@]+}}@prop_range_intersect5 +; CHECK-SAME: (i32 [[V:%.*]]) { +; CHECK-NEXT: call void @bar5(i32 range(i32 10, 40) [[V]]) +; CHECK-NEXT: ret void +; + call void @foo4_range_10_40(i32 range(i32 30, 20) %v) + ret void +} + +define void @prop_range_keep(i32 %v) { +; CHECK-LABEL: define {{[^@]+}}@prop_range_keep +; CHECK-SAME: (i32 [[V:%.*]]) { +; CHECK-NEXT: call void @bar5(i32 range(i32 10, 40) [[V]]) +; CHECK-NEXT: ret void +; + call void @foo4_range_10_40(i32 %v) + ret void +} + +define void @prop_range_direct(i32 %v) { +; CHECK-LABEL: define {{[^@]+}}@prop_range_direct +; CHECK-SAME: (i32 [[V:%.*]]) { +; CHECK-NEXT: call void @bar5(i32 range(i32 1, 11) [[V]]) +; CHECK-NEXT: ret void +; + call void @foo4(i32 range(i32 1, 11) %v) + ret void +} diff --git a/llvm/test/Transforms/Inline/assumptions-from-callsite-attrs.ll b/llvm/test/Transforms/Inline/assumptions-from-callsite-attrs.ll index 1a219a22019c4358ac05722d9e959c7b886a489d..c0943f4aefb8f9dcc433bb03549d974730e8f2ff 100644 --- a/llvm/test/Transforms/Inline/assumptions-from-callsite-attrs.ll +++ b/llvm/test/Transforms/Inline/assumptions-from-callsite-attrs.ll @@ -8,7 +8,7 @@ declare void @h(ptr %p, ptr %q, ptr %z) define void @f(ptr %p, ptr %q, ptr %z) { ; CHECK-LABEL: define void @f ; CHECK-SAME: (ptr [[P:%.*]], ptr [[Q:%.*]], ptr [[Z:%.*]]) { -; CHECK-NEXT: call void @h(ptr [[P]], ptr [[Q]], ptr [[Z]]) +; CHECK-NEXT: call void @h(ptr nonnull [[P]], ptr [[Q]], ptr nonnull [[Z]]) ; CHECK-NEXT: ret void ; call void @g(ptr nonnull %p, ptr %q, ptr nonnull %z) diff --git a/llvm/test/Transforms/Inline/byval.ll b/llvm/test/Transforms/Inline/byval.ll index dd5be40b90a8f2ae6b5816d11aed4212a6c7d763..1a70da8472cb1ee6562d461f6f3a85491ee72be1 100644 --- a/llvm/test/Transforms/Inline/byval.ll +++ b/llvm/test/Transforms/Inline/byval.ll @@ -106,7 +106,7 @@ define void @test3() nounwind { ; CHECK-NEXT: [[S:%.*]] = alloca [[STRUCT_SS]], align 1 ; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 12, ptr [[S1]]) ; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[S1]], ptr align 1 [[S]], i64 12, i1 false) -; CHECK-NEXT: call void @g3(ptr [[S1]]) #[[ATTR0]] +; CHECK-NEXT: call void @g3(ptr align 64 [[S1]]) #[[ATTR0]] ; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 12, ptr [[S1]]) ; CHECK-NEXT: ret void ; @@ -131,7 +131,7 @@ define i32 @test4() nounwind { ; CHECK-SAME: ) #[[ATTR0]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[S:%.*]] = alloca [[STRUCT_SS:%.*]], align 64 -; CHECK-NEXT: call void @g3(ptr [[S]]) #[[ATTR0]] +; CHECK-NEXT: call void @g3(ptr align 64 [[S]]) #[[ATTR0]] ; CHECK-NEXT: ret i32 4 ; entry: diff --git a/llvm/test/Transforms/InstCombine/AArch64/dmb-intrinsics.ll 
b/llvm/test/Transforms/InstCombine/AArch64/dmb-intrinsics.ll new file mode 100644 index 0000000000000000000000000000000000000000..dacdd413013658a545b3c7abbe2c594187937cca --- /dev/null +++ b/llvm/test/Transforms/InstCombine/AArch64/dmb-intrinsics.ll @@ -0,0 +1,220 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S -passes=instcombine < %s | FileCheck %s +; ARM64 dmb intrinsics + +target triple = "aarch64-unknown-linux-gnu" + +declare void @llvm.aarch64.dmb(i32) +declare void @llvm.aarch64.dsb(i32) +declare void @clobber() +declare void @pure() memory(none) willreturn nounwind +declare i32 @llvm.ctlz.i32(i32, i1) + +define void @simple() #0 { +; CHECK-LABEL: define void @simple() { +; CHECK-NEXT: call void @llvm.aarch64.dmb(i32 10) +; CHECK-NEXT: ret void +; + call void @llvm.aarch64.dmb(i32 10) + call void @llvm.aarch64.dmb(i32 10) + ret void +} + +; dmb ish (0xb) is technically stronger than ishst (0xa) but we don't merge for now +define void @simple_nonmatching() #0 { +; CHECK-LABEL: define void @simple_nonmatching() { +; CHECK-NEXT: call void @llvm.aarch64.dmb(i32 10) +; CHECK-NEXT: call void @llvm.aarch64.dmb(i32 11) +; CHECK-NEXT: ret void +; + call void @llvm.aarch64.dmb(i32 10) + call void @llvm.aarch64.dmb(i32 11) + ret void +} + +define ptr @simple_safe_instruction(ptr %p) #0 { +; CHECK-LABEL: define ptr @simple_safe_instruction( +; CHECK-SAME: ptr [[P:%.*]]) { +; CHECK-NEXT: [[RES:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 8 +; CHECK-NEXT: call void @llvm.aarch64.dmb(i32 10) +; CHECK-NEXT: ret ptr [[RES]] +; + call void @llvm.aarch64.dmb(i32 10) + %res = getelementptr inbounds i8, ptr %p, i32 8 + call void @llvm.aarch64.dmb(i32 10) + ret ptr %res +} + +define i32 @simple_safe_intrinsic(i32 %n) #0 { +; CHECK-LABEL: define i32 @simple_safe_intrinsic( +; CHECK-SAME: i32 [[N:%.*]]) { +; CHECK-NEXT: [[RES:%.*]] = call range(i32 0, 33) i32 @llvm.ctlz.i32(i32 [[N]], i1 false) +; CHECK-NEXT: call void @llvm.aarch64.dmb(i32 10) +; CHECK-NEXT: ret i32 [[RES]] +; + call void @llvm.aarch64.dmb(i32 10) + %res = call i32 @llvm.ctlz.i32(i32 %n, i1 false) + call void @llvm.aarch64.dmb(i32 10) + ret i32 %res +} + +define void @simple_unsafe_intrinsic() #0 { +; CHECK-LABEL: define void @simple_unsafe_intrinsic() { +; CHECK-NEXT: call void @llvm.aarch64.dmb(i32 10) +; CHECK-NEXT: call void @llvm.aarch64.dsb(i32 10) +; CHECK-NEXT: call void @llvm.aarch64.dmb(i32 10) +; CHECK-NEXT: ret void +; + call void @llvm.aarch64.dmb(i32 10) + call void @llvm.aarch64.dsb(i32 10) + call void @llvm.aarch64.dmb(i32 10) + ret void +} + +define void @simple_safe_unsafe_instruction(ptr %p) #0 { +; CHECK-LABEL: define void @simple_safe_unsafe_instruction( +; CHECK-SAME: ptr [[P:%.*]]) { +; CHECK-NEXT: call void @llvm.aarch64.dmb(i32 10) +; CHECK-NEXT: store i32 42, ptr [[P]], align 4 +; CHECK-NEXT: call void @llvm.aarch64.dmb(i32 10) +; CHECK-NEXT: ret void +; + call void @llvm.aarch64.dmb(i32 10) + store i32 42, ptr %p + call void @llvm.aarch64.dmb(i32 10) + ret void +} + +define void @simple_safe_unsafe_call(ptr %p) #0 { +; CHECK-LABEL: define void @simple_safe_unsafe_call( +; CHECK-SAME: ptr [[P:%.*]]) { +; CHECK-NEXT: call void @llvm.aarch64.dmb(i32 10) +; CHECK-NEXT: call void @clobber() +; CHECK-NEXT: call void @llvm.aarch64.dmb(i32 10) +; CHECK-NEXT: ret void +; + call void @llvm.aarch64.dmb(i32 10) + call void @clobber() + call void @llvm.aarch64.dmb(i32 10) + ret void +} + +define void @simple_safe_safe_call(ptr %p) #0 { +; CHECK-LABEL: 
define void @simple_safe_safe_call( +; CHECK-SAME: ptr [[P:%.*]]) { +; CHECK-NEXT: call void @llvm.aarch64.dmb(i32 10) +; CHECK-NEXT: ret void +; + call void @llvm.aarch64.dmb(i32 10) + call void @pure() + call void @llvm.aarch64.dmb(i32 10) + ret void +} + +define void @multiple_bbs1(i1 %f) #0 { +; CHECK-LABEL: define void @multiple_bbs1( +; CHECK-SAME: i1 [[F:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br i1 [[F]], label %[[BB_T:.*]], label %[[BB_F:.*]] +; CHECK: [[BB_T]]: +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[BB_F]]: +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: call void @llvm.aarch64.dmb(i32 10) +; CHECK-NEXT: ret void +; +entry: + br i1 %f, label %bb_t, label %bb_f +bb_t: + call void @llvm.aarch64.dmb(i32 10) + br label %exit +bb_f: + call void @llvm.aarch64.dmb(i32 10) + br label %exit +exit: + call void @llvm.aarch64.dmb(i32 10) + ret void +} + +define void @multiple_bbs2(i1 %f) #0 { +; CHECK-LABEL: define void @multiple_bbs2( +; CHECK-SAME: i1 [[F:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br i1 [[F]], label %[[BB_T:.*]], label %[[BB_F:.*]] +; CHECK: [[BB_T]]: +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[BB_F]]: +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: call void @llvm.aarch64.dmb(i32 10) +; CHECK-NEXT: ret void +; +entry: + br i1 %f, label %bb_t, label %bb_f +bb_t: + call void @llvm.aarch64.dmb(i32 10) + br label %exit +bb_f: + br label %exit +exit: + call void @llvm.aarch64.dmb(i32 10) + ret void +} + +define void @multiple_bbs3(i1 %f, ptr %p) #0 { +; CHECK-LABEL: define void @multiple_bbs3( +; CHECK-SAME: i1 [[F:%.*]], ptr [[P:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br i1 [[F]], label %[[BB_T:.*]], label %[[BB_F:.*]] +; CHECK: [[BB_T]]: +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[BB_F]]: +; CHECK-NEXT: store i32 42, ptr [[P]], align 4 +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: call void @llvm.aarch64.dmb(i32 10) +; CHECK-NEXT: ret void +; +entry: + br i1 %f, label %bb_t, label %bb_f +bb_t: + call void @llvm.aarch64.dmb(i32 10) + br label %exit +bb_f: + store i32 42, ptr %p + br label %exit +exit: + call void @llvm.aarch64.dmb(i32 10) + ret void +} + +define void @multiple_bbs_unsafe(i1 %f, ptr %p) #0 { +; CHECK-LABEL: define void @multiple_bbs_unsafe( +; CHECK-SAME: i1 [[F:%.*]], ptr [[P:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br i1 [[F]], label %[[BB_T:.*]], label %[[BB_F:.*]] +; CHECK: [[BB_T]]: +; CHECK-NEXT: call void @llvm.aarch64.dmb(i32 10) +; CHECK-NEXT: store i32 42, ptr [[P]], align 4 +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[BB_F]]: +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: call void @llvm.aarch64.dmb(i32 10) +; CHECK-NEXT: ret void +; +entry: + br i1 %f, label %bb_t, label %bb_f +bb_t: + call void @llvm.aarch64.dmb(i32 10) + store i32 42, ptr %p + br label %exit +bb_f: + call void @llvm.aarch64.dmb(i32 10) + br label %exit +exit: + call void @llvm.aarch64.dmb(i32 10) + ret void +} + diff --git a/llvm/test/Transforms/InstCombine/and-or-icmp-min-max.ll b/llvm/test/Transforms/InstCombine/and-or-icmp-min-max.ll index 058847a75bde7f8a35a8bfeb13f9ff971f6cbc44..cc55c4a39a267852920bc2ac16e73560cad8aa6c 100644 --- a/llvm/test/Transforms/InstCombine/and-or-icmp-min-max.ll +++ b/llvm/test/Transforms/InstCombine/and-or-icmp-min-max.ll @@ -689,6 +689,17 @@ define i1 @sge_and_max_logical(i8 %x, i8 %y) { ret i1 %r } +define i1 @sge_and_max_logical_samesign(i8 %x, i8 %y) { +; CHECK-LABEL: 
@sge_and_max_logical_samesign( +; CHECK-NEXT: [[CMPEQ:%.*]] = icmp eq i8 [[X:%.*]], 127 +; CHECK-NEXT: ret i1 [[CMPEQ]] +; + %cmp = icmp sge i8 %x, %y + %cmpeq = icmp samesign eq i8 %x, 127 + %r = select i1 %cmp, i1 %cmpeq, i1 false + ret i1 %r +} + define i1 @sge_and_max_commute(i8 %x, i8 %y) { ; CHECK-LABEL: @sge_and_max_commute( ; CHECK-NEXT: [[CMPEQ:%.*]] = icmp eq i8 [[X:%.*]], 127 diff --git a/llvm/test/Transforms/InstCombine/and-or-icmp-nullptr.ll b/llvm/test/Transforms/InstCombine/and-or-icmp-nullptr.ll index d533cc7048536c77baa972e01c25de6a63d85b39..8650b89c3b9ef3dbc584981c48c5387047f5f8b3 100644 --- a/llvm/test/Transforms/InstCombine/and-or-icmp-nullptr.ll +++ b/llvm/test/Transforms/InstCombine/and-or-icmp-nullptr.ll @@ -592,6 +592,19 @@ define i1 @sgt_and_min_logical(ptr %x, ptr %y) { ret i1 %r } +define i1 @sgt_and_min_logical_samesign(ptr %x, ptr %y) { +; CHECK-LABEL: @sgt_and_min_logical_samesign( +; CHECK-NEXT: [[CMPEQ:%.*]] = icmp eq ptr [[X:%.*]], null +; CHECK-NEXT: [[TMP1:%.*]] = icmp slt ptr [[Y:%.*]], null +; CHECK-NEXT: [[R:%.*]] = and i1 [[CMPEQ]], [[TMP1]] +; CHECK-NEXT: ret i1 [[R]] +; + %cmp = icmp sgt ptr %x, %y + %cmpeq = icmp samesign eq ptr %x, null + %r = select i1 %cmp, i1 %cmpeq, i1 false + ret i1 %r +} + define i1 @sle_or_not_min(ptr %x, ptr %y) { ; CHECK-LABEL: @sle_or_not_min( ; CHECK-NEXT: [[CMPEQ:%.*]] = icmp ne ptr [[X:%.*]], null diff --git a/llvm/test/Transforms/InstCombine/call-guard.ll b/llvm/test/Transforms/InstCombine/call-guard.ll index 6b31c78118d0b85ccd5aaaf7f03a459e69036f56..bc5f319e64f7fe6db2892e49a6fdb7c256341efd 100644 --- a/llvm/test/Transforms/InstCombine/call-guard.ll +++ b/llvm/test/Transforms/InstCombine/call-guard.ll @@ -43,7 +43,7 @@ define void @test_guard_adjacent_diff_cond2(i32 %V1, i32 %V2) { ; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[V1:%.*]], [[V2:%.*]] ; CHECK-NEXT: [[TMP2:%.*]] = icmp slt i32 [[TMP1]], 0 ; CHECK-NEXT: [[AND:%.*]] = and i32 [[V1]], 255 -; CHECK-NEXT: [[C:%.*]] = icmp ult i32 [[AND]], 129 +; CHECK-NEXT: [[C:%.*]] = icmp samesign ult i32 [[AND]], 129 ; CHECK-NEXT: [[TMP3:%.*]] = and i1 [[TMP2]], [[C]] ; CHECK-NEXT: call void (i1, ...) @llvm.experimental.guard(i1 [[TMP3]], i32 123) [ "deopt"() ] ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/InstCombine/cast_phi.ll b/llvm/test/Transforms/InstCombine/cast_phi.ll index 99da3ac1c7c8101353c725cb10d272304932c1b6..2819b7d05f7b309eb39d8a072e36add9299e6ac6 100644 --- a/llvm/test/Transforms/InstCombine/cast_phi.ll +++ b/llvm/test/Transforms/InstCombine/cast_phi.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -passes=instcombine -S | FileCheck %s +; RUN: opt < %s -passes="instcombine" -S | FileCheck %s target datalayout = "n32:64" diff --git a/llvm/test/Transforms/InstCombine/clamp-to-minmax.ll b/llvm/test/Transforms/InstCombine/clamp-to-minmax.ll index 1dd0b17e9f46ddc30c2ed953a25fec08dbfea339..c6fee0914f0e78123cf09ca4db406bb4ea6a2245 100644 --- a/llvm/test/Transforms/InstCombine/clamp-to-minmax.ll +++ b/llvm/test/Transforms/InstCombine/clamp-to-minmax.ll @@ -67,10 +67,10 @@ define float @clamp_float_fast_ordered_nonstrict_minmax(float %x) { ; (X < C1) ? 
C1 : MIN(X, C2) define float @clamp_float_fast_unordered_strict_maxmin(float %x) { ; CHECK-LABEL: @clamp_float_fast_unordered_strict_maxmin( -; CHECK-NEXT: [[CMP2_INV:%.*]] = fcmp fast oge float [[X:%.*]], 2.550000e+02 -; CHECK-NEXT: [[MIN:%.*]] = select fast i1 [[CMP2_INV]], float 2.550000e+02, float [[X]] -; CHECK-NEXT: [[R1:%.*]] = call fast float @llvm.maxnum.f32(float [[MIN]], float 1.000000e+00) -; CHECK-NEXT: ret float [[R1]] +; CHECK-NEXT: [[MIN:%.*]] = call fast float @llvm.minnum.f32(float [[X:%.*]], float 2.550000e+02) +; CHECK-NEXT: [[CMP1:%.*]] = fcmp fast ult float [[X]], 1.000000e+00 +; CHECK-NEXT: [[R:%.*]] = select i1 [[CMP1]], float 1.000000e+00, float [[MIN]] +; CHECK-NEXT: ret float [[R]] ; %cmp2 = fcmp fast ult float %x, 255.0 %min = select i1 %cmp2, float %x, float 255.0 @@ -82,10 +82,10 @@ define float @clamp_float_fast_unordered_strict_maxmin(float %x) { ; (X <= C1) ? C1 : MIN(X, C2) define float @clamp_float_fast_unordered_nonstrict_maxmin(float %x) { ; CHECK-LABEL: @clamp_float_fast_unordered_nonstrict_maxmin( -; CHECK-NEXT: [[CMP2_INV:%.*]] = fcmp fast oge float [[X:%.*]], 2.550000e+02 -; CHECK-NEXT: [[MIN:%.*]] = select fast i1 [[CMP2_INV]], float 2.550000e+02, float [[X]] -; CHECK-NEXT: [[R1:%.*]] = call fast float @llvm.maxnum.f32(float [[MIN]], float 1.000000e+00) -; CHECK-NEXT: ret float [[R1]] +; CHECK-NEXT: [[MIN:%.*]] = call fast float @llvm.minnum.f32(float [[X:%.*]], float 2.550000e+02) +; CHECK-NEXT: [[CMP1:%.*]] = fcmp fast ule float [[X]], 1.000000e+00 +; CHECK-NEXT: [[R:%.*]] = select i1 [[CMP1]], float 1.000000e+00, float [[MIN]] +; CHECK-NEXT: ret float [[R]] ; %cmp2 = fcmp fast ult float %x, 255.0 %min = select i1 %cmp2, float %x, float 255.0 @@ -97,10 +97,10 @@ define float @clamp_float_fast_unordered_nonstrict_maxmin(float %x) { ; (X > C1) ? C1 : MAX(X, C2) define float @clamp_float_fast_unordered_strict_minmax(float %x) { ; CHECK-LABEL: @clamp_float_fast_unordered_strict_minmax( -; CHECK-NEXT: [[CMP2_INV:%.*]] = fcmp fast ole float [[X:%.*]], 1.000000e+00 -; CHECK-NEXT: [[MAX:%.*]] = select fast i1 [[CMP2_INV]], float 1.000000e+00, float [[X]] -; CHECK-NEXT: [[R1:%.*]] = call fast float @llvm.minnum.f32(float [[MAX]], float 2.550000e+02) -; CHECK-NEXT: ret float [[R1]] +; CHECK-NEXT: [[MAX:%.*]] = call fast float @llvm.maxnum.f32(float [[X:%.*]], float 1.000000e+00) +; CHECK-NEXT: [[CMP1:%.*]] = fcmp fast ugt float [[X]], 2.550000e+02 +; CHECK-NEXT: [[R:%.*]] = select i1 [[CMP1]], float 2.550000e+02, float [[MAX]] +; CHECK-NEXT: ret float [[R]] ; %cmp2 = fcmp fast ugt float %x, 1.0 %max = select i1 %cmp2, float %x, float 1.0 @@ -112,10 +112,10 @@ define float @clamp_float_fast_unordered_strict_minmax(float %x) { ; (X >= C1) ? 
C1 : MAX(X, C2) define float @clamp_float_fast_unordered_nonstrict_minmax(float %x) { ; CHECK-LABEL: @clamp_float_fast_unordered_nonstrict_minmax( -; CHECK-NEXT: [[CMP2_INV:%.*]] = fcmp fast ole float [[X:%.*]], 1.000000e+00 -; CHECK-NEXT: [[MAX:%.*]] = select fast i1 [[CMP2_INV]], float 1.000000e+00, float [[X]] -; CHECK-NEXT: [[R1:%.*]] = call fast float @llvm.minnum.f32(float [[MAX]], float 2.550000e+02) -; CHECK-NEXT: ret float [[R1]] +; CHECK-NEXT: [[MAX:%.*]] = call fast float @llvm.maxnum.f32(float [[X:%.*]], float 1.000000e+00) +; CHECK-NEXT: [[CMP1:%.*]] = fcmp fast uge float [[X]], 2.550000e+02 +; CHECK-NEXT: [[R:%.*]] = select i1 [[CMP1]], float 2.550000e+02, float [[MAX]] +; CHECK-NEXT: ret float [[R]] ; %cmp2 = fcmp fast ugt float %x, 1.0 %max = select i1 %cmp2, float %x, float 1.0 @@ -127,13 +127,12 @@ define float @clamp_float_fast_unordered_nonstrict_minmax(float %x) { ; Some more checks with fast ; (X > 1.0) ? min(x, 255.0) : 1.0 -; That did not match because select was in inverse order. define float @clamp_test_1(float %x) { ; CHECK-LABEL: @clamp_test_1( -; CHECK-NEXT: [[INNER_CMP_INV:%.*]] = fcmp fast oge float [[X:%.*]], 2.550000e+02 -; CHECK-NEXT: [[INNER_SEL:%.*]] = select fast i1 [[INNER_CMP_INV]], float 2.550000e+02, float [[X]] -; CHECK-NEXT: [[R1:%.*]] = call fast float @llvm.maxnum.f32(float [[INNER_SEL]], float 1.000000e+00) -; CHECK-NEXT: ret float [[R1]] +; CHECK-NEXT: [[INNER_SEL:%.*]] = call fast float @llvm.minnum.f32(float [[X:%.*]], float 2.550000e+02) +; CHECK-NEXT: [[OUTER_CMP:%.*]] = fcmp fast ugt float [[X]], 1.000000e+00 +; CHECK-NEXT: [[R:%.*]] = select i1 [[OUTER_CMP]], float [[INNER_SEL]], float 1.000000e+00 +; CHECK-NEXT: ret float [[R]] ; %inner_cmp = fcmp fast ult float %x, 255.0 %inner_sel = select i1 %inner_cmp, float %x, float 255.0 @@ -147,8 +146,7 @@ define float @clamp_test_1(float %x) { ; Like @clamp_test_1 but HighConst < LowConst define float @clamp_negative_wrong_const(float %x) { ; CHECK-LABEL: @clamp_negative_wrong_const( -; CHECK-NEXT: [[INNER_CMP_INV:%.*]] = fcmp fast oge float [[X:%.*]], 2.550000e+02 -; CHECK-NEXT: [[INNER_SEL:%.*]] = select fast i1 [[INNER_CMP_INV]], float 2.550000e+02, float [[X]] +; CHECK-NEXT: [[INNER_SEL:%.*]] = call fast float @llvm.minnum.f32(float [[X:%.*]], float 2.550000e+02) ; CHECK-NEXT: [[OUTER_CMP:%.*]] = fcmp fast ugt float [[X]], 5.120000e+02 ; CHECK-NEXT: [[R:%.*]] = select i1 [[OUTER_CMP]], float [[INNER_SEL]], float 5.120000e+02 ; CHECK-NEXT: ret float [[R]] @@ -163,8 +161,7 @@ define float @clamp_negative_wrong_const(float %x) { ; Like @clamp_test_1 but both are min define float @clamp_negative_same_op(float %x) { ; CHECK-LABEL: @clamp_negative_same_op( -; CHECK-NEXT: [[INNER_CMP_INV:%.*]] = fcmp fast oge float [[X:%.*]], 2.550000e+02 -; CHECK-NEXT: [[INNER_SEL:%.*]] = select fast i1 [[INNER_CMP_INV]], float 2.550000e+02, float [[X]] +; CHECK-NEXT: [[INNER_SEL:%.*]] = call fast float @llvm.minnum.f32(float [[X:%.*]], float 2.550000e+02) ; CHECK-NEXT: [[OUTER_CMP:%.*]] = fcmp fast ult float [[X]], 1.000000e+00 ; CHECK-NEXT: [[R:%.*]] = select i1 [[OUTER_CMP]], float [[INNER_SEL]], float 1.000000e+00 ; CHECK-NEXT: ret float [[R]] diff --git a/llvm/test/Transforms/InstCombine/cmp-intrinsic.ll b/llvm/test/Transforms/InstCombine/cmp-intrinsic.ll index 9a9f359fa80b4a2a6c32f0174e80dfe99f144290..67815e41ecd34e38ef23c5d2b8e4cc996cc4a78c 100644 --- a/llvm/test/Transforms/InstCombine/cmp-intrinsic.ll +++ b/llvm/test/Transforms/InstCombine/cmp-intrinsic.ll @@ -386,7 +386,7 @@ define i1 
@cttz_ugt_other_multiuse_i33(i33 %x, ptr %p) { ; CHECK-LABEL: @cttz_ugt_other_multiuse_i33( ; CHECK-NEXT: [[TZ:%.*]] = tail call range(i33 0, 34) i33 @llvm.cttz.i33(i33 [[X:%.*]], i1 false) ; CHECK-NEXT: store i33 [[TZ]], ptr [[P:%.*]], align 4 -; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i33 [[TZ]], 16 +; CHECK-NEXT: [[CMP:%.*]] = icmp samesign ugt i33 [[TZ]], 16 ; CHECK-NEXT: ret i1 [[CMP]] ; %tz = tail call i33 @llvm.cttz.i33(i33 %x, i1 false) @@ -430,7 +430,7 @@ define <2 x i1> @cttz_ult_other_multiuse_v2i32(<2 x i32> %x, ptr %p) { ; CHECK-LABEL: @cttz_ult_other_multiuse_v2i32( ; CHECK-NEXT: [[TZ:%.*]] = tail call range(i32 0, 33) <2 x i32> @llvm.cttz.v2i32(<2 x i32> [[X:%.*]], i1 false) ; CHECK-NEXT: store <2 x i32> [[TZ]], ptr [[P:%.*]], align 8 -; CHECK-NEXT: [[CMP:%.*]] = icmp ult <2 x i32> [[TZ]], +; CHECK-NEXT: [[CMP:%.*]] = icmp samesign ult <2 x i32> [[TZ]], ; CHECK-NEXT: ret <2 x i1> [[CMP]] ; %tz = tail call <2 x i32> @llvm.cttz.v2i32(<2 x i32> %x, i1 false) diff --git a/llvm/test/Transforms/InstCombine/eq-of-parts.ll b/llvm/test/Transforms/InstCombine/eq-of-parts.ll index 217e37b85933949fa4e706de2528b1b0dde4d1c8..afe5d6af1fcd493c1e1c05f00b60a2804d979029 100644 --- a/llvm/test/Transforms/InstCombine/eq-of-parts.ll +++ b/llvm/test/Transforms/InstCombine/eq-of-parts.ll @@ -1438,3 +1438,103 @@ define i1 @ne_optimized_highbits_cmp_todo_overlapping(i32 %x, i32 %y) { %r = or i1 %cmp_hi, %cmp_lo ret i1 %r } + +define i1 @and_trunc_i1(i8 %a1, i8 %a2) { +; CHECK-LABEL: @and_trunc_i1( +; CHECK-NEXT: [[XOR:%.*]] = xor i8 [[A1:%.*]], [[A2:%.*]] +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i8 [[XOR]], 2 +; CHECK-NEXT: [[LOBIT:%.*]] = trunc i8 [[XOR]] to i1 +; CHECK-NEXT: [[LOBIT_INV:%.*]] = xor i1 [[LOBIT]], true +; CHECK-NEXT: [[AND:%.*]] = and i1 [[CMP]], [[LOBIT_INV]] +; CHECK-NEXT: ret i1 [[AND]] +; + %xor = xor i8 %a1, %a2 + %cmp = icmp ult i8 %xor, 2 + %lobit = trunc i8 %xor to i1 + %lobit.inv = xor i1 %lobit, true + %and = and i1 %cmp, %lobit.inv + ret i1 %and +} + +define i1 @and_trunc_i1_wrong_const(i8 %a1, i8 %a2) { +; CHECK-LABEL: @and_trunc_i1_wrong_const( +; CHECK-NEXT: [[XOR:%.*]] = xor i8 [[A1:%.*]], [[A2:%.*]] +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i8 [[XOR]], 4 +; CHECK-NEXT: [[LOBIT:%.*]] = trunc i8 [[XOR]] to i1 +; CHECK-NEXT: [[LOBIT_INV:%.*]] = xor i1 [[LOBIT]], true +; CHECK-NEXT: [[AND:%.*]] = and i1 [[CMP]], [[LOBIT_INV]] +; CHECK-NEXT: ret i1 [[AND]] +; + %xor = xor i8 %a1, %a2 + %cmp = icmp ult i8 %xor, 4 + %lobit = trunc i8 %xor to i1 + %lobit.inv = xor i1 %lobit, true + %and = and i1 %cmp, %lobit.inv + ret i1 %and +} + +define i1 @and_trunc_i1_wrong_operands(i8 %a1, i8 %a2, i8 %a3) { +; CHECK-LABEL: @and_trunc_i1_wrong_operands( +; CHECK-NEXT: [[XOR:%.*]] = xor i8 [[A1:%.*]], [[A2:%.*]] +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i8 [[XOR]], 2 +; CHECK-NEXT: [[XOR2:%.*]] = xor i8 [[A1]], [[A3:%.*]] +; CHECK-NEXT: [[LOBIT:%.*]] = trunc i8 [[XOR2]] to i1 +; CHECK-NEXT: [[LOBIT_INV:%.*]] = xor i1 [[LOBIT]], true +; CHECK-NEXT: [[AND:%.*]] = and i1 [[CMP]], [[LOBIT_INV]] +; CHECK-NEXT: ret i1 [[AND]] +; + %xor = xor i8 %a1, %a2 + %cmp = icmp ult i8 %xor, 2 + %xor2 = xor i8 %a1, %a3 + %lobit = trunc i8 %xor2 to i1 + %lobit.inv = xor i1 %lobit, true + %and = and i1 %cmp, %lobit.inv + ret i1 %and +} + +define i1 @or_trunc_i1(i64 %a1, i64 %a2) { +; CHECK-LABEL: @or_trunc_i1( +; CHECK-NEXT: [[XOR:%.*]] = xor i64 [[A2:%.*]], [[A1:%.*]] +; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i64 [[XOR]], 1 +; CHECK-NEXT: [[TRUNC:%.*]] = trunc i64 [[XOR]] to i1 +; CHECK-NEXT: [[OR:%.*]] = or i1 [[CMP]], 
[[TRUNC]] +; CHECK-NEXT: ret i1 [[OR]] +; + %xor = xor i64 %a2, %a1 + %cmp = icmp ugt i64 %xor, 1 + %trunc = trunc i64 %xor to i1 + %or = or i1 %cmp, %trunc + ret i1 %or +} + +define i1 @or_trunc_i1_wrong_const(i64 %a1, i64 %a2) { +; CHECK-LABEL: @or_trunc_i1_wrong_const( +; CHECK-NEXT: [[XOR:%.*]] = xor i64 [[A2:%.*]], [[A1:%.*]] +; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i64 [[XOR]], 2 +; CHECK-NEXT: [[TRUNC:%.*]] = trunc i64 [[XOR]] to i1 +; CHECK-NEXT: [[OR:%.*]] = or i1 [[CMP]], [[TRUNC]] +; CHECK-NEXT: ret i1 [[OR]] +; + %xor = xor i64 %a2, %a1 + %cmp = icmp ugt i64 %xor, 2 + %trunc = trunc i64 %xor to i1 + %or = or i1 %cmp, %trunc + ret i1 %or +} + +define i1 @or_trunc_i1_wrong_operands(i64 %a1, i64 %a2, i64 %a3) { +; CHECK-LABEL: @or_trunc_i1_wrong_operands( +; CHECK-NEXT: [[XOR:%.*]] = xor i64 [[A2:%.*]], [[A1:%.*]] +; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i64 [[XOR]], 1 +; CHECK-NEXT: [[XOR2:%.*]] = xor i64 [[A3:%.*]], [[A1]] +; CHECK-NEXT: [[TRUNC:%.*]] = trunc i64 [[XOR2]] to i1 +; CHECK-NEXT: [[OR:%.*]] = or i1 [[CMP]], [[TRUNC]] +; CHECK-NEXT: ret i1 [[OR]] +; + %xor = xor i64 %a2, %a1 + %cmp = icmp ugt i64 %xor, 1 + %xor2 = xor i64 %a3, %a1 + %trunc = trunc i64 %xor2 to i1 + %or = or i1 %cmp, %trunc + ret i1 %or +} diff --git a/llvm/test/Transforms/InstCombine/fold-ctpop-of-not.ll b/llvm/test/Transforms/InstCombine/fold-ctpop-of-not.ll index 4626d19bd2899ddde2579fa4ba93178191f28d66..6d5fde364c23531571fe05f18436feac36a8e930 100644 --- a/llvm/test/Transforms/InstCombine/fold-ctpop-of-not.ll +++ b/llvm/test/Transforms/InstCombine/fold-ctpop-of-not.ll @@ -160,7 +160,7 @@ define i1 @fold_cmp_ult_ctpop_c(i8 %x, i8 %y, i1 %cond) { ; CHECK-NEXT: [[TMP1:%.*]] = sub i8 -16, [[Y:%.*]] ; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[COND:%.*]], i8 [[X:%.*]], i8 [[TMP1]] ; CHECK-NEXT: [[TMP3:%.*]] = call range(i8 0, 9) i8 @llvm.ctpop.i8(i8 [[TMP2]]) -; CHECK-NEXT: [[R:%.*]] = icmp ugt i8 [[TMP3]], 3 +; CHECK-NEXT: [[R:%.*]] = icmp samesign ugt i8 [[TMP3]], 3 ; CHECK-NEXT: ret i1 [[R]] ; %nx = xor i8 %x, -1 @@ -176,7 +176,7 @@ define i1 @fold_cmp_sle_ctpop_c(i8 %x, i8 %y, i1 %cond) { ; CHECK-NEXT: [[TMP1:%.*]] = sub i8 -16, [[Y:%.*]] ; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[COND:%.*]], i8 [[X:%.*]], i8 [[TMP1]] ; CHECK-NEXT: [[TMP3:%.*]] = call range(i8 0, 9) i8 @llvm.ctpop.i8(i8 [[TMP2]]) -; CHECK-NEXT: [[R:%.*]] = icmp ugt i8 [[TMP3]], 4 +; CHECK-NEXT: [[R:%.*]] = icmp samesign ugt i8 [[TMP3]], 4 ; CHECK-NEXT: ret i1 [[R]] ; %nx = xor i8 %x, -1 @@ -191,7 +191,7 @@ define i1 @fold_cmp_ult_ctpop_c_no_not_inst_save_fail(i8 %x) { ; CHECK-LABEL: @fold_cmp_ult_ctpop_c_no_not_inst_save_fail( ; CHECK-NEXT: [[NX:%.*]] = xor i8 [[X:%.*]], -2 ; CHECK-NEXT: [[CNT:%.*]] = call range(i8 0, 9) i8 @llvm.ctpop.i8(i8 [[NX]]) -; CHECK-NEXT: [[R:%.*]] = icmp ult i8 [[CNT]], 5 +; CHECK-NEXT: [[R:%.*]] = icmp samesign ult i8 [[CNT]], 5 ; CHECK-NEXT: ret i1 [[R]] ; %nx = xor i8 %x, -2 @@ -203,7 +203,7 @@ define i1 @fold_cmp_ult_ctpop_c_no_not_inst_save_fail(i8 %x) { define <2 x i1> @fold_cmp_ugt_ctpop_c(<2 x i8> %x) { ; CHECK-LABEL: @fold_cmp_ugt_ctpop_c( ; CHECK-NEXT: [[TMP1:%.*]] = call range(i8 0, 9) <2 x i8> @llvm.ctpop.v2i8(<2 x i8> [[X:%.*]]) -; CHECK-NEXT: [[R:%.*]] = icmp ult <2 x i8> [[TMP1]], +; CHECK-NEXT: [[R:%.*]] = icmp samesign ult <2 x i8> [[TMP1]], ; CHECK-NEXT: ret <2 x i1> [[R]] ; %nx = xor <2 x i8> %x, @@ -216,7 +216,7 @@ define <2 x i1> @fold_cmp_ugt_ctpop_c_out_of_range_fail(<2 x i8> %x) { ; CHECK-LABEL: @fold_cmp_ugt_ctpop_c_out_of_range_fail( ; CHECK-NEXT: [[NX:%.*]] = xor <2 x i8> [[X:%.*]], ; 
CHECK-NEXT: [[CNT:%.*]] = call range(i8 0, 9) <2 x i8> @llvm.ctpop.v2i8(<2 x i8> [[NX]]) -; CHECK-NEXT: [[R:%.*]] = icmp ugt <2 x i8> [[CNT]], +; CHECK-NEXT: [[R:%.*]] = icmp samesign ugt <2 x i8> [[CNT]], ; CHECK-NEXT: ret <2 x i1> [[R]] ; %nx = xor <2 x i8> %x, diff --git a/llvm/test/Transforms/InstCombine/fold-log2-ceil-idiom.ll b/llvm/test/Transforms/InstCombine/fold-log2-ceil-idiom.ll index 17e51e73201b168a240c55cf41115409cc636dd3..656673e7cb20ea5075aefee0c8f60f4a0110b7e8 100644 --- a/llvm/test/Transforms/InstCombine/fold-log2-ceil-idiom.ll +++ b/llvm/test/Transforms/InstCombine/fold-log2-ceil-idiom.ll @@ -118,7 +118,7 @@ define i32 @log2_ceil_idiom_x_may_be_zero(i32 %x) { ; CHECK-NEXT: [[CTLZ:%.*]] = tail call range(i32 0, 33) i32 @llvm.ctlz.i32(i32 [[X]], i1 false) ; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[CTLZ]], 31 ; CHECK-NEXT: [[CTPOP:%.*]] = tail call range(i32 0, 33) i32 @llvm.ctpop.i32(i32 [[X]]) -; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i32 [[CTPOP]], 1 +; CHECK-NEXT: [[CMP:%.*]] = icmp samesign ugt i32 [[CTPOP]], 1 ; CHECK-NEXT: [[ZEXT:%.*]] = zext i1 [[CMP]] to i32 ; CHECK-NEXT: [[RET:%.*]] = add nuw nsw i32 [[XOR]], [[ZEXT]] ; CHECK-NEXT: ret i32 [[RET]] @@ -139,7 +139,7 @@ define i4 @log2_ceil_idiom_trunc_too_short(i32 %x) { ; CHECK-NEXT: [[TRUNC:%.*]] = trunc i32 [[CTLZ]] to i4 ; CHECK-NEXT: [[XOR:%.*]] = xor i4 [[TRUNC]], -1 ; CHECK-NEXT: [[CTPOP:%.*]] = tail call range(i32 0, 33) i32 @llvm.ctpop.i32(i32 [[X]]) -; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i32 [[CTPOP]], 1 +; CHECK-NEXT: [[CMP:%.*]] = icmp samesign ugt i32 [[CTPOP]], 1 ; CHECK-NEXT: [[ZEXT:%.*]] = zext i1 [[CMP]] to i4 ; CHECK-NEXT: [[RET:%.*]] = add i4 [[XOR]], [[ZEXT]] ; CHECK-NEXT: ret i4 [[RET]] @@ -160,7 +160,7 @@ define i32 @log2_ceil_idiom_mismatched_operands(i32 %x, i32 %y) { ; CHECK-NEXT: [[CTLZ:%.*]] = tail call range(i32 0, 33) i32 @llvm.ctlz.i32(i32 [[X]], i1 true) ; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[CTLZ]], 31 ; CHECK-NEXT: [[CTPOP:%.*]] = tail call range(i32 0, 33) i32 @llvm.ctpop.i32(i32 [[Y]]) -; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i32 [[CTPOP]], 1 +; CHECK-NEXT: [[CMP:%.*]] = icmp samesign ugt i32 [[CTPOP]], 1 ; CHECK-NEXT: [[ZEXT:%.*]] = zext i1 [[CMP]] to i32 ; CHECK-NEXT: [[RET:%.*]] = add nuw nsw i32 [[XOR]], [[ZEXT]] ; CHECK-NEXT: ret i32 [[RET]] @@ -180,7 +180,7 @@ define i32 @log2_ceil_idiom_wrong_constant(i32 %x) { ; CHECK-NEXT: [[CTLZ:%.*]] = tail call range(i32 0, 33) i32 @llvm.ctlz.i32(i32 [[X]], i1 true) ; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[CTLZ]], 30 ; CHECK-NEXT: [[CTPOP:%.*]] = tail call range(i32 0, 33) i32 @llvm.ctpop.i32(i32 [[X]]) -; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i32 [[CTPOP]], 1 +; CHECK-NEXT: [[CMP:%.*]] = icmp samesign ugt i32 [[CTPOP]], 1 ; CHECK-NEXT: [[ZEXT:%.*]] = zext i1 [[CMP]] to i32 ; CHECK-NEXT: [[RET:%.*]] = add nuw nsw i32 [[XOR]], [[ZEXT]] ; CHECK-NEXT: ret i32 [[RET]] @@ -220,7 +220,7 @@ define i32 @log2_ceil_idiom_not_a_power2_test2(i32 %x) { ; CHECK-NEXT: [[CTLZ:%.*]] = tail call range(i32 0, 33) i32 @llvm.ctlz.i32(i32 [[X]], i1 true) ; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[CTLZ]], 31 ; CHECK-NEXT: [[CTPOP:%.*]] = tail call range(i32 0, 33) i32 @llvm.ctpop.i32(i32 [[X]]) -; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i32 [[CTPOP]], 2 +; CHECK-NEXT: [[CMP:%.*]] = icmp samesign ugt i32 [[CTPOP]], 2 ; CHECK-NEXT: [[ZEXT:%.*]] = zext i1 [[CMP]] to i32 ; CHECK-NEXT: [[RET:%.*]] = add nuw nsw i32 [[XOR]], [[ZEXT]] ; CHECK-NEXT: ret i32 [[RET]] @@ -241,7 +241,7 @@ define i32 @log2_ceil_idiom_multiuse2(i32 %x) { ; CHECK-NEXT: call void @use32(i32 [[CTLZ]]) ; CHECK-NEXT: 
[[XOR:%.*]] = xor i32 [[CTLZ]], 31 ; CHECK-NEXT: [[CTPOP:%.*]] = tail call range(i32 0, 33) i32 @llvm.ctpop.i32(i32 [[X]]) -; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i32 [[CTPOP]], 1 +; CHECK-NEXT: [[CMP:%.*]] = icmp samesign ugt i32 [[CTPOP]], 1 ; CHECK-NEXT: [[ZEXT:%.*]] = zext i1 [[CMP]] to i32 ; CHECK-NEXT: [[RET:%.*]] = add nuw nsw i32 [[XOR]], [[ZEXT]] ; CHECK-NEXT: ret i32 [[RET]] @@ -263,7 +263,7 @@ define i32 @log2_ceil_idiom_multiuse3(i32 %x) { ; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[CTLZ]], 31 ; CHECK-NEXT: call void @use32(i32 [[XOR]]) ; CHECK-NEXT: [[CTPOP:%.*]] = tail call range(i32 0, 33) i32 @llvm.ctpop.i32(i32 [[X]]) -; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i32 [[CTPOP]], 1 +; CHECK-NEXT: [[CMP:%.*]] = icmp samesign ugt i32 [[CTPOP]], 1 ; CHECK-NEXT: [[ZEXT:%.*]] = zext i1 [[CMP]] to i32 ; CHECK-NEXT: [[RET:%.*]] = add nuw nsw i32 [[XOR]], [[ZEXT]] ; CHECK-NEXT: ret i32 [[RET]] @@ -286,7 +286,7 @@ define i5 @log2_ceil_idiom_trunc_multiuse4(i32 %x) { ; CHECK-NEXT: call void @use5(i5 [[TRUNC]]) ; CHECK-NEXT: [[XOR:%.*]] = xor i5 [[TRUNC]], -1 ; CHECK-NEXT: [[CTPOP:%.*]] = tail call range(i32 0, 33) i32 @llvm.ctpop.i32(i32 [[X]]) -; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i32 [[CTPOP]], 1 +; CHECK-NEXT: [[CMP:%.*]] = icmp samesign ugt i32 [[CTPOP]], 1 ; CHECK-NEXT: [[ZEXT:%.*]] = zext i1 [[CMP]] to i5 ; CHECK-NEXT: [[RET:%.*]] = add i5 [[XOR]], [[ZEXT]] ; CHECK-NEXT: ret i5 [[RET]] @@ -310,7 +310,7 @@ define i64 @log2_ceil_idiom_zext_multiuse5(i32 %x) { ; CHECK-NEXT: [[EXT:%.*]] = zext nneg i32 [[XOR]] to i64 ; CHECK-NEXT: call void @use64(i64 [[EXT]]) ; CHECK-NEXT: [[CTPOP:%.*]] = tail call range(i32 0, 33) i32 @llvm.ctpop.i32(i32 [[X]]) -; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i32 [[CTPOP]], 1 +; CHECK-NEXT: [[CMP:%.*]] = icmp samesign ugt i32 [[CTPOP]], 1 ; CHECK-NEXT: [[ZEXT:%.*]] = zext i1 [[CMP]] to i64 ; CHECK-NEXT: [[RET:%.*]] = add nuw nsw i64 [[EXT]], [[ZEXT]] ; CHECK-NEXT: ret i64 [[RET]] diff --git a/llvm/test/Transforms/InstCombine/gep-combine-loop-invariant.ll b/llvm/test/Transforms/InstCombine/gep-combine-loop-invariant.ll index b2bc1abeaba568b654a8becbe7a4485f3b52b571..1cb7cf99bea325d9c84fd78b5b67dc1edfa4168b 100644 --- a/llvm/test/Transforms/InstCombine/gep-combine-loop-invariant.ll +++ b/llvm/test/Transforms/InstCombine/gep-combine-loop-invariant.ll @@ -217,7 +217,7 @@ define float @gep_cross_loop(ptr %_arg_, ptr %_arg_3, float %_arg_8) { ; CHECK: for.cond.i: ; CHECK-NEXT: [[IDX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[ADD11_I:%.*]], [[FOR_BODY_I:%.*]] ] ; CHECK-NEXT: [[SUM:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[ADD_I:%.*]], [[FOR_BODY_I]] ] -; CHECK-NEXT: [[CMP:%.*]] = icmp ult i64 [[IDX]], 17 +; CHECK-NEXT: [[CMP:%.*]] = icmp samesign ult i64 [[IDX]], 17 ; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY_I]], label [[FOR_COND_I_I_I_PREHEADER:%.*]] ; CHECK: for.cond.i.i.i.preheader: ; CHECK-NEXT: ret float [[SUM]] diff --git a/llvm/test/Transforms/InstCombine/icmp-and-shift.ll b/llvm/test/Transforms/InstCombine/icmp-and-shift.ll index 684ece21b1166e72cbbeb327e2c0906e3163b1bd..d092363309fec023f4d80e0a535957466ef37d63 100644 --- a/llvm/test/Transforms/InstCombine/icmp-and-shift.ll +++ b/llvm/test/Transforms/InstCombine/icmp-and-shift.ll @@ -619,6 +619,19 @@ define i1 @test_shr_and_1_ne_0(i32 %a, i32 %b) { ret i1 %cmp } +define i1 @test_shr_and_1_ne_0_samesign(i32 %a, i32 %b) { +; CHECK-LABEL: @test_shr_and_1_ne_0_samesign( +; CHECK-NEXT: [[TMP1:%.*]] = shl nuw i32 1, [[B:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[A:%.*]], [[TMP1]] +; CHECK-NEXT: [[CMP:%.*]] = icmp 
ne i32 [[TMP2]], 0
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %shr = lshr i32 %a, %b
+  %and = and i32 %shr, 1
+  %cmp = icmp samesign ne i32 %and, 0
+  ret i1 %cmp
+}
+
 define i1 @test_const_shr_and_1_ne_0(i32 %b) {
 ; CHECK-LABEL: @test_const_shr_and_1_ne_0(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shl nuw i32 1, [[B:%.*]]
diff --git a/llvm/test/Transforms/InstCombine/icmp-equality-test.ll b/llvm/test/Transforms/InstCombine/icmp-equality-test.ll
index c2740ca7fe8aa9d2f9df26bfd29e73d2aa6e2cfb..b9d8f2d54def3d35c301c0fbd2fdc004a581b8e9 100644
--- a/llvm/test/Transforms/InstCombine/icmp-equality-test.ll
+++ b/llvm/test/Transforms/InstCombine/icmp-equality-test.ll
@@ -33,6 +33,22 @@ entry:
   ret i1 %equal
 }
 
+define i1 @icmp_equality_test_constant_samesign(i42 %X, i42 %Y) {
+; CHECK-LABEL: @icmp_equality_test_constant_samesign(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[XEQY:%.*]] = icmp eq i42 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    ret i1 [[XEQY]]
+;
+entry:
+  %XeqC = icmp eq i42 %X, -42
+  %YeqC = icmp eq i42 %Y, -42
+  %XeqY = icmp samesign eq i42 %X, %Y
+  %not.YeqC = xor i1 %YeqC, true
+  %and = select i1 %not.YeqC, i1 %XeqY, i1 false
+  %equal = select i1 %XeqC, i1 %YeqC, i1 %and
+  ret i1 %equal
+}
+
 define i1 @icmp_equality_test_swift_optional_pointers(i64 %X, i64 %Y) {
 ; CHECK-LABEL: @icmp_equality_test_swift_optional_pointers(
 ; CHECK-NEXT:  entry:
diff --git a/llvm/test/Transforms/InstCombine/icmp-mul-zext.ll b/llvm/test/Transforms/InstCombine/icmp-mul-zext.ll
index 07536f271ceb1905d18c1cd121005f01cea3baa0..653b818f7eb5c453ea7e47cb8f763837b80c1c44 100644
--- a/llvm/test/Transforms/InstCombine/icmp-mul-zext.ll
+++ b/llvm/test/Transforms/InstCombine/icmp-mul-zext.ll
@@ -13,7 +13,7 @@ define i32 @sterix(i32, i8, i64) {
 ; CHECK-NEXT:    [[SHR:%.*]] = lshr i32 [[MUL]], [[SH_PROM]]
 ; CHECK-NEXT:    [[CONV2:%.*]] = zext i32 [[SHR]] to i64
 ; CHECK-NEXT:    [[MUL3:%.*]] = mul nuw nsw i64 [[CONV]], [[CONV2]]
-; CHECK-NEXT:    [[TOBOOL_NOT:%.*]] = icmp ult i64 [[MUL3]], 4294967296
+; CHECK-NEXT:    [[TOBOOL_NOT:%.*]] = icmp samesign ult i64 [[MUL3]], 4294967296
 ; CHECK-NEXT:    br i1 [[TOBOOL_NOT]], label [[LOR_RHS:%.*]], label [[LOR_END:%.*]]
 ; CHECK:       lor.rhs:
 ; CHECK-NEXT:    [[AND:%.*]] = and i64 [[TMP2]], [[MUL3]]
diff --git a/llvm/test/Transforms/InstCombine/icmp-mul.ll b/llvm/test/Transforms/InstCombine/icmp-mul.ll
index a14f342ae2482baf41442f3b1f7d2dc78cadd96d..c4543c9deef38830d1df2fd7b2fa14735ecd02eb 100644
--- a/llvm/test/Transforms/InstCombine/icmp-mul.ll
+++ b/llvm/test/Transforms/InstCombine/icmp-mul.ll
@@ -849,7 +849,7 @@ define i1 @not_mul_of_bool(i32 %x, i8 %y) {
 ; CHECK-NEXT:    [[Q:%.*]] = and i32 [[X:%.*]], 3
 ; CHECK-NEXT:    [[Z:%.*]] = zext i8 [[Y:%.*]] to i32
 ; CHECK-NEXT:    [[M:%.*]] = mul nuw nsw i32 [[Q]], [[Z]]
-; CHECK-NEXT:    [[R:%.*]] = icmp ugt i32 [[M]], 255
+; CHECK-NEXT:    [[R:%.*]] = icmp samesign ugt i32 [[M]], 255
 ; CHECK-NEXT:    ret i1 [[R]]
 ;
   %q = and i32 %x, 3
@@ -866,7 +866,7 @@ define i1 @not_mul_of_bool_commute(i32 %x, i32 %y) {
 ; CHECK-NEXT:    [[X30:%.*]] = lshr i32 [[X:%.*]], 30
 ; CHECK-NEXT:    [[Y8:%.*]] = and i32 [[Y:%.*]], 255
 ; CHECK-NEXT:    [[M:%.*]] = mul nuw nsw i32 [[Y8]], [[X30]]
-; CHECK-NEXT:    [[R:%.*]] = icmp ugt i32 [[M]], 255
+; CHECK-NEXT:    [[R:%.*]] = icmp samesign ugt i32 [[M]], 255
 ; CHECK-NEXT:    ret i1 [[R]]
 ;
   %x30 = lshr i32 %x, 30
@@ -935,7 +935,7 @@ define i1 @not_mul_of_pow2(i32 %x, i8 %y) {
 ; CHECK-NEXT:    [[Q:%.*]] = and i32 [[X:%.*]], 6
 ; CHECK-NEXT:    [[Z:%.*]] = zext i8 [[Y:%.*]] to i32
 ; CHECK-NEXT:    [[M:%.*]] = mul nuw nsw i32 [[Q]], [[Z]]
-; CHECK-NEXT:    [[R:%.*]] = icmp ugt i32 [[M]], 1530
+; CHECK-NEXT:    [[R:%.*]] = icmp samesign ugt i32 [[M]], 1530
 ; CHECK-NEXT:    ret i1 [[R]]
 ;
   %q = and i32 %x, 6
@@ -952,7 +952,7 @@ define i1 @not_mul_of_pow2_commute(i32 %x, i32 %y) {
 ; CHECK-NEXT:    [[X30:%.*]] = and i32 [[X:%.*]], 12
 ; CHECK-NEXT:    [[Y8:%.*]] = and i32 [[Y:%.*]], 255
 ; CHECK-NEXT:    [[M:%.*]] = mul nuw nsw i32 [[Y8]], [[X30]]
-; CHECK-NEXT:    [[R:%.*]] = icmp ugt i32 [[M]], 3060
+; CHECK-NEXT:    [[R:%.*]] = icmp samesign ugt i32 [[M]], 3060
 ; CHECK-NEXT:    ret i1 [[R]]
 ;
   %x30 = and i32 %x, 12
diff --git a/llvm/test/Transforms/InstCombine/icmp-ne-pow2.ll b/llvm/test/Transforms/InstCombine/icmp-ne-pow2.ll
index 618f5d641dc1abbaf5e3ab7c03ab60b163352d51..b19909a234481188977de150ed4c66ae42acef42 100644
--- a/llvm/test/Transforms/InstCombine/icmp-ne-pow2.ll
+++ b/llvm/test/Transforms/InstCombine/icmp-ne-pow2.ll
@@ -350,7 +350,7 @@ define i32 @not_pow2_32_nonconst_assume(i32 %x, i32 %y) {
 define i32 @pow2_or_zero_32_nonconst_assume(i32 %x, i32 %y) {
 ; CHECK-LABEL: @pow2_or_zero_32_nonconst_assume(
 ; CHECK-NEXT:    [[CTPOP:%.*]] = call range(i32 0, 33) i32 @llvm.ctpop.i32(i32 [[Y:%.*]])
-; CHECK-NEXT:    [[YP2:%.*]] = icmp ult i32 [[CTPOP]], 2
+; CHECK-NEXT:    [[YP2:%.*]] = icmp samesign ult i32 [[CTPOP]], 2
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[YP2]])
 ; CHECK-NEXT:    [[AND:%.*]] = and i32 [[X:%.*]], [[Y]]
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp ne i32 [[AND]], 0
@@ -426,7 +426,7 @@ False:
 define i32 @pow2_or_zero_32_nonconst_assume_br(i32 %x, i32 %y) {
 ; CHECK-LABEL: @pow2_or_zero_32_nonconst_assume_br(
 ; CHECK-NEXT:    [[CTPOP:%.*]] = call range(i32 0, 33) i32 @llvm.ctpop.i32(i32 [[Y:%.*]])
-; CHECK-NEXT:    [[YP2:%.*]] = icmp ult i32 [[CTPOP]], 2
+; CHECK-NEXT:    [[YP2:%.*]] = icmp samesign ult i32 [[CTPOP]], 2
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[YP2]])
 ; CHECK-NEXT:    [[AND:%.*]] = and i32 [[X:%.*]], [[Y]]
 ; CHECK-NEXT:    [[CMP_NOT:%.*]] = icmp eq i32 [[AND]], 0
diff --git a/llvm/test/Transforms/InstCombine/icmp-of-trunc-ext.ll b/llvm/test/Transforms/InstCombine/icmp-of-trunc-ext.ll
index f2a02fac90b17c731437db9243b53929b7b74d93..2c2de5dbf09f6e5959d8a68cdbe7582a2f253055 100644
--- a/llvm/test/Transforms/InstCombine/icmp-of-trunc-ext.ll
+++ b/llvm/test/Transforms/InstCombine/icmp-of-trunc-ext.ll
@@ -70,7 +70,7 @@ define i1 @icmp_trunc_x_trunc_y_2_illegal_anyways(i33 %x, i63 %y) {
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[X_LB_ONLY]])
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[Y_LB_ONLY]])
 ; CHECK-NEXT:    [[TMP1:%.*]] = zext nneg i33 [[X]] to i63
-; CHECK-NEXT:    [[R:%.*]] = icmp ult i63 [[Y]], [[TMP1]]
+; CHECK-NEXT:    [[R:%.*]] = icmp samesign ult i63 [[Y]], [[TMP1]]
 ; CHECK-NEXT:    ret i1 [[R]]
 ;
   %x_lb_only = icmp ult i33 %x, 512
@@ -90,7 +90,7 @@ define i1 @icmp_trunc_x_trunc_y_3(i64 %x, i32 %y) {
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[X_LB_ONLY]])
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[Y_LB_ONLY]])
 ; CHECK-NEXT:    [[TMP1:%.*]] = trunc nuw nsw i64 [[X]] to i32
-; CHECK-NEXT:    [[R:%.*]] = icmp ule i32 [[Y]], [[TMP1]]
+; CHECK-NEXT:    [[R:%.*]] = icmp samesign ule i32 [[Y]], [[TMP1]]
 ; CHECK-NEXT:    ret i1 [[R]]
 ;
   %x_lb_only = icmp ult i64 %x, 123
@@ -152,7 +152,7 @@ define i1 @icmp_trunc_x_trunc_y_swap0(i33 %x, i32 %y) {
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[X_LB_ONLY]])
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[Y_LB_ONLY]])
 ; CHECK-NEXT:    [[TMP1:%.*]] = trunc nuw nsw i33 [[X]] to i32
-; CHECK-NEXT:    [[R:%.*]] = icmp uge i32 [[Y]], [[TMP1]]
+; CHECK-NEXT:    [[R:%.*]] = icmp samesign uge i32 [[Y]], [[TMP1]]
 ; CHECK-NEXT:    ret i1 [[R]]
 ;
   %x_lb_only = icmp ult i33 %x, 65536
@@ -172,7 +172,7 @@ define i1 @icmp_trunc_x_trunc_y_swap1(i33 %x, i32 %y) {
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[X_LB_ONLY]])
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[Y_LB_ONLY]])
 ; CHECK-NEXT:    [[TMP1:%.*]] = trunc nuw nsw i33 [[X]] to i32
-; CHECK-NEXT:    [[R:%.*]] = icmp ule i32 [[Y]], [[TMP1]]
+; CHECK-NEXT:    [[R:%.*]] = icmp samesign ule i32 [[Y]], [[TMP1]]
 ; CHECK-NEXT:    ret i1 [[R]]
 ;
   %x_lb_only = icmp ult i33 %x, 65536
@@ -190,7 +190,7 @@ define i1 @icmp_trunc_x_zext_y(i32 %x, i8 %y) {
 ; CHECK-NEXT:    [[X_LB_ONLY:%.*]] = icmp ult i32 [[X:%.*]], 65536
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[X_LB_ONLY]])
 ; CHECK-NEXT:    [[TMP1:%.*]] = zext i8 [[Y:%.*]] to i32
-; CHECK-NEXT:    [[R:%.*]] = icmp ugt i32 [[X]], [[TMP1]]
+; CHECK-NEXT:    [[R:%.*]] = icmp samesign ugt i32 [[X]], [[TMP1]]
 ; CHECK-NEXT:    ret i1 [[R]]
 ;
   %x_lb_only = icmp ult i32 %x, 65536
@@ -206,7 +206,7 @@ define i1 @icmp_trunc_x_zext_y_2(i32 %x, i8 %y) {
 ; CHECK-NEXT:    [[X_LB_ONLY:%.*]] = icmp ult i32 [[X:%.*]], 65536
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[X_LB_ONLY]])
 ; CHECK-NEXT:    [[TMP1:%.*]] = zext i8 [[Y:%.*]] to i32
-; CHECK-NEXT:    [[R:%.*]] = icmp ule i32 [[X]], [[TMP1]]
+; CHECK-NEXT:    [[R:%.*]] = icmp samesign ule i32 [[X]], [[TMP1]]
 ; CHECK-NEXT:    ret i1 [[R]]
 ;
   %x_lb_only = icmp ult i32 %x, 65536
diff --git a/llvm/test/Transforms/InstCombine/icmp-range.ll b/llvm/test/Transforms/InstCombine/icmp-range.ll
index 8b690826a7bf9a396765b94fc23fc7f8fb910cc1..2db5bad17b1977f176c526a039f720a961d06864 100644
--- a/llvm/test/Transforms/InstCombine/icmp-range.ll
+++ b/llvm/test/Transforms/InstCombine/icmp-range.ll
@@ -140,7 +140,7 @@ define i1 @test_two_ranges(ptr nocapture readonly %arg1, ptr nocapture readonly
 ; CHECK-LABEL: @test_two_ranges(
 ; CHECK-NEXT:    [[VAL1:%.*]] = load i32, ptr [[ARG1:%.*]], align 4, !range [[RNG4:![0-9]+]]
 ; CHECK-NEXT:    [[VAL2:%.*]] = load i32, ptr [[ARG2:%.*]], align 4, !range [[RNG5:![0-9]+]]
-; CHECK-NEXT:    [[RVAL:%.*]] = icmp ult i32 [[VAL2]], [[VAL1]]
+; CHECK-NEXT:    [[RVAL:%.*]] = icmp samesign ult i32 [[VAL2]], [[VAL1]]
 ; CHECK-NEXT:    ret i1 [[RVAL]]
 ;
   %val1 = load i32, ptr %arg1, !range !5
@@ -152,7 +152,7 @@ define i1 @test_two_ranges(ptr nocapture readonly %arg1, ptr nocapture readonly
 ; Values' ranges overlap each other, so it can not be simplified.
 define i1 @test_two_attribute_ranges(i32 range(i32 5, 10) %arg1, i32 range(i32 8, 16) %arg2) {
 ; CHECK-LABEL: @test_two_attribute_ranges(
-; CHECK-NEXT:    [[RVAL:%.*]] = icmp ult i32 [[ARG2:%.*]], [[ARG1:%.*]]
+; CHECK-NEXT:    [[RVAL:%.*]] = icmp samesign ult i32 [[ARG2:%.*]], [[ARG1:%.*]]
 ; CHECK-NEXT:    ret i1 [[RVAL]]
 ;
   %rval = icmp ult i32 %arg2, %arg1
@@ -215,7 +215,7 @@ define <2 x i1> @test_two_ranges_vec(ptr nocapture readonly %arg1, ptr nocapture
 ; CHECK-LABEL: @test_two_ranges_vec(
 ; CHECK-NEXT:    [[VAL1:%.*]] = load <2 x i32>, ptr [[ARG1:%.*]], align 8, !range [[RNG4]]
 ; CHECK-NEXT:    [[VAL2:%.*]] = load <2 x i32>, ptr [[ARG2:%.*]], align 8, !range [[RNG5]]
-; CHECK-NEXT:    [[RVAL:%.*]] = icmp ult <2 x i32> [[VAL2]], [[VAL1]]
+; CHECK-NEXT:    [[RVAL:%.*]] = icmp samesign ult <2 x i32> [[VAL2]], [[VAL1]]
 ; CHECK-NEXT:    ret <2 x i1> [[RVAL]]
 ;
   %val1 = load <2 x i32>, ptr %arg1, !range !5
@@ -249,7 +249,7 @@ define <2 x i1> @test_two_ranges_vec_true(ptr nocapture readonly %arg1, ptr noca
 ; Values' ranges overlap each other, so it can not be simplified.
 define <2 x i1> @test_two_argument_ranges_vec(<2 x i32> range(i32 5, 10) %arg1, <2 x i32> range(i32 8, 16) %arg2) {
 ; CHECK-LABEL: @test_two_argument_ranges_vec(
-; CHECK-NEXT:    [[RVAL:%.*]] = icmp ult <2 x i32> [[ARG2:%.*]], [[ARG1:%.*]]
+; CHECK-NEXT:    [[RVAL:%.*]] = icmp samesign ult <2 x i32> [[ARG2:%.*]], [[ARG1:%.*]]
 ; CHECK-NEXT:    ret <2 x i1> [[RVAL]]
 ;
   %rval = icmp ult <2 x i32> %arg2, %arg1
@@ -283,7 +283,7 @@ define i1 @test_two_return_attribute_ranges_not_simplified() {
 ; CHECK-LABEL: @test_two_return_attribute_ranges_not_simplified(
 ; CHECK-NEXT:    [[VAL1:%.*]] = call range(i32 5, 10) i32 @create_range1()
 ; CHECK-NEXT:    [[VAL2:%.*]] = call i32 @create_range2()
-; CHECK-NEXT:    [[RVAL:%.*]] = icmp ult i32 [[VAL2]], [[VAL1]]
+; CHECK-NEXT:    [[RVAL:%.*]] = icmp samesign ult i32 [[VAL2]], [[VAL1]]
 ; CHECK-NEXT:    ret i1 [[RVAL]]
 ;
   %val1 = call range(i32 5, 10) i32 @create_range1()
diff --git a/llvm/test/Transforms/InstCombine/icmp-shr.ll b/llvm/test/Transforms/InstCombine/icmp-shr.ll
index 71b4f5a970c2f62e0475648e352958a540adc363..bdcba9ed1549b96ff2ba12d2ec3d3766d8765f3d 100644
--- a/llvm/test/Transforms/InstCombine/icmp-shr.ll
+++ b/llvm/test/Transforms/InstCombine/icmp-shr.ll
@@ -1072,7 +1072,7 @@ define <2 x i1> @lshr_pow2_ugt_vec(<2 x i8> %x) {
 define i1 @lshr_not_pow2_ugt(i8 %x) {
 ; CHECK-LABEL: @lshr_not_pow2_ugt(
 ; CHECK-NEXT:    [[S:%.*]] = lshr i8 3, [[X:%.*]]
-; CHECK-NEXT:    [[R:%.*]] = icmp ugt i8 [[S]], 1
+; CHECK-NEXT:    [[R:%.*]] = icmp samesign ugt i8 [[S]], 1
 ; CHECK-NEXT:    ret i1 [[R]]
 ;
   %s = lshr i8 3, %x
@@ -1095,7 +1095,7 @@ define i1 @lshr_pow2_ugt1(i8 %x) {
 define i1 @ashr_pow2_ugt(i8 %x) {
 ; CHECK-LABEL: @ashr_pow2_ugt(
 ; CHECK-NEXT:    [[S:%.*]] = ashr exact i8 -128, [[X:%.*]]
-; CHECK-NEXT:    [[R:%.*]] = icmp ugt i8 [[S]], -96
+; CHECK-NEXT:    [[R:%.*]] = icmp samesign ugt i8 [[S]], -96
 ; CHECK-NEXT:    ret i1 [[R]]
 ;
   %s = ashr i8 128, %x
@@ -1154,7 +1154,7 @@ define <2 x i1> @lshr_pow2_ult_vec(<2 x i8> %x) {
 define i1 @lshr_not_pow2_ult(i8 %x) {
 ; CHECK-LABEL: @lshr_not_pow2_ult(
 ; CHECK-NEXT:    [[S:%.*]] = lshr i8 3, [[X:%.*]]
-; CHECK-NEXT:    [[R:%.*]] = icmp ult i8 [[S]], 2
+; CHECK-NEXT:    [[R:%.*]] = icmp samesign ult i8 [[S]], 2
 ; CHECK-NEXT:    ret i1 [[R]]
 ;
   %s = lshr i8 3, %x
@@ -1187,7 +1187,7 @@ define i1 @lshr_pow2_ult_smin(i8 %x) {
 define i1 @ashr_pow2_ult(i8 %x) {
 ; CHECK-LABEL: @ashr_pow2_ult(
 ; CHECK-NEXT:    [[S:%.*]] = ashr exact i8 -128, [[X:%.*]]
-; CHECK-NEXT:    [[R:%.*]] = icmp ult i8 [[S]], -96
+; CHECK-NEXT:    [[R:%.*]] = icmp samesign ult i8 [[S]], -96
 ; CHECK-NEXT:    ret i1 [[R]]
 ;
   %s = ashr i8 128, %x
@@ -1631,7 +1631,7 @@ define i1 @slt_zero_ult_i1_fail1(i32 %a, i1 %b) {
 ; CHECK-LABEL: @slt_zero_ult_i1_fail1(
 ; CHECK-NEXT:    [[CONV:%.*]] = zext i1 [[B:%.*]] to i32
 ; CHECK-NEXT:    [[CMP1:%.*]] = lshr i32 [[A:%.*]], 30
-; CHECK-NEXT:    [[CMP2:%.*]] = icmp ugt i32 [[CMP1]], [[CONV]]
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp samesign ugt i32 [[CMP1]], [[CONV]]
 ; CHECK-NEXT:    ret i1 [[CMP2]]
 ;
   %conv = zext i1 %b to i32
diff --git a/llvm/test/Transforms/InstCombine/icmp-uge-of-not-of-shl-allones-by-bits-and-val-to-icmp-eq-of-lshr-val-by-bits-and-0.ll b/llvm/test/Transforms/InstCombine/icmp-uge-of-not-of-shl-allones-by-bits-and-val-to-icmp-eq-of-lshr-val-by-bits-and-0.ll
index ba47ed02edbdf408236bdd0385b42199dbe56901..c185e632b5194f1ebb515b1a9047e527adf1d022 100644
--- a/llvm/test/Transforms/InstCombine/icmp-uge-of-not-of-shl-allones-by-bits-and-val-to-icmp-eq-of-lshr-val-by-bits-and-0.ll
+++ b/llvm/test/Transforms/InstCombine/icmp-uge-of-not-of-shl-allones-by-bits-and-val-to-icmp-eq-of-lshr-val-by-bits-and-0.ll
@@ -101,7 +101,7 @@ define i1 @both(i8 %bits0, i8 %bits1) {
 ; CHECK-LABEL: @both(
 ; CHECK-NEXT:    [[T0:%.*]] = shl nsw i8 -1, [[BITS0:%.*]]
 ; CHECK-NEXT:    [[T2:%.*]] = shl nsw i8 -1, [[BITS1:%.*]]
-; CHECK-NEXT:    [[R:%.*]] = icmp ule i8 [[T0]], [[T2]]
+; CHECK-NEXT:    [[R:%.*]] = icmp samesign ule i8 [[T0]], [[T2]]
 ; CHECK-NEXT:    ret i1 [[R]]
 ;
   %t0 = shl i8 -1, %bits0
diff --git a/llvm/test/Transforms/InstCombine/icmp-ult-of-not-of-shl-allones-by-bits-and-val-to-icmp-ne-of-lshr-val-by-bits-and-0.ll b/llvm/test/Transforms/InstCombine/icmp-ult-of-not-of-shl-allones-by-bits-and-val-to-icmp-ne-of-lshr-val-by-bits-and-0.ll
index 37aa85202e5622e04ad9fb2df217264494c2f9da..c1912e11b93abfed4c0f0528993e2ae1251b8587 100644
--- a/llvm/test/Transforms/InstCombine/icmp-ult-of-not-of-shl-allones-by-bits-and-val-to-icmp-ne-of-lshr-val-by-bits-and-0.ll
+++ b/llvm/test/Transforms/InstCombine/icmp-ult-of-not-of-shl-allones-by-bits-and-val-to-icmp-ne-of-lshr-val-by-bits-and-0.ll
@@ -101,7 +101,7 @@ define i1 @both(i8 %bits0, i8 %bits1) {
 ; CHECK-LABEL: @both(
 ; CHECK-NEXT:    [[T0:%.*]] = shl nsw i8 -1, [[BITS0:%.*]]
 ; CHECK-NEXT:    [[T2:%.*]] = shl nsw i8 -1, [[BITS1:%.*]]
-; CHECK-NEXT:    [[R:%.*]] = icmp ugt i8 [[T0]], [[T2]]
+; CHECK-NEXT:    [[R:%.*]] = icmp samesign ugt i8 [[T0]], [[T2]]
 ; CHECK-NEXT:    ret i1 [[R]]
 ;
   %t0 = shl i8 -1, %bits0
diff --git a/llvm/test/Transforms/InstCombine/icmp-vscale.ll b/llvm/test/Transforms/InstCombine/icmp-vscale.ll
index ae1be58938aa3329bdfaef3705a089f1b93cddee..a13e90456d6395a4bdbd9b58b32304bab0c879e3 100644
--- a/llvm/test/Transforms/InstCombine/icmp-vscale.ll
+++ b/llvm/test/Transforms/InstCombine/icmp-vscale.ll
@@ -94,7 +94,7 @@ define i1 @vscale_ule_max() vscale_range(4,8) {
 define i1 @vscale_ult_max() vscale_range(4,8) {
 ; CHECK-LABEL: @vscale_ult_max(
 ; CHECK-NEXT:    [[VSCALE:%.*]] = call i16 @llvm.vscale.i16()
-; CHECK-NEXT:    [[RES:%.*]] = icmp ult i16 [[VSCALE]], 8
+; CHECK-NEXT:    [[RES:%.*]] = icmp samesign ult i16 [[VSCALE]], 8
 ; CHECK-NEXT:    ret i1 [[RES]]
 ;
   %vscale = call i16 @llvm.vscale.i16()
@@ -114,7 +114,7 @@ define i1 @vscale_uge_min() vscale_range(4,8) {
 define i1 @vscale_ugt_min() vscale_range(4,8) {
 ; CHECK-LABEL: @vscale_ugt_min(
 ; CHECK-NEXT:    [[VSCALE:%.*]] = call i16 @llvm.vscale.i16()
-; CHECK-NEXT:    [[RES:%.*]] = icmp ugt i16 [[VSCALE]], 4
+; CHECK-NEXT:    [[RES:%.*]] = icmp samesign ugt i16 [[VSCALE]], 4
 ; CHECK-NEXT:    ret i1 [[RES]]
 ;
   %vscale = call i16 @llvm.vscale.i16()
diff --git a/llvm/test/Transforms/InstCombine/icmp.ll b/llvm/test/Transforms/InstCombine/icmp.ll
index 5e80134b153be7ec5a2636dda2be53a97fa170c4..c695dc1cd69c8b28922bdf4d58aefcd435f33e60 100644
--- a/llvm/test/Transforms/InstCombine/icmp.ll
+++ b/llvm/test/Transforms/InstCombine/icmp.ll
@@ -1457,7 +1457,7 @@ define <2 x i1> @test67vecinverse(<2 x i32> %x) {
 define i1 @test68(i32 %x) {
 ; CHECK-LABEL: @test68(
 ; CHECK-NEXT:    [[AND:%.*]] = and i32 [[X:%.*]], 127
-; CHECK-NEXT:    [[CMP:%.*]] = icmp ugt i32 [[AND]], 30
+; CHECK-NEXT:    [[CMP:%.*]] = icmp samesign ugt i32 [[AND]], 30
 ; CHECK-NEXT:    ret i1 [[CMP]]
 ;
   %and = and i32 %x, 127
@@ -2213,7 +2213,7 @@ define i1 @icmp_and_ashr_mixed_and_shiftout(i8 %x) {
 ; CHECK-LABEL: @icmp_and_ashr_mixed_and_shiftout(
 ; CHECK-NEXT:    [[ASHR:%.*]] = ashr i8 [[X:%.*]], 4
 ; CHECK-NEXT:    [[AND:%.*]] = and i8 [[ASHR]], 31
-; CHECK-NEXT:    [[CMP:%.*]] = icmp ugt i8 [[AND]], 8
+; CHECK-NEXT:    [[CMP:%.*]] = icmp samesign ugt i8 [[AND]], 8
 ; CHECK-NEXT:    ret i1 [[CMP]]
 ;
   %ashr = ashr i8 %x, 4
@@ -3203,6 +3203,21 @@ define i1 @icmp_and_or_lshr(i32 %x, i32 %y) {
   ret i1 %ret
 }
 
+define i1 @icmp_and_or_lshr_samesign(i32 %x, i32 %y) {
+; CHECK-LABEL: @icmp_and_or_lshr_samesign(
+; CHECK-NEXT:    [[SHF1:%.*]] = shl nuw i32 1, [[Y:%.*]]
+; CHECK-NEXT:    [[OR2:%.*]] = or i32 [[SHF1]], 1
+; CHECK-NEXT:    [[AND3:%.*]] = and i32 [[X:%.*]], [[OR2]]
+; CHECK-NEXT:    [[RET:%.*]] = icmp ne i32 [[AND3]], 0
+; CHECK-NEXT:    ret i1 [[RET]]
+;
+  %shf = lshr i32 %x, %y
+  %or = or i32 %shf, %x
+  %and = and i32 %or, 1
+  %ret = icmp samesign ne i32 %and, 0
+  ret i1 %ret
+}
+
 define <2 x i1> @icmp_and_or_lshr_vec(<2 x i32> %x, <2 x i32> %y) {
 ; CHECK-LABEL: @icmp_and_or_lshr_vec(
 ; CHECK-NEXT:    [[SHF:%.*]] = lshr <2 x i32> [[X:%.*]], [[Y:%.*]]
@@ -5319,7 +5334,7 @@ define i1 @test_icmp_shl_sgt(i64 %x) {
 
 define i1 @pr94897(i32 range(i32 -2147483648, 0) %x) {
 ; CHECK-LABEL: @pr94897(
-; CHECK-NEXT:    [[CMP:%.*]] = icmp ugt i32 [[X:%.*]], -3
+; CHECK-NEXT:    [[CMP:%.*]] = icmp samesign ugt i32 [[X:%.*]], -3
 ; CHECK-NEXT:    ret i1 [[CMP]]
 ;
   %shl = shl nsw i32 %x, 24
@@ -5349,7 +5364,7 @@ define i1 @icmp_and_inv_pow2_ne_0(i32 %A, i32 %B) {
 define i1 @icmp_and_inv_pow2_or_zero_ne_0(i32 %A, i32 %B) {
 ; CHECK-LABEL: @icmp_and_inv_pow2_or_zero_ne_0(
 ; CHECK-NEXT:    [[POPCNT:%.*]] = tail call range(i32 0, 33) i32 @llvm.ctpop.i32(i32 [[A:%.*]])
-; CHECK-NEXT:    [[COND:%.*]] = icmp ult i32 [[POPCNT]], 2
+; CHECK-NEXT:    [[COND:%.*]] = icmp samesign ult i32 [[POPCNT]], 2
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[COND]])
 ; CHECK-NEXT:    [[INV:%.*]] = xor i32 [[B:%.*]], -1
 ; CHECK-NEXT:    [[AND:%.*]] = and i32 [[A]], [[INV]]
diff --git a/llvm/test/Transforms/InstCombine/icmp_sdiv_with_and_without_range.ll b/llvm/test/Transforms/InstCombine/icmp_sdiv_with_and_without_range.ll
index baad241f8dad99d3dc6cc24d4a12e779bb5039c8..beab2739250756200f1763d8d563240aef6cacbf 100644
--- a/llvm/test/Transforms/InstCombine/icmp_sdiv_with_and_without_range.ll
+++ b/llvm/test/Transforms/InstCombine/icmp_sdiv_with_and_without_range.ll
@@ -19,7 +19,7 @@ define i1 @without_range(ptr %A) {
 define i1 @with_range(ptr %A) {
 ; CHECK-LABEL: @with_range(
 ; CHECK-NEXT:    [[A_VAL:%.*]] = load i32, ptr [[A:%.*]], align 8, !range [[RNG0:![0-9]+]]
-; CHECK-NEXT:    [[C:%.*]] = icmp ult i32 [[A_VAL]], 2
+; CHECK-NEXT:    [[C:%.*]] = icmp samesign ult i32 [[A_VAL]], 2
 ; CHECK-NEXT:    ret i1 [[C]]
 ;
   %A.val = load i32, ptr %A, align 8, !range !0
diff --git a/llvm/test/Transforms/InstCombine/indexed-gep-compares.ll b/llvm/test/Transforms/InstCombine/indexed-gep-compares.ll
index 79511cf9e666ff6bc60e5f65b83897d557bd5dee..1110bbc5403e499acb7bdf6fec7ac6ec97727ec5 100644
--- a/llvm/test/Transforms/InstCombine/indexed-gep-compares.ll
+++ b/llvm/test/Transforms/InstCombine/indexed-gep-compares.ll
@@ -40,7 +40,7 @@ define ptr @test1_nuw(ptr %A, i32 %Offset) {
 ; CHECK:       bb:
 ; CHECK-NEXT:    [[RHS_IDX:%.*]] = phi i32 [ [[RHS_ADD:%.*]], [[BB]] ], [ [[TMP_IDX]], [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    [[RHS_ADD]] = add nuw nsw i32 [[RHS_IDX]], 4
-; CHECK-NEXT:    [[COND:%.*]] = icmp ugt i32 [[RHS_IDX]], 400
+; CHECK-NEXT:    [[COND:%.*]] = icmp samesign ugt i32 [[RHS_IDX]], 400
 ; CHECK-NEXT:    br i1 [[COND]], label [[BB2:%.*]], label [[BB]]
 ; CHECK:       bb2:
 ; CHECK-NEXT:    [[RHS_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[A:%.*]], i32 [[RHS_IDX]]
diff --git a/llvm/test/Transforms/InstCombine/ispow2.ll b/llvm/test/Transforms/InstCombine/ispow2.ll
index 348508769c581fe6ba1355ceb9f7e8d4abe6b6df..7ace998556c703b66a10a6a2f8aed6ca641ffd76 100644
--- a/llvm/test/Transforms/InstCombine/ispow2.ll
+++ b/llvm/test/Transforms/InstCombine/ispow2.ll
@@ -4,7 +4,7 @@
 define i1 @is_pow2or0_negate_op(i32 %x) {
 ; CHECK-LABEL: @is_pow2or0_negate_op(
 ; CHECK-NEXT:    [[TMP1:%.*]] = call range(i32 0, 33) i32 @llvm.ctpop.i32(i32 [[X:%.*]])
-; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i32 [[TMP1]], 2
+; CHECK-NEXT:    [[CMP:%.*]] = icmp samesign ult i32 [[TMP1]], 2
 ; CHECK-NEXT:    ret i1 [[CMP]]
 ;
   %neg = sub i32 0, %x
@@ -16,7 +16,7 @@ define i1 @is_pow2or0_negate_op(i32 %x) {
 define <2 x i1> @is_pow2or0_negate_op_vec(<2 x i32> %x) {
 ; CHECK-LABEL: @is_pow2or0_negate_op_vec(
 ; CHECK-NEXT:    [[TMP1:%.*]] = call range(i32 0, 33) <2 x i32> @llvm.ctpop.v2i32(<2 x i32> [[X:%.*]])
-; CHECK-NEXT:    [[CMP:%.*]] = icmp ult <2 x i32> [[TMP1]],
+; CHECK-NEXT:    [[CMP:%.*]] = icmp samesign ult <2 x i32> [[TMP1]],
 ; CHECK-NEXT:    ret <2 x i1> [[CMP]]
 ;
   %neg = sub <2 x i32> zeroinitializer, %x
@@ -28,7 +28,7 @@ define <2 x i1> @is_pow2or0_negate_op_vec(<2 x i32> %x) {
 define i1 @is_pow2or0_decrement_op(i8 %x) {
 ; CHECK-LABEL: @is_pow2or0_decrement_op(
 ; CHECK-NEXT:    [[TMP1:%.*]] = call range(i8 0, 9) i8 @llvm.ctpop.i8(i8 [[X:%.*]])
-; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i8 [[TMP1]], 2
+; CHECK-NEXT:    [[CMP:%.*]] = icmp samesign ult i8 [[TMP1]], 2
 ; CHECK-NEXT:    ret i1 [[CMP]]
 ;
   %dec = add i8 %x, -1
@@ -40,7 +40,7 @@ define i1 @is_pow2or0_decrement_op(i8 %x) {
 define <2 x i1> @is_pow2or0_decrement_op_vec(<2 x i8> %x) {
 ; CHECK-LABEL: @is_pow2or0_decrement_op_vec(
 ; CHECK-NEXT:    [[TMP1:%.*]] = call range(i8 0, 9) <2 x i8> @llvm.ctpop.v2i8(<2 x i8> [[X:%.*]])
-; CHECK-NEXT:    [[CMP:%.*]] = icmp ult <2 x i8> [[TMP1]],
+; CHECK-NEXT:    [[CMP:%.*]] = icmp samesign ult <2 x i8> [[TMP1]],
 ; CHECK-NEXT:    ret <2 x i1> [[CMP]]
 ;
   %dec = add <2 x i8> %x,
@@ -52,7 +52,7 @@ define <2 x i1> @is_pow2or0_decrement_op_vec(<2 x i8> %x) {
 define i1 @isnot_pow2or0_negate_op(i32 %x) {
 ; CHECK-LABEL: @isnot_pow2or0_negate_op(
 ; CHECK-NEXT:    [[TMP1:%.*]] = call range(i32 0, 33) i32 @llvm.ctpop.i32(i32 [[X:%.*]])
-; CHECK-NEXT:    [[CMP:%.*]] = icmp ugt i32 [[TMP1]], 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp samesign ugt i32 [[TMP1]], 1
 ; CHECK-NEXT:    ret i1 [[CMP]]
 ;
   %neg = sub i32 0, %x
@@ -64,7 +64,7 @@ define i1 @isnot_pow2or0_negate_op(i32 %x) {
 define <2 x i1> @isnot_pow2or0_negate_op_vec(<2 x i32> %x) {
 ; CHECK-LABEL: @isnot_pow2or0_negate_op_vec(
 ; CHECK-NEXT:    [[TMP1:%.*]] = call range(i32 0, 33) <2 x i32> @llvm.ctpop.v2i32(<2 x i32> [[X:%.*]])
-; CHECK-NEXT:    [[CMP:%.*]] = icmp ugt <2 x i32> [[TMP1]],
+; CHECK-NEXT:    [[CMP:%.*]] = icmp samesign ugt <2 x i32> [[TMP1]],
 ; CHECK-NEXT:    ret <2 x i1> [[CMP]]
 ;
   %neg = sub <2 x i32> zeroinitializer, %x
@@ -76,7 +76,7 @@ define <2 x i1> @isnot_pow2or0_negate_op_vec(<2 x i32> %x) {
 define i1 @isnot_pow2or0_decrement_op(i8 %x) {
 ; CHECK-LABEL: @isnot_pow2or0_decrement_op(
 ; CHECK-NEXT:    [[TMP1:%.*]] = call range(i8 0, 9) i8 @llvm.ctpop.i8(i8 [[X:%.*]])
-; CHECK-NEXT:    [[CMP:%.*]] = icmp ugt i8 [[TMP1]], 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp samesign ugt i8 [[TMP1]], 1
 ; CHECK-NEXT:    ret i1 [[CMP]]
 ;
   %dec = add i8 %x, -1
@@ -88,7 +88,7 @@ define i1 @isnot_pow2or0_decrement_op(i8 %x) {
 define <2 x i1> @isnot_pow2or0_decrement_op_vec(<2 x i8> %x) {
 ; CHECK-LABEL: @isnot_pow2or0_decrement_op_vec(
 ; CHECK-NEXT:    [[TMP1:%.*]] = call range(i8 0, 9) <2 x i8> @llvm.ctpop.v2i8(<2 x i8> [[X:%.*]])
-; CHECK-NEXT:    [[CMP:%.*]] = icmp ugt <2 x i8> [[TMP1]],
+; CHECK-NEXT:    [[CMP:%.*]] = icmp samesign ugt <2 x i8> [[TMP1]],
 ; CHECK-NEXT:    ret <2 x i1> [[CMP]]
 ;
   %dec = add <2 x i8> %x,
@@ -101,7 +101,7 @@ define i1 @is_pow2or0_negate_op_commute1(i32 %p) {
 ; CHECK-LABEL: @is_pow2or0_negate_op_commute1(
 ; CHECK-NEXT:    [[X:%.*]] = srem i32 42, [[P:%.*]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = call range(i32 0, 7) i32 @llvm.ctpop.i32(i32 [[X]])
-; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i32 [[TMP1]], 2
+; CHECK-NEXT:    [[CMP:%.*]] = icmp samesign ult i32 [[TMP1]], 2
 ; CHECK-NEXT:    ret i1 [[CMP]]
 ;
   %x = srem i32 42, %p ; thwart complexity-based canonicalization
@@ -117,7 +117,7 @@ define i1 @isnot_pow2or0_negate_op_commute2(i32 %p) {
 ; CHECK-LABEL: @isnot_pow2or0_negate_op_commute2(
 ; CHECK-NEXT:    [[X:%.*]] = urem i32 42, [[P:%.*]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = call range(i32 0, 7) i32 @llvm.ctpop.i32(i32 [[X]])
-; CHECK-NEXT:    [[CMP:%.*]] = icmp ugt i32 [[TMP1]], 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp samesign ugt i32 [[TMP1]], 1
 ; CHECK-NEXT:    ret i1 [[CMP]]
 ;
   %x = urem i32 42, %p ; thwart complexity-based canonicalization
@@ -131,7 +131,7 @@ define i1 @isnot_pow2or0_negate_op_commute3(i32 %p) {
 ; CHECK-LABEL: @isnot_pow2or0_negate_op_commute3(
 ; CHECK-NEXT:    [[X:%.*]] = urem i32 42, [[P:%.*]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = call range(i32 0, 7) i32 @llvm.ctpop.i32(i32 [[X]])
-; CHECK-NEXT:    [[CMP:%.*]] = icmp ugt i32 [[TMP1]], 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp samesign ugt i32 [[TMP1]], 1
 ; CHECK-NEXT:    ret i1 [[CMP]]
 ;
   %x = urem i32 42, %p ; thwart complexity-based canonicalization
@@ -148,7 +148,7 @@ define i1 @is_pow2or0_negate_op_extra_use1(i32 %x) {
 ; CHECK-NEXT:    [[NEG:%.*]] = sub i32 0, [[X:%.*]]
 ; CHECK-NEXT:    call void @use(i32 [[NEG]])
 ; CHECK-NEXT:    [[TMP1:%.*]] = call range(i32 0, 33) i32 @llvm.ctpop.i32(i32 [[X]])
-; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i32 [[TMP1]], 2
+; CHECK-NEXT:    [[CMP:%.*]] = icmp samesign ult i32 [[TMP1]], 2
 ; CHECK-NEXT:    ret i1 [[CMP]]
 ;
   %neg = sub i32 0, %x
@@ -198,7 +198,7 @@ define i1 @is_pow2_non_zero_ult_2(i32 %x) {
 ; CHECK-NEXT:    [[NOTZERO:%.*]] = icmp ne i32 [[X:%.*]], 0
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[NOTZERO]])
 ; CHECK-NEXT:    [[T0:%.*]] = tail call range(i32 1, 33) i32 @llvm.ctpop.i32(i32 [[X]])
-; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i32 [[T0]], 2
+; CHECK-NEXT:    [[CMP:%.*]] = icmp samesign ult i32 [[T0]], 2
 ; CHECK-NEXT:    ret i1 [[CMP]]
 ;
   %notzero = icmp ne i32 %x, 0
@@ -228,7 +228,7 @@ define i1 @is_pow2_non_zero_ugt_1(i32 %x) {
 ; CHECK-NEXT:    [[NOTZERO:%.*]] = icmp ne i32 [[X:%.*]], 0
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[NOTZERO]])
 ; CHECK-NEXT:    [[T0:%.*]] = tail call range(i32 1, 33) i32 @llvm.ctpop.i32(i32 [[X]])
-; CHECK-NEXT:    [[CMP:%.*]] = icmp ugt i32 [[T0]], 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp samesign ugt i32 [[T0]], 1
 ; CHECK-NEXT:    ret i1 [[CMP]]
 ;
   %notzero = icmp ne i32 %x, 0
@@ -272,7 +272,7 @@ declare void @use_i1(i1)
 define i1 @is_pow2_ctpop_extra_uses(i32 %x) {
 ; CHECK-LABEL: @is_pow2_ctpop_extra_uses(
 ; CHECK-NEXT:    [[T0:%.*]] = tail call range(i32 0, 33) i32 @llvm.ctpop.i32(i32 [[X:%.*]])
-; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i32 [[T0]], 2
+; CHECK-NEXT:    [[CMP:%.*]] = icmp samesign ult i32 [[T0]], 2
 ; CHECK-NEXT:    call void @use_i1(i1 [[CMP]])
 ; CHECK-NEXT:    [[NOTZERO:%.*]] = icmp ne i32 [[X]], 0
 ; CHECK-NEXT:    call void @use_i1(i1 [[NOTZERO]])
@@ -291,7 +291,7 @@ define i1 @is_pow2_ctpop_extra_uses(i32 %x) {
 define i1 @is_pow2_ctpop_extra_uses_logical(i32 %x) {
 ; CHECK-LABEL: @is_pow2_ctpop_extra_uses_logical(
 ; CHECK-NEXT:    [[T0:%.*]] = tail call range(i32 0, 33) i32 @llvm.ctpop.i32(i32 [[X:%.*]])
-; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i32 [[T0]], 2
+; CHECK-NEXT:    [[CMP:%.*]] = icmp samesign ult i32 [[T0]], 2
 ; CHECK-NEXT:    call void @use_i1(i1 [[CMP]])
 ; CHECK-NEXT:    [[NOTZERO:%.*]] = icmp ne i32 [[X]], 0
 ; CHECK-NEXT:    call void @use_i1(i1 [[NOTZERO]])
@@ -327,7 +327,7 @@ define <2 x i1> @is_pow2_ctpop_commute_vec(<2 x i8> %x) {
 define i1 @is_pow2_ctpop_wrong_cmp_op1(i32 %x) {
 ; CHECK-LABEL: @is_pow2_ctpop_wrong_cmp_op1(
 ; CHECK-NEXT:    [[T0:%.*]] = tail call range(i32 0, 33) i32 @llvm.ctpop.i32(i32 [[X:%.*]])
-; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i32 [[T0]], 3
+; CHECK-NEXT:    [[CMP:%.*]] = icmp samesign ult i32 [[T0]], 3
 ; CHECK-NEXT:    [[NOTZERO:%.*]] = icmp ne i32 [[X]], 0
 ; CHECK-NEXT:    [[R:%.*]] = and i1 [[NOTZERO]], [[CMP]]
 ; CHECK-NEXT:    ret i1 [[R]]
@@ -342,7 +342,7 @@ define i1 @is_pow2_ctpop_wrong_cmp_op1(i32 %x) {
 define i1 @is_pow2_ctpop_wrong_cmp_op1_logical(i32 %x) {
 ; CHECK-LABEL: @is_pow2_ctpop_wrong_cmp_op1_logical(
 ; CHECK-NEXT:    [[T0:%.*]] = tail call range(i32 0, 33) i32 @llvm.ctpop.i32(i32 [[X:%.*]])
-; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i32 [[T0]], 3
+; CHECK-NEXT:    [[CMP:%.*]] = icmp samesign ult i32 [[T0]], 3
 ; CHECK-NEXT:    [[NOTZERO:%.*]] = icmp ne i32 [[X]], 0
 ; CHECK-NEXT:    [[R:%.*]] = select i1 [[NOTZERO]], i1 [[CMP]], i1 false
 ; CHECK-NEXT:    ret i1 [[R]]
@@ -359,7 +359,7 @@ define i1 @is_pow2_ctpop_wrong_cmp_op1_logical(i32 %x) {
 define i1 @is_pow2_ctpop_wrong_cmp_op2(i32 %x) {
 ; CHECK-LABEL: @is_pow2_ctpop_wrong_cmp_op2(
 ; CHECK-NEXT:    [[T0:%.*]] = tail call range(i32 0, 33) i32 @llvm.ctpop.i32(i32 [[X:%.*]])
-; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i32 [[T0]], 2
+; CHECK-NEXT:    [[CMP:%.*]] = icmp samesign ult i32 [[T0]], 2
 ; CHECK-NEXT:    [[NOTZERO:%.*]] = icmp ne i32 [[X]], 1
 ; CHECK-NEXT:    [[R:%.*]] = and i1 [[NOTZERO]], [[CMP]]
 ; CHECK-NEXT:    ret i1 [[R]]
@@ -374,7 +374,7 @@ define i1 @is_pow2_ctpop_wrong_cmp_op2(i32 %x) {
 define i1 @is_pow2_ctpop_wrong_cmp_op2_logical(i32 %x) {
 ; CHECK-LABEL: @is_pow2_ctpop_wrong_cmp_op2_logical(
 ; CHECK-NEXT:    [[T0:%.*]] = tail call range(i32 0, 33) i32 @llvm.ctpop.i32(i32 [[X:%.*]])
-; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i32 [[T0]], 2
+; CHECK-NEXT:    [[CMP:%.*]] = icmp samesign ult i32 [[T0]], 2
 ; CHECK-NEXT:    [[NOTZERO:%.*]] = icmp ne i32 [[X]], 1
 ; CHECK-NEXT:    [[R:%.*]] = select i1 [[NOTZERO]], i1 [[CMP]], i1 false
 ; CHECK-NEXT:    ret i1 [[R]]
@@ -391,7 +391,7 @@ define i1 @is_pow2_ctpop_wrong_cmp_op2_logical(i32 %x) {
 define i1 @is_pow2_ctpop_wrong_pred1(i32 %x) {
 ; CHECK-LABEL: @is_pow2_ctpop_wrong_pred1(
 ; CHECK-NEXT:    [[T0:%.*]] = tail call range(i32 0, 33) i32 @llvm.ctpop.i32(i32 [[X:%.*]])
-; CHECK-NEXT:    [[CMP:%.*]] = icmp ugt i32 [[T0]], 2
+; CHECK-NEXT:    [[CMP:%.*]] = icmp samesign ugt i32 [[T0]], 2
 ; CHECK-NEXT:    ret i1 [[CMP]]
 ;
   %t0 = tail call i32 @llvm.ctpop.i32(i32 %x)
@@ -404,7 +404,7 @@ define i1 @is_pow2_ctpop_wrong_pred1(i32 %x) {
 define i1 @is_pow2_ctpop_wrong_pred1_logical(i32 %x) {
 ; CHECK-LABEL: @is_pow2_ctpop_wrong_pred1_logical(
 ; CHECK-NEXT:    [[T0:%.*]] = tail call range(i32 0, 33) i32 @llvm.ctpop.i32(i32 [[X:%.*]])
-; CHECK-NEXT:    [[CMP:%.*]] = icmp ugt i32 [[T0]], 2
+; CHECK-NEXT:    [[CMP:%.*]] = icmp samesign ugt i32 [[T0]], 2
 ; CHECK-NEXT:    ret i1 [[CMP]]
 ;
   %t0 = tail call i32 @llvm.ctpop.i32(i32 %x)
@@ -419,7 +419,7 @@ define i1 @is_pow2_ctpop_wrong_pred1_logical(i32 %x) {
 define i1 @is_pow2_ctpop_wrong_pred2(i32 %x) {
 ; CHECK-LABEL: @is_pow2_ctpop_wrong_pred2(
 ; CHECK-NEXT:    [[T0:%.*]] = tail call range(i32 0, 33) i32 @llvm.ctpop.i32(i32 [[X:%.*]])
-; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i32 [[T0]], 2
+; CHECK-NEXT:    [[CMP:%.*]] = icmp samesign ult i32 [[T0]], 2
 ; CHECK-NEXT:    [[CMP2:%.*]] = icmp sgt i32 [[X]], 0
 ; CHECK-NEXT:    [[R:%.*]] = and i1 [[CMP2]], [[CMP]]
 ; CHECK-NEXT:    ret i1 [[R]]
@@ -434,7 +434,7 @@ define i1 @is_pow2_ctpop_wrong_pred2(i32 %x) {
 define i1 @is_pow2_ctpop_wrong_pred2_logical(i32 %x) {
 ; CHECK-LABEL: @is_pow2_ctpop_wrong_pred2_logical(
 ; CHECK-NEXT:    [[T0:%.*]] = tail call range(i32 0, 33) i32 @llvm.ctpop.i32(i32 [[X:%.*]])
-; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i32 [[T0]], 2
+; CHECK-NEXT:    [[CMP:%.*]] = icmp samesign ult i32 [[T0]], 2
 ; CHECK-NEXT:    [[CMP2:%.*]] = icmp sgt i32 [[X]], 0
 ; CHECK-NEXT:    [[R:%.*]] = select i1 [[CMP2]], i1 [[CMP]], i1 false
 ; CHECK-NEXT:    ret i1 [[R]]
@@ -479,7 +479,7 @@ define i1 @isnot_pow2_ctpop_logical(i32 %x) {
 define i1 @isnot_pow2_ctpop_extra_uses(i32 %x) {
 ; CHECK-LABEL: @isnot_pow2_ctpop_extra_uses(
 ; CHECK-NEXT:    [[T0:%.*]] = tail call range(i32 0, 33) i32 @llvm.ctpop.i32(i32 [[X:%.*]])
-; CHECK-NEXT:    [[CMP:%.*]] = icmp ugt i32 [[T0]], 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp samesign ugt i32 [[T0]], 1
 ; CHECK-NEXT:    call void @use_i1(i1 [[CMP]])
 ; CHECK-NEXT:    [[ISZERO:%.*]] = icmp eq i32 [[X]], 0
 ; CHECK-NEXT:    call void @use_i1(i1 [[ISZERO]])
@@ -498,7 +498,7 @@ define i1 @isnot_pow2_ctpop_extra_uses(i32 %x) {
 define i1 @isnot_pow2_ctpop_extra_uses_logical(i32 %x) {
 ; CHECK-LABEL: @isnot_pow2_ctpop_extra_uses_logical(
 ; CHECK-NEXT:    [[T0:%.*]] = tail call range(i32 0, 33) i32 @llvm.ctpop.i32(i32 [[X:%.*]])
-; CHECK-NEXT:    [[CMP:%.*]] = icmp ugt i32 [[T0]], 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp samesign ugt i32 [[T0]], 1
 ; CHECK-NEXT:    call void @use_i1(i1 [[CMP]])
 ; CHECK-NEXT:    [[ISZERO:%.*]] = icmp eq i32 [[X]], 0
 ; CHECK-NEXT:    call void @use_i1(i1 [[ISZERO]])
@@ -534,7 +534,7 @@ define <2 x i1> @isnot_pow2_ctpop_commute_vec(<2 x i8> %x) {
 define i1 @isnot_pow2_ctpop_wrong_cmp_op1(i32 %x) {
 ; CHECK-LABEL: @isnot_pow2_ctpop_wrong_cmp_op1(
 ; CHECK-NEXT:    [[T0:%.*]] = tail call range(i32 0, 33) i32 @llvm.ctpop.i32(i32 [[X:%.*]])
-; CHECK-NEXT:    [[CMP:%.*]] = icmp ugt i32 [[T0]], 2
+; CHECK-NEXT:    [[CMP:%.*]] = icmp samesign ugt i32 [[T0]], 2
 ; CHECK-NEXT:    [[ISZERO:%.*]] = icmp eq i32 [[X]], 0
 ; CHECK-NEXT:    [[R:%.*]] = or i1 [[ISZERO]], [[CMP]]
 ; CHECK-NEXT:    ret i1 [[R]]
@@ -549,7 +549,7 @@ define i1 @isnot_pow2_ctpop_wrong_cmp_op1(i32 %x) {
 define i1 @isnot_pow2_ctpop_wrong_cmp_op1_logical(i32 %x) {
 ; CHECK-LABEL: @isnot_pow2_ctpop_wrong_cmp_op1_logical(
 ; CHECK-NEXT:    [[T0:%.*]] = tail call range(i32 0, 33) i32 @llvm.ctpop.i32(i32 [[X:%.*]])
-; CHECK-NEXT:    [[CMP:%.*]] = icmp ugt i32 [[T0]], 2
+; CHECK-NEXT:    [[CMP:%.*]] = icmp samesign ugt i32 [[T0]], 2
 ; CHECK-NEXT:    [[ISZERO:%.*]] = icmp eq i32 [[X]], 0
 ; CHECK-NEXT:    [[R:%.*]] = select i1 [[ISZERO]], i1 true, i1 [[CMP]]
 ; CHECK-NEXT:    ret i1 [[R]]
@@ -566,7 +566,7 @@ define i1 @isnot_pow2_ctpop_wrong_cmp_op1_logical(i32 %x) {
 define i1 @isnot_pow2_ctpop_wrong_cmp_op2(i32 %x) {
 ; CHECK-LABEL: @isnot_pow2_ctpop_wrong_cmp_op2(
 ; CHECK-NEXT:    [[T0:%.*]] = tail call range(i32 0, 33) i32 @llvm.ctpop.i32(i32 [[X:%.*]])
-; CHECK-NEXT:    [[CMP:%.*]] = icmp ugt i32 [[T0]], 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp samesign ugt i32 [[T0]], 1
 ; CHECK-NEXT:    [[ISZERO:%.*]] = icmp eq i32 [[X]], 1
 ; CHECK-NEXT:    [[R:%.*]] = or i1 [[ISZERO]], [[CMP]]
 ; CHECK-NEXT:    ret i1 [[R]]
@@ -581,7 +581,7 @@ define i1 @isnot_pow2_ctpop_wrong_cmp_op2(i32 %x) {
 define i1 @isnot_pow2_ctpop_wrong_cmp_op2_logical(i32 %x) {
 ; CHECK-LABEL: @isnot_pow2_ctpop_wrong_cmp_op2_logical(
 ; CHECK-NEXT:    [[T0:%.*]] = tail call range(i32 0, 33) i32 @llvm.ctpop.i32(i32 [[X:%.*]])
-; CHECK-NEXT:    [[CMP:%.*]] = icmp ugt i32 [[T0]], 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp samesign ugt i32 [[T0]], 1
 ; CHECK-NEXT:    [[ISZERO:%.*]] = icmp eq i32 [[X]], 1
 ; CHECK-NEXT:    [[R:%.*]] = select i1 [[ISZERO]], i1 true, i1 [[CMP]]
 ; CHECK-NEXT:    ret i1 [[R]]
@@ -598,7 +598,7 @@ define i1 @isnot_pow2_ctpop_wrong_cmp_op2_logical(i32 %x) {
 define i1 @isnot_pow2_ctpop_wrong_pred2(i32 %x) {
 ; CHECK-LABEL: @isnot_pow2_ctpop_wrong_pred2(
 ; CHECK-NEXT:    [[T0:%.*]] = tail call range(i32 0, 33) i32 @llvm.ctpop.i32(i32 [[X:%.*]])
-; CHECK-NEXT:    [[CMP:%.*]] = icmp ugt i32 [[T0]], 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp samesign ugt i32 [[T0]], 1
 ; CHECK-NEXT:    [[CMP2:%.*]] = icmp slt i32 [[X]], 0
 ; CHECK-NEXT:    [[R:%.*]] = or i1 [[CMP2]], [[CMP]]
 ; CHECK-NEXT:    ret i1 [[R]]
@@ -613,7 +613,7 @@ define i1 @isnot_pow2_ctpop_wrong_pred2(i32 %x) {
 define i1 @isnot_pow2_ctpop_wrong_pred2_logical(i32 %x) {
 ; CHECK-LABEL: @isnot_pow2_ctpop_wrong_pred2_logical(
 ; CHECK-NEXT:    [[T0:%.*]] = tail call range(i32 0, 33) i32 @llvm.ctpop.i32(i32 [[X:%.*]])
-; CHECK-NEXT:    [[CMP:%.*]] = icmp ugt i32 [[T0]], 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp samesign ugt i32 [[T0]], 1
 ; CHECK-NEXT:    [[CMP2:%.*]] = icmp slt i32 [[X]], 0
 ; CHECK-NEXT:    [[R:%.*]] = select i1 [[CMP2]], i1 true, i1 [[CMP]]
 ; CHECK-NEXT:    ret i1 [[R]]
@@ -798,7 +798,7 @@ define <2 x i1> @isnot_pow2_decrement_op_vec(<2 x i8> %x) {
 define i1 @is_pow2or0_ctpop(i32 %x) {
 ; CHECK-LABEL: @is_pow2or0_ctpop(
 ; CHECK-NEXT:    [[T0:%.*]] = tail call range(i32 0, 33) i32 @llvm.ctpop.i32(i32 [[X:%.*]])
-; CHECK-NEXT:    [[R:%.*]] = icmp ult i32 [[T0]], 2
+; CHECK-NEXT:    [[R:%.*]] = icmp samesign ult i32 [[T0]], 2
 ; CHECK-NEXT:    ret i1 [[R]]
 ;
   %t0 = tail call i32 @llvm.ctpop.i32(i32 %x)
@@ -811,7 +811,7 @@ define i1 @is_pow2or0_ctpop(i32 %x) {
 define i1 @is_pow2or0_ctpop_swap_cmp(i32 %x) {
 ; CHECK-LABEL: @is_pow2or0_ctpop_swap_cmp(
 ; CHECK-NEXT:    [[T0:%.*]] = tail call range(i32 0, 33) i32 @llvm.ctpop.i32(i32 [[X:%.*]])
-; CHECK-NEXT:    [[R:%.*]] = icmp ult i32 [[T0]], 2
+; CHECK-NEXT:    [[R:%.*]] = icmp samesign ult i32 [[T0]], 2
 ; CHECK-NEXT:    ret i1 [[R]]
 ;
   %t0 = tail call i32 @llvm.ctpop.i32(i32 %x)
@@ -824,7 +824,7 @@ define i1 @is_pow2or0_ctpop_swap_cmp(i32 %x) {
 define i1 @is_pow2or0_ctpop_logical(i32 %x) {
 ; CHECK-LABEL: @is_pow2or0_ctpop_logical(
 ; CHECK-NEXT:    [[T0:%.*]] = tail call range(i32 0, 33) i32 @llvm.ctpop.i32(i32 [[X:%.*]])
-; CHECK-NEXT:    [[R:%.*]] = icmp ult i32 [[T0]], 2
+; CHECK-NEXT:    [[R:%.*]] = icmp samesign ult i32 [[T0]], 2
 ; CHECK-NEXT:    ret i1 [[R]]
 ;
   %t0 = tail call i32 @llvm.ctpop.i32(i32 %x)
@@ -837,7 +837,7 @@ define i1 @is_pow2or0_ctpop_logical(i32 %x) {
 define <2 x i1> @is_pow2or0_ctpop_commute_vec(<2 x i8> %x) {
 ; CHECK-LABEL: @is_pow2or0_ctpop_commute_vec(
 ; CHECK-NEXT:    [[T0:%.*]] = tail call range(i8 0, 9) <2 x i8> @llvm.ctpop.v2i8(<2 x i8> [[X:%.*]])
-; CHECK-NEXT:    [[R:%.*]] = icmp ult <2 x i8> [[T0]],
+; CHECK-NEXT:    [[R:%.*]] = icmp samesign ult <2 x i8> [[T0]],
 ; CHECK-NEXT:    ret <2 x i1> [[R]]
 ;
   %t0 = tail call <2 x i8> @llvm.ctpop.v2i8(<2 x i8> %x)
@@ -857,7 +857,7 @@ define i1 @is_pow2or0_ctpop_extra_uses(i32 %x) {
 ; CHECK-NEXT:    call void @use_i1(i1 [[CMP]])
 ; CHECK-NEXT:    [[ISZERO:%.*]] = icmp eq i32 [[X]], 0
 ; CHECK-NEXT:    call void @use_i1(i1 [[ISZERO]])
-; CHECK-NEXT:    [[R:%.*]] = icmp ult i32 [[T0]], 2
+; CHECK-NEXT:    [[R:%.*]] = icmp samesign ult i32 [[T0]], 2
 ; CHECK-NEXT:    ret i1 [[R]]
 ;
   %t0 = tail call i32 @llvm.ctpop.i32(i32 %x)
@@ -878,7 +878,7 @@ define i1 @is_pow2or0_ctpop_logical_extra_uses(i32 %x) {
 ; CHECK-NEXT:    call void @use_i1(i1 [[CMP]])
 ; CHECK-NEXT:    [[ISZERO:%.*]] = icmp eq i32 [[X]], 0
 ; CHECK-NEXT:    call void @use_i1(i1 [[ISZERO]])
-; CHECK-NEXT:    [[R:%.*]] = icmp ult i32 [[T0]], 2
+; CHECK-NEXT:    [[R:%.*]] = icmp samesign ult i32 [[T0]], 2
 ; CHECK-NEXT:    ret i1 [[R]]
 ;
   %t0 = tail call i32 @llvm.ctpop.i32(i32 %x)
@@ -991,7 +991,7 @@ define <2 x i1> @is_pow2or0_ctpop_commute_vec_wrong_pred3(<2 x i8> %x) {
 define i1 @isnot_pow2nor0_ctpop(i32 %x) {
 ; CHECK-LABEL: @isnot_pow2nor0_ctpop(
 ; CHECK-NEXT:    [[T0:%.*]] = tail call range(i32 0, 33) i32 @llvm.ctpop.i32(i32 [[X:%.*]])
-; CHECK-NEXT:    [[R:%.*]] = icmp ugt i32 [[T0]], 1
+; CHECK-NEXT:    [[R:%.*]] = icmp samesign ugt i32 [[T0]], 1
 ; CHECK-NEXT:    ret i1 [[R]]
 ;
   %t0 = tail call i32 @llvm.ctpop.i32(i32 %x)
@@ -1004,7 +1004,7 @@ define i1 @isnot_pow2nor0_ctpop(i32 %x) {
 define i1 @isnot_pow2nor0_ctpop_swap_cmp(i32 %x) {
 ; CHECK-LABEL: @isnot_pow2nor0_ctpop_swap_cmp(
 ; CHECK-NEXT:    [[T0:%.*]] = tail call range(i32 0, 33) i32 @llvm.ctpop.i32(i32 [[X:%.*]])
-; CHECK-NEXT:    [[R:%.*]] = icmp ugt i32 [[T0]], 1
+; CHECK-NEXT:    [[R:%.*]] = icmp samesign ugt i32 [[T0]], 1
 ; CHECK-NEXT:    ret i1 [[R]]
 ;
   %t0 = tail call i32 @llvm.ctpop.i32(i32 %x)
@@ -1017,7 +1017,7 @@ define i1 @isnot_pow2nor0_ctpop_swap_cmp(i32 %x) {
 define i1 @isnot_pow2nor0_ctpop_logical(i32 %x) {
 ; CHECK-LABEL: @isnot_pow2nor0_ctpop_logical(
 ; CHECK-NEXT:    [[T0:%.*]] = tail call range(i32 0, 33) i32 @llvm.ctpop.i32(i32 [[X:%.*]])
-; CHECK-NEXT:    [[R:%.*]] = icmp ugt i32 [[T0]], 1
+; CHECK-NEXT:    [[R:%.*]] = icmp samesign ugt i32 [[T0]], 1
 ; CHECK-NEXT:    ret i1 [[R]]
 ;
   %t0 = tail call i32 @llvm.ctpop.i32(i32 %x)
@@ -1030,7 +1030,7 @@ define i1 @isnot_pow2nor0_ctpop_logical(i32 %x) {
 define <2 x i1> @isnot_pow2nor0_ctpop_commute_vec(<2 x i8> %x) {
 ; CHECK-LABEL: @isnot_pow2nor0_ctpop_commute_vec(
 ; CHECK-NEXT:    [[T0:%.*]] = tail call range(i8 0, 9) <2 x i8> @llvm.ctpop.v2i8(<2 x i8> [[X:%.*]])
-; CHECK-NEXT:    [[R:%.*]] = icmp ugt <2 x i8> [[T0]],
+; CHECK-NEXT:    [[R:%.*]] = icmp samesign ugt <2 x i8> [[T0]],
 ; CHECK-NEXT:    ret <2 x i1> [[R]]
 ;
   %t0 = tail call <2 x i8> @llvm.ctpop.v2i8(<2 x i8> %x)
@@ -1050,7 +1050,7 @@ define i1 @isnot_pow2nor0_ctpop_extra_uses(i32 %x) {
 ; CHECK-NEXT:    call void @use_i1(i1 [[CMP]])
 ; CHECK-NEXT:    [[NOTZERO:%.*]] = icmp ne i32 [[X]], 0
 ; CHECK-NEXT:    call void @use_i1(i1 [[NOTZERO]])
-; CHECK-NEXT:    [[R:%.*]] = icmp ugt i32 [[T0]], 1
+; CHECK-NEXT:    [[R:%.*]] = icmp samesign ugt i32 [[T0]], 1
 ; CHECK-NEXT:    ret i1 [[R]]
 ;
   %t0 = tail call i32 @llvm.ctpop.i32(i32 %x)
@@ -1071,7 +1071,7 @@ define i1 @isnot_pow2nor0_ctpop_logical_extra_uses(i32 %x) {
 ; CHECK-NEXT:    call void @use_i1(i1 [[CMP]])
 ; CHECK-NEXT:    [[NOTZERO:%.*]] = icmp ne i32 [[X]], 0
 ; CHECK-NEXT:    call void @use_i1(i1 [[NOTZERO]])
-; CHECK-NEXT:    [[R:%.*]] = icmp ugt i32 [[T0]], 1
+; CHECK-NEXT:    [[R:%.*]] = icmp samesign ugt i32 [[T0]], 1
 ; CHECK-NEXT:    ret i1 [[R]]
 ;
   %t0 = tail call i32 @llvm.ctpop.i32(i32 %x)
@@ -1210,7 +1210,7 @@ define i1 @blsmsk_is_p2_or_z(i32 %xx, i32 %yy) {
 define i1 @blsmsk_isnt_p2_or_z(i32 %x) {
 ; CHECK-LABEL: @blsmsk_isnt_p2_or_z(
 ; CHECK-NEXT:    [[TMP1:%.*]] = call range(i32 0, 33) i32 @llvm.ctpop.i32(i32 [[X:%.*]])
-; CHECK-NEXT:    [[R:%.*]] = icmp ugt i32 [[TMP1]], 1
+; CHECK-NEXT:    [[R:%.*]] = icmp samesign ugt i32 [[TMP1]], 1
 ; CHECK-NEXT:    ret i1 [[R]]
 ;
   %xm1 = add i32 %x, -1
@@ -1223,7 +1223,7 @@ define i1 @blsmsk_is_p2_or_z_fail(i32 %xx, i32 %yy) {
 ; CHECK-LABEL: @blsmsk_is_p2_or_z_fail(
 ; CHECK-NEXT:    [[X:%.*]] = or i32 [[XX:%.*]], [[YY:%.*]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = call range(i32 0, 33) i32 @llvm.ctpop.i32(i32 [[X]])
-; CHECK-NEXT:    [[R:%.*]] = icmp ugt i32 [[TMP1]], 1
+; CHECK-NEXT:    [[R:%.*]] = icmp samesign ugt i32 [[TMP1]], 1
 ; CHECK-NEXT:    ret i1 [[R]]
 ;
   %x = or i32 %xx, %yy
@@ -1308,7 +1308,7 @@ define i1 @blsmsk_is_p2_or_z_ule_xy(i8 %xx, i8 %yy) {
 ; CHECK-LABEL: @blsmsk_is_p2_or_z_ule_xy(
 ; CHECK-NEXT:    [[X:%.*]] = or i8 [[XX:%.*]], [[YY:%.*]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = call range(i8 0, 9) i8 @llvm.ctpop.i8(i8 [[X]])
-; CHECK-NEXT:    [[R:%.*]] = icmp ult i8 [[TMP1]], 2
+; CHECK-NEXT:    [[R:%.*]] = icmp samesign ult i8 [[TMP1]], 2
 ; CHECK-NEXT:    ret i1 [[R]]
 ;
   %x = or i8 %xx, %yy
@@ -1339,7 +1339,7 @@ define i1 @blsmsk_is_p2_or_z_uge_yx(i8 %xx, i8 %yy) {
 ; CHECK-LABEL: @blsmsk_is_p2_or_z_uge_yx(
 ; CHECK-NEXT:    [[X:%.*]] = or i8 [[XX:%.*]], [[YY:%.*]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = call range(i8 0, 9) i8 @llvm.ctpop.i8(i8 [[X]])
-; CHECK-NEXT:    [[R:%.*]] = icmp ult i8 [[TMP1]], 2
+; CHECK-NEXT:    [[R:%.*]] = icmp samesign ult i8 [[TMP1]], 2
 ; CHECK-NEXT:    ret i1 [[R]]
 ;
   %x = or i8 %xx, %yy
@@ -1369,7 +1369,7 @@ define i1 @blsmsk_isnt_p2_or_z_ugt_xy(i8 %xx, i8 %yy) {
 ; CHECK-LABEL: @blsmsk_isnt_p2_or_z_ugt_xy(
 ; CHECK-NEXT:    [[X:%.*]] = or i8 [[XX:%.*]], [[YY:%.*]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = call range(i8 0, 9) i8 @llvm.ctpop.i8(i8 [[X]])
-; CHECK-NEXT:    [[R:%.*]] = icmp ugt i8 [[TMP1]], 1
+; CHECK-NEXT:    [[R:%.*]] = icmp samesign ugt i8 [[TMP1]], 1
 ; CHECK-NEXT:    ret i1 [[R]]
 ;
   %x = or i8 %xx, %yy
@@ -1400,7 +1400,7 @@ define i1 @blsmsk_isnt_p2_or_z_ult_yx(i8 %xx, i8 %yy) {
 ; CHECK-LABEL: @blsmsk_isnt_p2_or_z_ult_yx(
 ; CHECK-NEXT:    [[X:%.*]] = or i8 [[XX:%.*]], [[YY:%.*]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = call range(i8 0, 9) i8 @llvm.ctpop.i8(i8 [[X]])
-; CHECK-NEXT:    [[R:%.*]] = icmp ugt i8 [[TMP1]], 1
+; CHECK-NEXT:    [[R:%.*]] = icmp samesign ugt i8 [[TMP1]], 1
 ; CHECK-NEXT:    ret i1 [[R]]
 ;
   %x = or i8 %xx, %yy
@@ -1506,7 +1506,7 @@ define <2 x i1> @not_pow2_or_z_known_bits_fail_wrong_cmp(<2 x i32> %xin) {
 ; CHECK-LABEL: @not_pow2_or_z_known_bits_fail_wrong_cmp(
 ; CHECK-NEXT:    [[X:%.*]] = or <2 x i32> [[XIN:%.*]],
 ; CHECK-NEXT:    [[CNT:%.*]] = call range(i32 1, 33) <2 x i32> @llvm.ctpop.v2i32(<2 x i32> [[X]])
-; CHECK-NEXT:    [[R:%.*]] = icmp ugt <2 x i32> [[CNT]],
+; CHECK-NEXT:    [[R:%.*]] = icmp samesign ugt <2 x i32> [[CNT]],
 ; CHECK-NEXT:    ret <2 x i1> [[R]]
 ;
   %x = or <2 x i32> %xin,
@@ -1550,7 +1550,7 @@ entry:
 define i1 @is_power2_or_zero_with_range(i32 %x) {
 ; CHECK-LABEL: @is_power2_or_zero_with_range(
 ; CHECK-NEXT:    [[CTPOP:%.*]] = call range(i32 0, 33) i32 @llvm.ctpop.i32(i32 [[X:%.*]])
-; CHECK-NEXT:    [[RES:%.*]] = icmp ult i32 [[CTPOP]], 2
+; CHECK-NEXT:    [[RES:%.*]] = icmp samesign ult i32 [[CTPOP]], 2
 ; CHECK-NEXT:    ret i1 [[RES]]
 ;
   %ctpop = call range(i32 1, 33) i32 @llvm.ctpop.i32(i32 %x)
@@ -1563,7 +1563,7 @@ define i1 @is_power2_or_zero_with_range(i32 %x) {
 define i1 @is_power2_or_zero_inv_with_range(i32 %x) {
 ; CHECK-LABEL: @is_power2_or_zero_inv_with_range(
 ; CHECK-NEXT:    [[CTPOP:%.*]] = call range(i32 0, 33) i32 @llvm.ctpop.i32(i32 [[X:%.*]])
-; CHECK-NEXT:    [[RES:%.*]] = icmp ugt i32 [[CTPOP]], 1
+; CHECK-NEXT:    [[RES:%.*]] = icmp samesign ugt i32 [[CTPOP]], 1
 ; CHECK-NEXT:    ret i1 [[RES]]
 ;
   %ctpop = call range(i32 1, 33) i32 @llvm.ctpop.i32(i32 %x)
diff --git a/llvm/test/Transforms/InstCombine/load-bitcast-select.ll b/llvm/test/Transforms/InstCombine/load-bitcast-select.ll
index 4c5f94d53ada5ab14484cf43aa0cab49ec2e3c84..83a34de768f2e15aa0b8130a6703bd23be0151e1 100644
--- a/llvm/test/Transforms/InstCombine/load-bitcast-select.ll
+++ b/llvm/test/Transforms/InstCombine/load-bitcast-select.ll
@@ -10,7 +10,7 @@ define void @_Z3foov() {
 ; CHECK-NEXT:    br label [[FOR_COND:%.*]]
 ; CHECK:       for.cond:
 ; CHECK-NEXT:    [[I_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ]
-; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i32 [[I_0]], 1000
+; CHECK-NEXT:    [[CMP:%.*]] = icmp samesign ult i32 [[I_0]], 1000
 ; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]]
 ; CHECK:       for.cond.cleanup:
 ; CHECK-NEXT:    ret void
diff --git a/llvm/test/Transforms/InstCombine/lshr-and-negC-icmpeq-zero.ll b/llvm/test/Transforms/InstCombine/lshr-and-negC-icmpeq-zero.ll
index 89522a00d789498efd6f0875c4662f554068c737..e1c441e9c0b446e77d3d3d00b40b44d02893d7b3 100644
--- a/llvm/test/Transforms/InstCombine/lshr-and-negC-icmpeq-zero.ll
+++ b/llvm/test/Transforms/InstCombine/lshr-and-negC-icmpeq-zero.ll
@@ -177,7 +177,7 @@ define i1 @scalar_lshr_and_negC_eq_extra_use_lshr_and(i32 %x, i32 %y, i32 %z, pt
 define i1 @scalar_i32_lshr_and_negC_eq_X_is_constant1(i32 %y) {
 ; CHECK-LABEL: @scalar_i32_lshr_and_negC_eq_X_is_constant1(
 ; CHECK-NEXT:    [[LSHR:%.*]] = lshr i32 12345, [[Y:%.*]]
-; CHECK-NEXT:    [[R:%.*]] = icmp ult i32 [[LSHR]], 8
+; CHECK-NEXT:    [[R:%.*]] = icmp samesign ult i32 [[LSHR]], 8
 ; CHECK-NEXT:    ret i1 [[R]]
 ;
   %lshr = lshr i32 12345, %y
diff --git a/llvm/test/Transforms/InstCombine/memchr.ll b/llvm/test/Transforms/InstCombine/memchr.ll
index 08435a5e0388e58960f5e2e4ed14f496e252353b..0cf85a2a1b551994a9bec6d55f9ee6ad8237338a 100644
--- a/llvm/test/Transforms/InstCombine/memchr.ll
+++ b/llvm/test/Transforms/InstCombine/memchr.ll
@@ -124,7 +124,7 @@ define i1 @test11(i32 %C) {
 ; CHECK-LABEL: @test11(
 ; CHECK-NEXT:    [[TMP1:%.*]] = trunc i32 [[C:%.*]] to i16
 ; CHECK-NEXT:    [[TMP2:%.*]] = and i16 [[TMP1]], 255
-; CHECK-NEXT:    [[MEMCHR_BOUNDS:%.*]] = icmp ult i16 [[TMP2]], 16
+; CHECK-NEXT:    [[MEMCHR_BOUNDS:%.*]] = icmp samesign ult i16 [[TMP2]], 16
 ; CHECK-NEXT:    [[TMP3:%.*]] = shl nuw i16 1, [[TMP2]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = and i16 [[TMP3]], 9217
 ; CHECK-NEXT:    [[MEMCHR_BITS:%.*]] = icmp ne i16 [[TMP4]], 0
diff --git a/llvm/test/Transforms/InstCombine/minmax-fold.ll b/llvm/test/Transforms/InstCombine/minmax-fold.ll
index 26cd4996e687d521befc0c412db7fd971e88c072..ec1c7aff4096616c2b449c66c24b2fb945eebfb8 100644
--- a/llvm/test/Transforms/InstCombine/minmax-fold.ll
+++ b/llvm/test/Transforms/InstCombine/minmax-fold.ll
@@ -852,10 +852,8 @@ define i32 @common_factor_umax_extra_use_both(i32 %a, i32 %b, i32 %c) {
 
 define float @not_min_of_min(i8 %i, float %x) {
 ; CHECK-LABEL: @not_min_of_min(
-; CHECK-NEXT:    [[CMP1_INV:%.*]] = fcmp fast oge float [[X:%.*]], 1.000000e+00
-; CHECK-NEXT:    [[MIN1:%.*]] = select fast i1 [[CMP1_INV]], float 1.000000e+00, float [[X]]
-; CHECK-NEXT:    [[CMP2_INV:%.*]] = fcmp fast oge float [[X]], 2.000000e+00
-; CHECK-NEXT:    [[MIN2:%.*]] = select fast i1 [[CMP2_INV]], float 2.000000e+00, float [[X]]
+; CHECK-NEXT:    [[MIN1:%.*]] = call fast float @llvm.minnum.f32(float [[X:%.*]], float 1.000000e+00)
+; CHECK-NEXT:    [[MIN2:%.*]] = call fast float @llvm.minnum.f32(float [[X]], float 2.000000e+00)
 ; CHECK-NEXT:    [[CMP3:%.*]] = icmp ult i8 [[I:%.*]], 16
 ; CHECK-NEXT:    [[R:%.*]] = select i1 [[CMP3]], float [[MIN1]], float [[MIN2]]
 ; CHECK-NEXT:    ret float [[R]]
diff --git a/llvm/test/Transforms/InstCombine/minmax-fp.ll b/llvm/test/Transforms/InstCombine/minmax-fp.ll
index b9e46caa63753a096907294f6e2d6e2f89b249a4..1276b7b3e3867daf6db6c81ca8ae675e94eaf813 100644
--- a/llvm/test/Transforms/InstCombine/minmax-fp.ll
+++ b/llvm/test/Transforms/InstCombine/minmax-fp.ll
@@ -160,8 +160,7 @@ define i8 @t9(float %a) {
 ; Either operand could be NaN, but fast modifier applied.
 define i8 @t11(float %a, float %b) {
 ; CHECK-LABEL: @t11(
-; CHECK-NEXT:    [[DOTINV:%.*]] = fcmp fast oge float [[B:%.*]], [[A:%.*]]
-; CHECK-NEXT:    [[DOTV:%.*]] = select fast i1 [[DOTINV]], float [[A]], float [[B]]
+; CHECK-NEXT:    [[DOTV:%.*]] = call fast float @llvm.minnum.f32(float [[B:%.*]], float [[A:%.*]])
 ; CHECK-NEXT:    [[TMP1:%.*]] = fptosi float [[DOTV]] to i8
 ; CHECK-NEXT:    ret i8 [[TMP1]]
 ;
@@ -282,8 +281,7 @@ define float @fneg_fmax(float %x, float %y) {
 
 define <2 x float> @fsub_fmax(<2 x float> %x, <2 x float> %y) {
 ; CHECK-LABEL: @fsub_fmax(
-; CHECK-NEXT:    [[COND_INV:%.*]] = fcmp nnan nsz ogt <2 x float> [[X:%.*]], [[Y:%.*]]
-; CHECK-NEXT:    [[MAX_V:%.*]] = select nnan nsz <2 x i1> [[COND_INV]], <2 x float> [[Y]], <2 x float> [[X]]
+; CHECK-NEXT:    [[MAX_V:%.*]] = call nnan nsz <2 x float> @llvm.minnum.v2f32(<2 x float> [[X:%.*]], <2 x float> [[Y:%.*]])
 ; CHECK-NEXT:    [[MAX:%.*]] = fneg <2 x float> [[MAX_V]]
 ; CHECK-NEXT:    ret <2 x float> [[MAX]]
 ;
@@ -310,8 +308,7 @@ define <2 x double> @fsub_fmin(<2 x double> %x, <2 x double> %y) {
 
 define double @fneg_fmin(double %x, double %y) {
 ; CHECK-LABEL: @fneg_fmin(
-; CHECK-NEXT:    [[COND_INV:%.*]] = fcmp nnan nsz olt double [[X:%.*]], [[Y:%.*]]
-; CHECK-NEXT:    [[MAX_V:%.*]] = select nnan nsz i1 [[COND_INV]], double [[Y]], double [[X]]
+; CHECK-NEXT:    [[MAX_V:%.*]] = call nnan nsz double @llvm.maxnum.f64(double [[X:%.*]], double [[Y:%.*]])
 ; CHECK-NEXT:    [[MAX:%.*]] = fneg double [[MAX_V]]
 ; CHECK-NEXT:    ret double [[MAX]]
 ;
diff --git a/llvm/test/Transforms/InstCombine/phi-known-bits-operand-order.ll b/llvm/test/Transforms/InstCombine/phi-known-bits-operand-order.ll
index 6a054688753500dac87776d7936aa725989c36dc..9ff093635ad7935ca35e2febc3e4b5526e9cc55d 100644
--- a/llvm/test/Transforms/InstCombine/phi-known-bits-operand-order.ll
+++ b/llvm/test/Transforms/InstCombine/phi-known-bits-operand-order.ll
@@ -20,7 +20,7 @@ define void @phi_recurrence_start_first() {
 ; CHECK-NEXT:    br i1 [[COND_V2]], label [[FOR_COND11:%.*]], label [[FOR_COND26]]
 ; CHECK:       for.cond11:
 ; CHECK-NEXT:    [[I_1:%.*]] = phi i32 [ [[START]], [[IF_THEN]] ], [ [[STEP:%.*]], [[FOR_COND11]] ]
-; CHECK-NEXT:    [[CMP13:%.*]] = icmp ult i32 [[I_1]], 100
+; CHECK-NEXT:    [[CMP13:%.*]] = icmp samesign ult i32 [[I_1]], 100
 ; CHECK-NEXT:    [[STEP]] = add nuw nsw i32 [[I_1]], 1
 ; CHECK-NEXT:    br i1 [[CMP13]], label [[FOR_COND11]], label [[WHILE_END]]
 ; CHECK:       for.cond26:
@@ -68,7 +68,7 @@ define void @phi_recurrence_step_first() {
 ; CHECK-NEXT:    br i1 [[COND_V2]], label [[FOR_COND11:%.*]], label [[FOR_COND26]]
 ; CHECK:       for.cond11:
 ; CHECK-NEXT:    [[I_1:%.*]] = phi i32 [ [[STEP:%.*]], [[FOR_COND11]] ], [ [[START]], [[IF_THEN]] ]
-; CHECK-NEXT:    [[CMP13:%.*]] = icmp ult i32 [[I_1]], 100
+; CHECK-NEXT:    [[CMP13:%.*]] = icmp samesign ult i32 [[I_1]], 100
 ; CHECK-NEXT:    [[STEP]] = add nuw nsw i32 [[I_1]], 1
 ; CHECK-NEXT:    br i1 [[CMP13]], label [[FOR_COND11]], label [[WHILE_END]]
 ; CHECK:       for.cond26:
diff --git a/llvm/test/Transforms/InstCombine/pr100298.ll b/llvm/test/Transforms/InstCombine/pr100298.ll
index 6cf2a71bb916e5dc6e1f82ae4ed61448495b804b..eff67edd2b032eef256f739824cf7fb11e810f53 100644
--- a/llvm/test/Transforms/InstCombine/pr100298.ll
+++ b/llvm/test/Transforms/InstCombine/pr100298.ll
@@ -11,11 +11,11 @@ define i16 @pr100298() {
 ; CHECK-NEXT:    [[INDVAR:%.*]] = phi i32 [ -15, %[[ENTRY]] ], [ [[MASK:%.*]], %[[FOR_INC]] ]
 ; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[INDVAR]], 9
 ; CHECK-NEXT:    [[MASK]] = and i32 [[ADD]], 65535
-; CHECK-NEXT:    [[CMP1:%.*]] = icmp ugt i32 [[MASK]], 5
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp samesign ugt i32 [[MASK]], 5
 ; CHECK-NEXT:    br i1 [[CMP1]], label %[[FOR_INC]], label %[[FOR_END:.*]]
 ; CHECK:       [[FOR_END]]:
 ; CHECK-NEXT:    [[CONV:%.*]] = trunc i32 [[ADD]] to i16
-; CHECK-NEXT:    [[CMP2:%.*]] = icmp ugt i32 [[MASK]], 3
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp samesign ugt i32 [[MASK]], 3
 ; CHECK-NEXT:    [[SHL:%.*]] = shl nuw i16 [[CONV]], 14
 ; CHECK-NEXT:    [[RES:%.*]] = select i1 [[CMP2]], i16 [[CONV]], i16 [[SHL]]
 ; CHECK-NEXT:    ret i16 [[RES]]
diff --git a/llvm/test/Transforms/InstCombine/pr63791.ll b/llvm/test/Transforms/InstCombine/pr63791.ll
index 73a559f9892612f49e69be7d2bcf0d75bd7b4a08..78cc1130fb33f0cdbab6c6ba5050681b696693ff 100644
--- a/llvm/test/Transforms/InstCombine/pr63791.ll
+++ b/llvm/test/Transforms/InstCombine/pr63791.ll
@@ -15,7 +15,7 @@ define void @y() {
 ; CHECK-NEXT:    store i1 true, ptr poison, align 1
 ; CHECK-NEXT:    br i1 poison, label [[FOR_COND_I]], label [[FOR_COND5_PREHEADER_I]]
 ; CHECK:       for.cond5.preheader.i:
-; CHECK-NEXT:    br i1 true, label [[FOR_COND1_LOOPEXIT_I:%.*]], label [[FOR_INC19_I:%.*]]
+; CHECK-NEXT:    br i1 false, label [[FOR_INC19_I:%.*]], label [[FOR_COND1_LOOPEXIT_I:%.*]]
 ; CHECK:       for.inc19.i:
 ; CHECK-NEXT:    br i1 poison, label [[FOR_INC19_I]], label [[FOR_COND1_LOOPEXIT_I]]
 ;
diff --git a/llvm/test/Transforms/InstCombine/rem.ll b/llvm/test/Transforms/InstCombine/rem.ll
index 2cf56dfd50a8766eca33a38038719b6fe769aab9..4262ef85553b64bce62a380fa4d1fb61c23b013a 100644
--- a/llvm/test/Transforms/InstCombine/rem.ll
+++ b/llvm/test/Transforms/InstCombine/rem.ll
@@ -1045,7 +1045,7 @@ define <2 x i32> @PR62401(<2 x i1> %x, <2 x i32> %y) {
 define i16 @rem_pow2_or_zero(i16 %x, i16 %y) {
 ; CHECK-LABEL: @rem_pow2_or_zero(
 ; CHECK-NEXT:    [[POPCNT:%.*]] = call range(i16 1, 17) i16 @llvm.ctpop.i16(i16 [[Y:%.*]])
-; CHECK-NEXT:    [[COND:%.*]] = icmp ult i16 [[POPCNT]], 2
+; CHECK-NEXT:    [[COND:%.*]] = icmp samesign ult i16 [[POPCNT]], 2
 ; CHECK-NEXT:    tail call void @llvm.assume(i1 [[COND]])
 ; CHECK-NEXT:    [[TMP1:%.*]] = add i16 [[Y]], -1
 ; CHECK-NEXT:    [[REM:%.*]] = and i16 [[X:%.*]], [[TMP1]]
@@ -1130,7 +1130,7 @@ define i64 @rem_pow2_or_zero_domcond(i64 %a, i64 %b) {
 ; CHECK-LABEL: @rem_pow2_or_zero_domcond(
 ; CHECK-NEXT:  start:
 ; CHECK-NEXT:    [[CPOP:%.*]] = call range(i64 0, 65) i64 @llvm.ctpop.i64(i64 [[B:%.*]])
-; CHECK-NEXT:    [[COND:%.*]] = icmp ult i64 [[CPOP]], 2
+; CHECK-NEXT:    [[COND:%.*]] = icmp samesign ult i64 [[CPOP]], 2
 ; CHECK-NEXT:    br i1 [[COND]], label [[BB1:%.*]], label [[BB2:%.*]]
 ; CHECK:       bb1:
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[B]], -1
diff --git a/llvm/test/Transforms/InstCombine/remove-loop-phi-multiply-by-zero.ll b/llvm/test/Transforms/InstCombine/remove-loop-phi-multiply-by-zero.ll
index 4123bc5557899a65acc5002963cda7a898f11788..56580a8b68c68120537821f3a3c715d0e7e127e6 100644
--- a/llvm/test/Transforms/InstCombine/remove-loop-phi-multiply-by-zero.ll
+++ b/llvm/test/Transforms/InstCombine/remove-loop-phi-multiply-by-zero.ll
@@ -7,7 +7,7 @@ define double @test_mul_fast_flags(ptr %arr_d) {
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[I_02:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
 ; CHECK-NEXT:    [[INC]] = add nuw nsw i64 [[I_02]], 1
-; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i64 [[I_02]], 999
+; CHECK-NEXT:    [[CMP:%.*]] = icmp samesign ult i64 [[I_02]], 999
 ; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[END:%.*]]
 ; CHECK:       end:
 ; CHECK-NEXT:    ret double 0.000000e+00
@@ -37,7 +37,7 @@ define double @test_nsz_nnan_flags_enabled(ptr %arr_d) {
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[I_02:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
 ; CHECK-NEXT:    [[INC]] = add nuw nsw i64 [[I_02]], 1
-; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i64 [[I_02]], 999
+; CHECK-NEXT:    [[CMP:%.*]] = icmp samesign ult i64 [[I_02]], 999
 ; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[END:%.*]]
 ; CHECK:       end:
 ; CHECK-NEXT:    ret double 0.000000e+00
@@ -71,7 +71,7 @@ define double @test_nnan_flag_enabled(ptr %arr_d) {
 ; CHECK-NEXT:    [[TMP0:%.*]] = load double, ptr [[ARRAYIDX]], align 8
 ; CHECK-NEXT:    [[MUL]] = fmul nnan double [[F_PROD_01]], [[TMP0]]
 ; CHECK-NEXT:    [[INC]] = add nuw nsw i64 [[I_02]], 1
-; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i64 [[I_02]], 999
+; CHECK-NEXT:    [[CMP:%.*]] = icmp samesign ult i64 [[I_02]], 999
 ; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[END:%.*]]
 ; CHECK:       end:
 ; CHECK-NEXT:    ret double [[MUL]]
@@ -105,7 +105,7 @@ define double @test_ninf_flag_enabled(ptr %arr_d) {
 ; CHECK-NEXT:    [[TMP0:%.*]] = load double, ptr [[ARRAYIDX]], align 8
 ; CHECK-NEXT:    [[MUL]] = fmul ninf double [[F_PROD_01]], [[TMP0]]
 ; CHECK-NEXT:    [[INC]] = add nuw nsw i64 [[I_02]], 1
-; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i64 [[I_02]], 999
+; CHECK-NEXT:    [[CMP:%.*]] = icmp samesign ult i64 [[I_02]], 999
 ; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[END:%.*]]
 ; CHECK:       end:
 ; CHECK-NEXT:    ret double [[MUL]]
@@ -139,7 +139,7 @@ define double @test_nsz_flag_enabled(ptr %arr_d) {
 ; CHECK-NEXT:    [[TMP0:%.*]] = load double, ptr [[ARRAYIDX]], align 8
 ; CHECK-NEXT:    [[MUL]] = fmul nsz double [[F_PROD_01]], [[TMP0]]
 ; CHECK-NEXT:    [[INC]] = add nuw nsw i64 [[I_02]], 1
-; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i64 [[I_02]], 999
+; CHECK-NEXT:    [[CMP:%.*]] = icmp samesign ult i64 [[I_02]], 999
 ; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[END:%.*]]
 ; CHECK:       end:
 ; CHECK-NEXT:    ret double [[MUL]]
@@ -173,7 +173,7 @@ define double @test_phi_initalise_to_non_zero(ptr %arr_d) {
 ; CHECK-NEXT:    [[TMP0:%.*]] = load double, ptr [[ARRAYIDX]], align 8
 ; CHECK-NEXT:    [[MUL]] = fmul fast double [[F_PROD_01]], [[TMP0]]
 ; CHECK-NEXT:    [[INC]] = add nuw nsw i64 [[I_02]], 1
-; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i64 [[I_02]], 999
+; CHECK-NEXT:    [[CMP:%.*]] = icmp samesign ult i64 [[I_02]], 999
 ; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[END:%.*]]
 ; CHECK:       end:
 ; CHECK-NEXT:    ret double [[MUL]]
@@ -285,7 +285,7 @@ define i32 @test_int_phi_operands(ptr %arr_d) {
 ; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
 ; CHECK-NEXT:    [[MUL]] = mul nsw i32 [[F_PROD_01]], [[TMP0]]
 ; CHECK-NEXT:    [[INC]] = add nuw nsw i64 [[I_02]], 1
-; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i64 [[I_02]], 999
+; CHECK-NEXT:    [[CMP:%.*]] = icmp samesign ult i64 [[I_02]], 999
 ; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[END:%.*]]
 ; CHECK:       end:
 ; CHECK-NEXT:    ret i32 [[MUL]]
@@ -319,7 +319,7 @@ define i32 @test_int_phi_operands_initalise_to_non_zero(ptr %arr_d) {
 ; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
 ; CHECK-NEXT:    [[MUL]] = mul i32 [[F_PROD_01]], [[TMP0]]
 ; CHECK-NEXT:    [[INC]] = add nuw nsw i64 [[I_02]], 1
-; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i64 [[I_02]], 999
+; CHECK-NEXT:    [[CMP:%.*]] = icmp samesign ult i64 [[I_02]], 999
 ; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[END:%.*]]
 ; CHECK:       end:
 ; CHECK-NEXT:    ret i32 [[MUL]]
diff --git a/llvm/test/Transforms/InstCombine/select-cmp.ll b/llvm/test/Transforms/InstCombine/select-cmp.ll
index 234815949d77d404e6730a51fb48c016b66b0ef5..f7505bd85f89eb2dfe8b3bdc3407a98b75acbaaf 100644
--- a/llvm/test/Transforms/InstCombine/select-cmp.ll
+++ b/llvm/test/Transforms/InstCombine/select-cmp.ll
@@ -480,6 +480,95 @@ define i1 @test_select_inverse_nonconst4(i64 %x, i64 %y, i64 %z, i1 %cond) {
   ret i1 %sel
 }
 
+define i1 @test_select_inverse_samesign_true_arm(i64 %x, i64 %y, i1 %cond) {
+; CHECK-LABEL: @test_select_inverse_samesign_true_arm(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp samesign ult i64 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp uge i64 [[X]], [[Y]]
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[COND:%.*]], i1 [[CMP1]], i1 [[CMP2]]
+; CHECK-NEXT:    ret i1 [[SEL]]
+;
+  %cmp1 = icmp samesign ult i64 %x, %y
+  %cmp2 = icmp uge i64 %x, %y
+  %sel = select i1 %cond, i1 %cmp1, i1 %cmp2
+  ret i1 %sel
+}
+
+define i1 @test_select_inverse_samesign_false_arm(i64 %x, i64 %y, i1 %cond) {
+; CHECK-LABEL: @test_select_inverse_samesign_false_arm(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp ult i64 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp samesign uge i64 [[X]], [[Y]]
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[COND:%.*]], i1 [[CMP1]], i1 [[CMP2]]
+; CHECK-NEXT:    ret i1 [[SEL]]
+;
+  %cmp1 = icmp ult i64 %x, %y
+  %cmp2 = icmp samesign uge i64 %x, %y
+  %sel = select i1 %cond, i1 %cmp1, i1 %cmp2
+  ret i1 %sel
+}
+
+define i1 @test_select_inverse_samesign_both(i64 %x, i64 %y, i1 %cond) {
+; CHECK-LABEL: @test_select_inverse_samesign_both(
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp samesign uge i64 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    [[SEL:%.*]] = xor i1 [[COND:%.*]], [[CMP2]]
+; CHECK-NEXT:    ret i1 [[SEL]]
+;
+  %cmp1 = icmp samesign ult i64 %x, %y
+  %cmp2 = icmp samesign uge i64 %x, %y
+  %sel = select i1 %cond, i1 %cmp1, i1 %cmp2
+  ret i1 %sel
+}
+
+define i1 @test_select_inverse_samesign_false_arm_rhsc_same_sign(i64 %x, i64 %y, i1 %cond) {
+; CHECK-LABEL: @test_select_inverse_samesign_false_arm_rhsc_same_sign(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp ult i64 [[X:%.*]], 11
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp samesign ugt i64 [[X]], 10
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[COND:%.*]], i1 [[CMP1]], i1 [[CMP2]]
+; CHECK-NEXT:    ret i1 [[SEL]]
+;
+  %cmp1 = icmp ult i64 %x, 11
+  %cmp2 = icmp samesign ugt i64 %x, 10
+  %sel = select i1 %cond, i1 %cmp1, i1 %cmp2
+  ret i1 %sel
+}
+
+define i1 @test_select_inverse_samesign_true_arm_rhsc_same_sign(i64 %x, i64 %y, i1 %cond) {
+; CHECK-LABEL: @test_select_inverse_samesign_true_arm_rhsc_same_sign(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp samesign ult i64 [[X:%.*]], 11
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp ugt i64 [[X]], 10
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[COND:%.*]], i1 [[CMP1]], i1 [[CMP2]]
+; CHECK-NEXT:    ret i1 [[SEL]]
+;
+  %cmp1 = icmp samesign ult i64 %x, 11
+  %cmp2 = icmp ugt i64 %x, 10
+  %sel = select i1 %cond, i1 %cmp1, i1 %cmp2
+  ret i1 %sel
+}
+
+define i1 @test_select_inverse_samesign_both_rhsc_same_sign(i64 %x, i64 %y, i1 %cond) {
+; CHECK-LABEL: @test_select_inverse_samesign_both_rhsc_same_sign(
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp samesign ugt i64 [[X:%.*]], 10
+; CHECK-NEXT:    [[SEL:%.*]] = xor i1 [[COND:%.*]], [[CMP2]]
+; CHECK-NEXT:    ret i1 [[SEL]]
+;
+  %cmp1 = icmp samesign ult i64 %x, 11
+  %cmp2 = icmp samesign ugt i64 %x, 10
+  %sel = select i1 %cond, i1 %cmp1, i1 %cmp2
+  ret i1 %sel
+}
+
+define i1 @test_select_inverse_samesign_both_rhsc_diff_sign(i64 %x, i64 %y, i1 %cond) {
+; CHECK-LABEL: @test_select_inverse_samesign_both_rhsc_diff_sign(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp samesign slt i64 [[X:%.*]], 0
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp samesign sgt i64 [[X]], -1
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[COND:%.*]], i1 [[CMP1]], i1 [[CMP2]]
+; CHECK-NEXT:    ret i1 [[SEL]]
+;
+  %cmp1 = icmp samesign slt i64 %x, 0
+  %cmp2 = icmp samesign sgt i64 %x, -1
+  %sel = select i1 %cond, i1 %cmp1, i1 %cmp2
+  ret i1 %sel
+}
+
 define i1 @sel_icmp_two_cmp(i1 %c, i32 %a1, i32 %a2, i32 %a3, i32 %a4) {
 ; CHECK-LABEL: @sel_icmp_two_cmp(
 ; CHECK-NEXT:    [[CMP1:%.*]] = icmp ule i32 [[A1:%.*]], [[A2:%.*]]
diff --git a/llvm/test/Transforms/InstCombine/select-icmp-xor.ll b/llvm/test/Transforms/InstCombine/select-icmp-xor.ll
new file mode 100644
index 0000000000000000000000000000000000000000..c8ce114a683eeef62aa70a4f05117190e8ed88ac
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/select-icmp-xor.ll
@@ -0,0 +1,190 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -passes=instcombine -S %s | FileCheck %s
+
+define i8 @select_icmp_eq_pow2(i8 %x) {
+; CHECK-LABEL: define i8 @select_icmp_eq_pow2(
+; CHECK-SAME: i8 [[X:%.*]]) {
+; CHECK-NEXT:    [[SEL:%.*]] = and i8 [[X]], -5
+; CHECK-NEXT:    ret i8 [[SEL]]
+;
+  %and = and i8 %x, 4
+  %icmp = icmp eq i8 %and, 0
+  %xor = xor i8 %x, 4
+  %sel = select i1 %icmp, i8 %x, i8 %xor
+  ret i8 %sel
+}
+
+define i8 @select_icmp_eq_pow2_flipped(i8 %x) {
+; CHECK-LABEL: define i8 @select_icmp_eq_pow2_flipped(
+; CHECK-SAME: i8 [[X:%.*]]) {
+; CHECK-NEXT:    [[SEL:%.*]] = or i8 [[X]], 4
+; CHECK-NEXT:    ret i8 [[SEL]]
+;
+  %and = and i8 %x, 4
+  %icmp = icmp eq i8 %and, 0
+  %xor = xor i8 %x, 4
+  %sel = select i1 %icmp, i8 %xor, i8 %x
+  ret i8 %sel
+}
+
+define i8 @select_icmp_eq_not_pow2(i8 %x) {
+; CHECK-LABEL: define i8 @select_icmp_eq_not_pow2(
+; CHECK-SAME: i8 [[X:%.*]]) {
+; CHECK-NEXT:    [[AND:%.*]] = and i8 [[X]], 5
+; CHECK-NEXT:    [[ICMP:%.*]] = icmp eq i8 [[AND]], 0
+; CHECK-NEXT:    [[XOR:%.*]] = xor i8 [[X]], 5
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[ICMP]], i8 [[X]], i8 [[XOR]]
+; CHECK-NEXT:    ret i8 [[SEL]]
+;
+  %and = and i8 %x, 5
+  %icmp = icmp eq i8 %and, 0
+  %xor = xor i8 %x, 5
+  %sel = select i1 %icmp, i8 %x, i8 %xor
+  ret i8 %sel
+}
+
+define i8 @select_icmp_ne_pow2(i8 %x) {
+; CHECK-LABEL: define i8 @select_icmp_ne_pow2(
+; CHECK-SAME: i8 [[X:%.*]]) {
+; CHECK-NEXT:    [[SEL:%.*]] = and i8 [[X]], -5
+; CHECK-NEXT:    ret i8 [[SEL]]
+;
+  %and = and i8 %x, 4
+  %icmp = icmp ne i8 %and, 0
+  %xor = xor i8 %x, 4
+  %sel = select i1 %icmp, i8 %xor, i8 %x
+  ret i8 %sel
+}
+
+define i8 @select_icmp_ne_pow2_flipped(i8 %x) {
+; CHECK-LABEL: define i8 @select_icmp_ne_pow2_flipped(
+; CHECK-SAME: i8 [[X:%.*]]) {
+; CHECK-NEXT:    [[SEL:%.*]] = or i8 [[X]], 4
+; CHECK-NEXT:    ret i8 [[SEL]]
+;
+  %and = and i8 %x, 4
+  %icmp = icmp ne i8 %and, 0
+  %xor = xor i8 %x, 4
+  %sel = select i1 %icmp, i8 %x, i8 %xor
+  ret i8 %sel
+}
+
+define i8 @select_icmp_ne_not_pow2(i8 %x) {
+; CHECK-LABEL: define i8 @select_icmp_ne_not_pow2(
+; CHECK-SAME: i8 [[X:%.*]]) {
+; CHECK-NEXT:    [[AND:%.*]] = and i8 [[X]], 5
+; CHECK-NEXT:    [[ICMP_NOT:%.*]] = icmp eq i8 [[AND]], 0
+; CHECK-NEXT:    [[XOR:%.*]] = xor i8 [[X]], 5
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[ICMP_NOT]], i8 [[X]], i8 [[XOR]]
+; CHECK-NEXT:    ret i8 [[SEL]]
+;
+  %and = and i8 %x, 5
+  %icmp = icmp ne i8 %and, 0
+  %xor = xor i8 %x, 5
+  %sel = select i1 %icmp, i8 %xor, i8 %x
+  ret i8 %sel
+}
+
+define i8 @select_icmp_slt_zero_smin(i8 %x) {
+; CHECK-LABEL: define i8 @select_icmp_slt_zero_smin(
+; CHECK-SAME: i8 [[X:%.*]]) {
+; CHECK-NEXT:    [[SEL:%.*]] = or i8 [[X]], -128
+; CHECK-NEXT:    ret i8 [[SEL]]
+;
+  %icmp = icmp slt i8 %x, 0
+  %xor = xor i8 %x, -128
+  %sel = select i1 %icmp, i8 %x, i8 %xor
+  ret i8 %sel
+}
+
+define i8 @select_icmp_slt_zero_smin_flipped(i8 %x) {
+; CHECK-LABEL: define i8 @select_icmp_slt_zero_smin_flipped(
+; CHECK-SAME: i8 [[X:%.*]]) {
+; CHECK-NEXT:    [[SEL:%.*]] = and i8 [[X]], 127
+; CHECK-NEXT:    ret i8 [[SEL]]
+;
+  %icmp = icmp slt i8 %x, 0
+  %xor = xor i8 %x, -128
+  %sel = select i1 %icmp, i8 %xor, i8 %x
+  ret i8 %sel
+}
+
+define i8 @select_icmp_slt_not_zero(i8 %x) {
+; CHECK-LABEL: define i8 @select_icmp_slt_not_zero(
+; CHECK-SAME: i8 [[X:%.*]]) {
+; CHECK-NEXT:    [[ICMP:%.*]] = icmp slt i8 [[X]], 1
+; CHECK-NEXT:    [[XOR:%.*]] = xor i8 [[X]], -128
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[ICMP]], i8 [[X]], i8 [[XOR]]
+; CHECK-NEXT:    ret i8 [[SEL]]
+;
+  %icmp = icmp slt i8 %x, 1
+  %xor = xor i8 %x, -128
+  %sel = select i1 %icmp, i8 %x, i8 %xor
+  ret i8 %sel
+}
+
+define i8 @select_icmp_slt_not_smin(i8 %x) {
+; CHECK-LABEL: define i8 @select_icmp_slt_not_smin(
+; CHECK-SAME: i8 [[X:%.*]]) {
+; CHECK-NEXT:    [[ICMP:%.*]] = icmp slt i8 [[X]], 0
+; CHECK-NEXT:    [[XOR:%.*]] = xor i8 [[X]], -127
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[ICMP]], i8 [[X]], i8 [[XOR]]
+; CHECK-NEXT:    ret i8 [[SEL]]
+;
+  %icmp = icmp slt i8 %x, 0
+  %xor = xor i8 %x, -127
+  %sel = select i1 %icmp, i8 %x, i8 %xor
+  ret i8 %sel
+}
+
+define i8 @select_icmp_sgt_allones_smin(i8 %x) {
+; CHECK-LABEL: define i8 @select_icmp_sgt_allones_smin(
+; CHECK-SAME: i8 [[X:%.*]]) {
+; CHECK-NEXT:    [[SEL:%.*]] = and i8 [[X]], 127
+; CHECK-NEXT:    ret i8 [[SEL]]
+;
+  %icmp = icmp sgt i8 %x, 255
+  %xor = xor i8 %x, -128
+  %sel = select i1 %icmp, i8 %x, i8 %xor
+  ret i8 %sel
+}
+
+define i8 @select_icmp_sgt_allones_smin_flipped(i8 %x) {
+; CHECK-LABEL: define i8 @select_icmp_sgt_allones_smin_flipped(
+; CHECK-SAME: i8 [[X:%.*]]) {
+; CHECK-NEXT:    [[SEL:%.*]] = or i8 [[X]], -128
+; CHECK-NEXT:    ret i8 [[SEL]]
+;
+  %icmp = icmp sgt i8 %x, 255
+  %xor = xor i8 %x, -128
+  %sel = select i1 %icmp, i8 %xor, i8 %x
+  ret i8 %sel
+}
+
+define i8 @select_icmp_sgt_not_allones(i8 %x) {
+; CHECK-LABEL: define i8 @select_icmp_sgt_not_allones(
+; CHECK-SAME: i8 [[X:%.*]]) {
+; CHECK-NEXT:    [[ICMP:%.*]] = icmp sgt i8 [[X]], -2
+; CHECK-NEXT:    [[XOR:%.*]] = xor i8 [[X]], -128
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[ICMP]], i8 [[X]], i8 [[XOR]]
+; CHECK-NEXT:    ret i8 [[SEL]]
+;
+  %icmp = icmp sgt i8 %x, 254
+  %xor = xor i8 %x, -128
+  %sel = select i1 %icmp, i8 %x, i8 %xor
+  ret i8 %sel
+}
+
+define i8 @select_icmp_sgt_not_smin(i8 %x) {
+; CHECK-LABEL: define i8 @select_icmp_sgt_not_smin(
+; CHECK-SAME: i8 [[X:%.*]]) {
+; CHECK-NEXT:    [[XOR:%.*]] = xor i8 [[X]], -127
+; CHECK-NEXT:    [[ICMP1:%.*]] = icmp slt i8 [[X]], 0
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[ICMP1]], i8 [[XOR]], i8 [[X]]
+; CHECK-NEXT:    ret i8 [[SEL]]
+;
+  %icmp = icmp sgt i8 %x, 255
+  %xor = xor i8 %x, -127
+  %sel = select i1 %icmp, i8 %x, i8 %xor
+  ret i8 %sel
+}
diff --git a/llvm/test/Transforms/InstCombine/simplify-libcalls.ll b/llvm/test/Transforms/InstCombine/simplify-libcalls.ll
index bb2728a103ec6cbb7a33167c4d11e9c0025ddec6..c4ae7e7e16bcf4300ed9af84157aa6f6f1df31b3 100644
--- a/llvm/test/Transforms/InstCombine/simplify-libcalls.ll
+++ b/llvm/test/Transforms/InstCombine/simplify-libcalls.ll
@@ -342,5 +342,17 @@ define signext i32 @emit_stpncpy() {
   ret i32 0
 }
 
+define void @simplify_memset_chk_pr112633(ptr %p, i32 %conv) {
+; CHECK-LABEL: @simplify_memset_chk_pr112633(
+; CHECK-NEXT:    [[CALL_I:%.*]] = tail call ptr @__memset_chk(ptr [[P:%.*]], i32 range(i32 0, 123) [[CONV:%.*]], i64 1, i64 1)
+; CHECK-NEXT:    ret void
+;
+  %call.i = tail call ptr @__memset_chk(ptr %p, i32 range(i32 0, 123) %conv, i64 1, i64 1)
+  ret void
+}
+
+declare ptr @__memset_chk(ptr, i32, i64,
i64) + + attributes #0 = { nobuiltin } attributes #1 = { builtin } diff --git a/llvm/test/Transforms/InstCombine/strchr-1.ll b/llvm/test/Transforms/InstCombine/strchr-1.ll index 0cedc3ad518137b6af03b4817d9a86892c896c7a..50ef6abb2107e77a0791c8f3c100820314c69c6e 100644 --- a/llvm/test/Transforms/InstCombine/strchr-1.ll +++ b/llvm/test/Transforms/InstCombine/strchr-1.ll @@ -86,7 +86,7 @@ define i1 @test_simplify7(i32 %C) { ; CHECK-LABEL: @test_simplify7( ; CHECK-NEXT: [[TMP1:%.*]] = trunc i32 [[C:%.*]] to i16 ; CHECK-NEXT: [[TMP2:%.*]] = and i16 [[TMP1]], 255 -; CHECK-NEXT: [[MEMCHR_BOUNDS:%.*]] = icmp ult i16 [[TMP2]], 16 +; CHECK-NEXT: [[MEMCHR_BOUNDS:%.*]] = icmp samesign ult i16 [[TMP2]], 16 ; CHECK-NEXT: [[TMP3:%.*]] = shl nuw i16 1, [[TMP2]] ; CHECK-NEXT: [[TMP4:%.*]] = and i16 [[TMP3]], 9217 ; CHECK-NEXT: [[MEMCHR_BITS:%.*]] = icmp ne i16 [[TMP4]], 0 diff --git a/llvm/test/Transforms/InstCombine/sub.ll b/llvm/test/Transforms/InstCombine/sub.ll index ba596e10e8b3fef65be7ae30e251f868baed5994..ff3f046607ec3a61ff78d0e45531f1bc92205115 100644 --- a/llvm/test/Transforms/InstCombine/sub.ll +++ b/llvm/test/Transforms/InstCombine/sub.ll @@ -2710,7 +2710,7 @@ if.else: define i64 @sub_infer_nuw_from_domcond_fold2(i32 range(i32 0, 2147483648) %x, i32 range(i32 0, 2147483648) %y) { ; CHECK-LABEL: @sub_infer_nuw_from_domcond_fold2( -; CHECK-NEXT: [[COND_NOT:%.*]] = icmp ult i32 [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: [[COND_NOT:%.*]] = icmp samesign ult i32 [[X:%.*]], [[Y:%.*]] ; CHECK-NEXT: br i1 [[COND_NOT]], label [[IF_ELSE:%.*]], label [[IF_THEN:%.*]] ; CHECK: if.then: ; CHECK-NEXT: [[SUB:%.*]] = sub nuw nsw i32 [[X]], [[Y]] @@ -2734,7 +2734,7 @@ if.else: define i1 @sub_infer_nuw_from_domcond_fold3(i16 %xx, i32 range(i32 0, 12) %y) { ; CHECK-LABEL: @sub_infer_nuw_from_domcond_fold3( ; CHECK-NEXT: [[X:%.*]] = zext i16 [[XX:%.*]] to i32 -; CHECK-NEXT: [[COND:%.*]] = icmp ugt i32 [[Y:%.*]], [[X]] +; CHECK-NEXT: [[COND:%.*]] = icmp samesign ugt i32 [[Y:%.*]], [[X]] ; CHECK-NEXT: br i1 [[COND]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] ; CHECK: if.then: ; CHECK-NEXT: ret i1 false diff --git a/llvm/test/Transforms/InstCombine/unordered-fcmp-select.ll b/llvm/test/Transforms/InstCombine/unordered-fcmp-select.ll index 62c12c15a075cd6470afff9942ea89beefc5dfc9..b164dd983a8925ca4c1b2191ad623d217efd7db3 100644 --- a/llvm/test/Transforms/InstCombine/unordered-fcmp-select.ll +++ b/llvm/test/Transforms/InstCombine/unordered-fcmp-select.ll @@ -115,7 +115,7 @@ define float @select_max_ugt_2_use_cmp(float %a, float %b) { ; CHECK-LABEL: @select_max_ugt_2_use_cmp( ; CHECK-NEXT: [[CMP:%.*]] = fcmp reassoc ugt float [[A:%.*]], [[B:%.*]] ; CHECK-NEXT: call void @foo(i1 [[CMP]]) -; CHECK-NEXT: [[SEL:%.*]] = select fast i1 [[CMP]], float [[A]], float [[B]] +; CHECK-NEXT: [[SEL:%.*]] = call fast float @llvm.maxnum.f32(float [[A]], float [[B]]) ; CHECK-NEXT: ret float [[SEL]] ; %cmp = fcmp reassoc ugt float %a, %b diff --git a/llvm/test/Transforms/LoopStrengthReduce/X86/pr76504.ll b/llvm/test/Transforms/LoopStrengthReduce/X86/pr76504.ll new file mode 100644 index 0000000000000000000000000000000000000000..94b9f7badb0f6d17495530591fc1048f0a222ea6 --- /dev/null +++ b/llvm/test/Transforms/LoopStrengthReduce/X86/pr76504.ll @@ -0,0 +1,30 @@ +; Reduced from https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=65323 by @RKSimon +; +; RUN: opt -S -passes=loop-reduce %s | FileCheck %s +; +; Make sure we don't trigger an assertion. 
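+; The getelementptr offsets below are built from urem and sub expressions of the induction variable %r23; this is the shape that used to trip the assertion in loop-reduce.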
+ +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +@G = external global i32 + +define void @foo() { +; CHECK-LABEL: foo +bb8: + br label %bb30 + +bb30: ; preds = %bb30, %bb8 + %l0 = phi i64 [ -2222, %bb8 ], [ %r23, %bb30 ] + %A22 = alloca i16, align 2 + %r23 = add nuw i64 1, %l0 + %G7 = getelementptr i16, ptr %A22, i64 %r23 + %B15 = urem i64 %r23, %r23 + %G6 = getelementptr i16, ptr %G7, i64 %B15 + %B1 = urem i64 %r23, %r23 + %B8 = sub i64 -1, %r23 + %B18 = sub i64 %B8, %B1 + %G5 = getelementptr i16, ptr %G6, i64 %B18 + store ptr %G5, ptr undef, align 8 + br label %bb30 +} diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll index 7f325ce1a1f04b3402a6b3fb19ea94bad3d8cef3..01fca39296da092d5213184dd0c0709a7aba2963 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll @@ -732,9 +732,20 @@ define void @multiple_exit_conditions(ptr %src, ptr noalias %dst) #1 { ; DEFAULT-LABEL: define void @multiple_exit_conditions( ; DEFAULT-SAME: ptr [[SRC:%.*]], ptr noalias [[DST:%.*]]) #[[ATTR2:[0-9]+]] { ; DEFAULT-NEXT: entry: -; DEFAULT-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; DEFAULT-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; DEFAULT-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 32 +; DEFAULT-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 257, [[TMP8]] +; DEFAULT-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; DEFAULT: vector.ph: -; DEFAULT-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[DST]], i64 2048 +; DEFAULT-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; DEFAULT-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 32 +; DEFAULT-NEXT: [[N_MOD_VF:%.*]] = urem i64 257, [[TMP3]] +; DEFAULT-NEXT: [[N_VEC:%.*]] = sub i64 257, [[N_MOD_VF]] +; DEFAULT-NEXT: [[TMP17:%.*]] = mul i64 [[N_VEC]], 8 +; DEFAULT-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP17]] +; DEFAULT-NEXT: [[IND_END1:%.*]] = mul i64 [[N_VEC]], 2 +; DEFAULT-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() +; DEFAULT-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 32 ; DEFAULT-NEXT: br label [[VECTOR_BODY:%.*]] ; DEFAULT: vector.body: ; DEFAULT-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -742,20 +753,39 @@ define void @multiple_exit_conditions(ptr %src, ptr noalias %dst) #1 { ; DEFAULT-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0 ; DEFAULT-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP0]] ; DEFAULT-NEXT: [[TMP1:%.*]] = load i16, ptr [[SRC]], align 2 -; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i16> poison, i16 [[TMP1]], i64 0 -; DEFAULT-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT]], <8 x i16> poison, <8 x i32> zeroinitializer -; DEFAULT-NEXT: [[TMP2:%.*]] = or <8 x i16> [[BROADCAST_SPLAT]], -; DEFAULT-NEXT: [[TMP3:%.*]] = uitofp <8 x i16> [[TMP2]] to <8 x double> +; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i16 [[TMP1]], i64 0 +; DEFAULT-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; DEFAULT-NEXT: [[TMP9:%.*]] = or [[BROADCAST_SPLAT]], shufflevector ( insertelement ( poison, i16 1, i64 0), poison, zeroinitializer) +; DEFAULT-NEXT: [[TMP10:%.*]] = or 
[[BROADCAST_SPLAT]], shufflevector ( insertelement ( poison, i16 1, i64 0), poison, zeroinitializer) +; DEFAULT-NEXT: [[TMP11:%.*]] = or [[BROADCAST_SPLAT]], shufflevector ( insertelement ( poison, i16 1, i64 0), poison, zeroinitializer) +; DEFAULT-NEXT: [[TMP12:%.*]] = or [[BROADCAST_SPLAT]], shufflevector ( insertelement ( poison, i16 1, i64 0), poison, zeroinitializer) +; DEFAULT-NEXT: [[TMP13:%.*]] = uitofp [[TMP9]] to +; DEFAULT-NEXT: [[TMP14:%.*]] = uitofp [[TMP10]] to +; DEFAULT-NEXT: [[TMP15:%.*]] = uitofp [[TMP11]] to +; DEFAULT-NEXT: [[TMP16:%.*]] = uitofp [[TMP12]] to ; DEFAULT-NEXT: [[TMP4:%.*]] = getelementptr double, ptr [[NEXT_GEP]], i32 0 -; DEFAULT-NEXT: store <8 x double> [[TMP3]], ptr [[TMP4]], align 8 -; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 -; DEFAULT-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256 -; DEFAULT-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] +; DEFAULT-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64() +; DEFAULT-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 8 +; DEFAULT-NEXT: [[TMP20:%.*]] = getelementptr double, ptr [[NEXT_GEP]], i64 [[TMP19]] +; DEFAULT-NEXT: [[TMP21:%.*]] = call i64 @llvm.vscale.i64() +; DEFAULT-NEXT: [[TMP22:%.*]] = mul i64 [[TMP21]], 16 +; DEFAULT-NEXT: [[TMP23:%.*]] = getelementptr double, ptr [[NEXT_GEP]], i64 [[TMP22]] +; DEFAULT-NEXT: [[TMP24:%.*]] = call i64 @llvm.vscale.i64() +; DEFAULT-NEXT: [[TMP25:%.*]] = mul i64 [[TMP24]], 24 +; DEFAULT-NEXT: [[TMP26:%.*]] = getelementptr double, ptr [[NEXT_GEP]], i64 [[TMP25]] +; DEFAULT-NEXT: store [[TMP13]], ptr [[TMP4]], align 8 +; DEFAULT-NEXT: store [[TMP14]], ptr [[TMP20]], align 8 +; DEFAULT-NEXT: store [[TMP15]], ptr [[TMP23]], align 8 +; DEFAULT-NEXT: store [[TMP16]], ptr [[TMP26]], align 8 +; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]] +; DEFAULT-NEXT: [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; DEFAULT-NEXT: br i1 [[TMP27]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] ; DEFAULT: middle.block: -; DEFAULT-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]] +; DEFAULT-NEXT: [[CMP_N:%.*]] = icmp eq i64 257, [[N_VEC]] +; DEFAULT-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; DEFAULT: scalar.ph: ; DEFAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[DST]], [[ENTRY:%.*]] ] -; DEFAULT-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i64 [ 512, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; DEFAULT-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i64 [ [[IND_END1]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] ; DEFAULT-NEXT: br label [[LOOP:%.*]] ; DEFAULT: vector.scevcheck: ; DEFAULT-NEXT: unreachable @@ -780,7 +810,7 @@ define void @multiple_exit_conditions(ptr %src, ptr noalias %dst) #1 { ; PRED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; PRED: vector.ph: ; PRED-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; PRED-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 +; PRED-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8 ; PRED-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], 1 ; PRED-NEXT: [[N_RND_UP:%.*]] = add i64 257, [[TMP2]] ; PRED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] @@ -789,31 +819,31 @@ define void @multiple_exit_conditions(ptr %src, ptr noalias %dst) #1 { ; PRED-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP3]] ; PRED-NEXT: [[IND_END1:%.*]] = mul i64 [[N_VEC]], 2 ; PRED-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() -; PRED-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 2 +; PRED-NEXT: 
[[TMP5:%.*]] = mul i64 [[TMP4]], 8 ; PRED-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() -; PRED-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 2 +; PRED-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 8 ; PRED-NEXT: [[TMP8:%.*]] = sub i64 257, [[TMP7]] ; PRED-NEXT: [[TMP9:%.*]] = icmp ugt i64 257, [[TMP7]] ; PRED-NEXT: [[TMP10:%.*]] = select i1 [[TMP9]], i64 [[TMP8]], i64 0 -; PRED-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 257) +; PRED-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 257) ; PRED-NEXT: br label [[VECTOR_BODY:%.*]] ; PRED: vector.body: ; PRED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; PRED-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] +; PRED-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] ; PRED-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 8 ; PRED-NEXT: [[TMP11:%.*]] = add i64 [[OFFSET_IDX]], 0 ; PRED-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP11]] ; PRED-NEXT: [[TMP12:%.*]] = load i16, ptr [[SRC]], align 2 -; PRED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i16 [[TMP12]], i64 0 -; PRED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; PRED-NEXT: [[TMP13:%.*]] = or [[BROADCAST_SPLAT]], shufflevector ( insertelement ( poison, i16 1, i64 0), poison, zeroinitializer) -; PRED-NEXT: [[TMP14:%.*]] = uitofp [[TMP13]] to +; PRED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i16 [[TMP12]], i64 0 +; PRED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; PRED-NEXT: [[TMP13:%.*]] = or [[BROADCAST_SPLAT]], shufflevector ( insertelement ( poison, i16 1, i64 0), poison, zeroinitializer) +; PRED-NEXT: [[TMP14:%.*]] = uitofp [[TMP13]] to ; PRED-NEXT: [[TMP15:%.*]] = getelementptr double, ptr [[NEXT_GEP]], i32 0 -; PRED-NEXT: call void @llvm.masked.store.nxv2f64.p0( [[TMP14]], ptr [[TMP15]], i32 8, [[ACTIVE_LANE_MASK]]) +; PRED-NEXT: call void @llvm.masked.store.nxv8f64.p0( [[TMP14]], ptr [[TMP15]], i32 8, [[ACTIVE_LANE_MASK]]) ; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP5]] -; PRED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX]], i64 [[TMP10]]) -; PRED-NEXT: [[TMP16:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; PRED-NEXT: [[TMP17:%.*]] = extractelement [[TMP16]], i32 0 +; PRED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX]], i64 [[TMP10]]) +; PRED-NEXT: [[TMP16:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; PRED-NEXT: [[TMP17:%.*]] = extractelement [[TMP16]], i32 0 ; PRED-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]] ; PRED: middle.block: ; PRED-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs.ll index 8080c3a9ba0a7d11d7b0da3670f2bf4744ce4463..36eee8d0c98ceaed138f72d22c04201971fd0b97 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs.ll +++ 
b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs.ll @@ -110,7 +110,6 @@ define i64 @pointer_induction_only(ptr %start, ptr %end) { ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[OFFSET_IDX]], 0 ; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i32, ptr [[NEXT_GEP]], i32 0 ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i32, ptr [[NEXT_GEP]], i32 2 ; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <2 x i32>, ptr [[TMP7]], align 1 ; CHECK-NEXT: [[TMP9]] = zext <2 x i32> [[WIDE_LOAD4]] to <2 x i64> @@ -169,7 +168,6 @@ define i64 @int_and_pointer_iv(ptr %start, i32 %N) { ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0 ; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i32, ptr [[NEXT_GEP]], i32 0 ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i32, ptr [[NEXT_GEP]], i32 4 ; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i32>, ptr [[TMP3]], align 4 ; CHECK-NEXT: [[TMP5]] = zext <4 x i32> [[WIDE_LOAD3]] to <4 x i64> diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization-cost-tuning.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization-cost-tuning.ll index f28f77bf1b15582c2970f28b8d1bd2c4f4e79892..59da1e10fd2a07c3d2172429a9a565626dfee6a2 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization-cost-tuning.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization-cost-tuning.ll @@ -1,23 +1,23 @@ ; REQUIRES: asserts ; RUN: opt -mtriple=aarch64 -mattr=+sve \ ; RUN: -force-target-instruction-cost=1 -passes=loop-vectorize -S -debug-only=loop-vectorize < %s 2>&1 \ -; RUN: | FileCheck %s --check-prefixes=GENERIC,VF-VSCALE4 +; RUN: | FileCheck %s --check-prefixes=GENERIC,VF-VSCALE16 ; RUN: opt -mtriple=aarch64 -mattr=+sve -mcpu=generic \ ; RUN: -force-target-instruction-cost=1 -passes=loop-vectorize -S -debug-only=loop-vectorize < %s 2>&1 \ -; RUN: | FileCheck %s --check-prefixes=GENERIC,VF-VSCALE4 +; RUN: | FileCheck %s --check-prefixes=GENERIC,VF-VSCALE16 ; RUN: opt -mtriple=aarch64 -mcpu=neoverse-v1 \ ; RUN: -force-target-instruction-cost=1 -passes=loop-vectorize -S -debug-only=loop-vectorize < %s 2>&1 \ -; RUN: | FileCheck %s --check-prefixes=NEOVERSE-V1,VF-VSCALE4 +; RUN: | FileCheck %s --check-prefixes=NEOVERSE-V1,VF-VSCALE16 ; RUN: opt -mtriple=aarch64 -mcpu=neoverse-n2 \ ; RUN: -force-target-instruction-cost=1 -passes=loop-vectorize -S -debug-only=loop-vectorize < %s 2>&1 \ -; RUN: | FileCheck %s --check-prefixes=NEOVERSE-N2,VF-VSCALE4 +; RUN: | FileCheck %s --check-prefixes=NEOVERSE-N2,VF-VSCALE16 ; RUN: opt -mtriple=aarch64 -mcpu=neoverse-n2 \ ; RUN: -force-target-instruction-cost=1 -passes=loop-vectorize -S -debug-only=loop-vectorize < %s 2>&1 \ -; RUN: | FileCheck %s --check-prefixes=NEOVERSE-N2,VF-VSCALE4 +; RUN: | FileCheck %s --check-prefixes=NEOVERSE-N2,VF-VSCALE16 ; GENERIC: LV: Vector loop of width vscale x 2 costs: 3 (assuming a minimum vscale of 2). ; GENERIC: LV: Vector loop of width vscale x 4 costs: 1 (assuming a minimum vscale of 2). @@ -29,7 +29,7 @@ ; NEOVERSE-N2: LV: Vector loop of width vscale x 4 costs: 3 (assuming a minimum vscale of 1). 
; VF-4: <4 x i32> -; VF-VSCALE4: <16 x i32> +; VF-VSCALE16: <vscale x 16 x i32> define void @test0(ptr %a, ptr %b, ptr %c) #0 { entry: br label %loop diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization.ll index e83eb729b521c2e5831aafa62d0e108ee45a0ae4..a84932a2290d67e16402af030c345ab7168f0725 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization.ll @@ -8,8 +8,8 @@ ; (maximized bandwidth for i8 in the loop). define void @test0(ptr %a, ptr %b, ptr %c) #0 { ; CHECK: LV: Checking a loop in 'test0' -; CHECK_SCALABLE_ON: LV: Found feasible scalable VF = vscale x 4 -; CHECK_SCALABLE_ON: LV: Selecting VF: 16 +; CHECK_SCALABLE_ON: LV: Found feasible scalable VF = vscale x 16 +; CHECK_SCALABLE_ON: LV: Selecting VF: vscale x 16 ; CHECK_SCALABLE_DISABLED-NOT: LV: Found feasible scalable VF ; CHECK_SCALABLE_DISABLED: LV: Selecting VF: 16 ; CHECK_SCALABLE_ON_MAXBW: LV: Found feasible scalable VF = vscale x 16 diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/simple_early_exit.ll b/llvm/test/Transforms/LoopVectorize/AArch64/simple_early_exit.ll new file mode 100644 index 0000000000000000000000000000000000000000..82556bdd2a5ec1772bff7b7f071a3aa4327c3c34 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/AArch64/simple_early_exit.ll @@ -0,0 +1,333 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt -S < %s -p loop-vectorize | FileCheck %s --check-prefixes=CHECK + +target triple = "aarch64-unknown-linux-gnu" + +declare void @init_mem(ptr, i64); + +define i64 @same_exit_block_pre_inc_use1() #1 { +; CHECK-LABEL: define i64 @same_exit_block_pre_inc_use1( +; CHECK-SAME: ) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 4 +; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 4 +; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) +; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[INDEX2:%.*]] = phi i64 [ [[INDEX_NEXT1:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX2]] +; CHECK-NEXT: [[TMP38:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX2]] +; CHECK-NEXT: [[TMP39:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 +; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[TMP38]], [[TMP39]] +; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END:%.*]] +; CHECK: loop.inc: +; CHECK-NEXT: [[INDEX_NEXT1]] = add i64 [[INDEX2]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT1]], 67 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] +; CHECK: loop.end: +; CHECK-NEXT: [[START_0_LCSSA:%.*]] = phi i64 [ [[INDEX2]], [[LOOP]] ], [ 67, [[LOOP_INC]] ] +; CHECK-NEXT: ret i64 [[START_0_LCSSA]] +; +entry: + %p1 = alloca [1024 x i8] + %p2 = alloca [1024 x i8] + call void @init_mem(ptr %p1, i64 1024) + call void @init_mem(ptr %p2, i64 1024) + br label %loop + +loop: + %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] + %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index + %ld1 = load i8, ptr %arrayidx, align 1 + %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index + %ld2 = load i8, ptr %arrayidx1, align 1 + %cmp3 = icmp eq i8 %ld1, %ld2 + br i1 %cmp3, label
%loop.inc, label %loop.end + +loop.inc: + %index.next = add i64 %index, 1 + %exitcond = icmp ne i64 %index.next, 67 + br i1 %exitcond, label %loop, label %loop.end + +loop.end: + %retval = phi i64 [ %index, %loop ], [ 67, %loop.inc ] + ret i64 %retval +} + + +; In this example the early exit block appears in the list of ExitNotTaken +; SCEVs, but is not computable. +define i64 @same_exit_block_pre_inc_use4() { +; CHECK-LABEL: define i64 @same_exit_block_pre_inc_use4() { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i64], align 8 +; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i64], align 8 +; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) +; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) +; CHECK-NEXT: br label [[LOOP1:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[P1]], i64 [[INDEX]] +; CHECK-NEXT: [[LD1:%.*]] = load i64, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[CMP3:%.*]] = icmp ult i64 [[INDEX]], [[LD1]] +; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END:%.*]] +; CHECK: loop.inc: +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP1]], label [[LOOP_END]] +; CHECK: loop.end: +; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP1]] ], [ 67, [[LOOP_INC]] ] +; CHECK-NEXT: ret i64 [[RETVAL]] +; +entry: + %p1 = alloca [1024 x i64] + %p2 = alloca [1024 x i64] + call void @init_mem(ptr %p1, i64 1024) + call void @init_mem(ptr %p2, i64 1024) + br label %loop + +loop: + %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] + %arrayidx = getelementptr inbounds i64, ptr %p1, i64 %index + %ld1 = load i64, ptr %arrayidx, align 1 + %cmp3 = icmp ult i64 %index, %ld1 + br i1 %cmp3, label %loop.inc, label %loop.end + +loop.inc: + %index.next = add i64 %index, 1 + %exitcond = icmp ne i64 %index.next, 67 + br i1 %exitcond, label %loop, label %loop.end + +loop.end: + %retval = phi i64 [ %index, %loop ], [ 67, %loop.inc ] + ret i64 %retval +} + + +define i64 @loop_contains_safe_call() #1 { +; CHECK-LABEL: define i64 @loop_contains_safe_call( +; CHECK-SAME: ) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 4 +; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 4 +; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) +; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) +; CHECK-NEXT: br label [[LOOP1:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[INDEX2:%.*]] = phi i64 [ [[INDEX_NEXT1:%.*]], [[LOOP_INC1:%.*]] ], [ 3, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[P1]], i64 [[INDEX2]] +; CHECK-NEXT: [[LD1:%.*]] = load float, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[SQRT:%.*]] = tail call fast float @llvm.sqrt.f32(float [[LD1]]) +; CHECK-NEXT: [[CMP:%.*]] = fcmp fast ult float [[SQRT]], 3.000000e+00 +; CHECK-NEXT: br i1 [[CMP]], label [[LOOP_INC1]], label [[LOOP_END:%.*]] +; CHECK: loop.inc: +; CHECK-NEXT: [[INDEX_NEXT1]] = add i64 [[INDEX2]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT1]], 67 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP1]], label [[LOOP_END]] +; CHECK: loop.end: +; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX2]], [[LOOP1]] ], [ 67, [[LOOP_INC1]] ] +; CHECK-NEXT: ret i64 [[RETVAL]] +; +entry: + %p1 = alloca [1024 x i8] + %p2 = alloca [1024 x i8] + call void @init_mem(ptr %p1, i64 
1024) + call void @init_mem(ptr %p2, i64 1024) + br label %loop + +loop: + %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] + %arrayidx = getelementptr inbounds float, ptr %p1, i64 %index + %ld1 = load float, ptr %arrayidx, align 1 + %sqrt = tail call fast float @llvm.sqrt.f32(float %ld1) + %cmp = fcmp fast ult float %sqrt, 3.0e+00 + br i1 %cmp, label %loop.inc, label %loop.end + +loop.inc: + %index.next = add i64 %index, 1 + %exitcond = icmp ne i64 %index.next, 67 + br i1 %exitcond, label %loop, label %loop.end + +loop.end: + %retval = phi i64 [ %index, %loop ], [ 67, %loop.inc ] + ret i64 %retval +} + + +define i64 @loop_contains_safe_div() #1 { +; CHECK-LABEL: define i64 @loop_contains_safe_div( +; CHECK-SAME: ) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 4 +; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 4 +; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) +; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[P1]], i64 [[INDEX]] +; CHECK-NEXT: [[LD1:%.*]] = load i32, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[DIV:%.*]] = udiv i32 [[LD1]], 20000 +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[DIV]], 1 +; CHECK-NEXT: br i1 [[CMP]], label [[LOOP_INC]], label [[LOOP_END:%.*]] +; CHECK: loop.inc: +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] +; CHECK: loop.end: +; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ 67, [[LOOP_INC]] ] +; CHECK-NEXT: ret i64 [[RETVAL]] +; +entry: + %p1 = alloca [1024 x i8] + %p2 = alloca [1024 x i8] + call void @init_mem(ptr %p1, i64 1024) + call void @init_mem(ptr %p2, i64 1024) + br label %loop + +loop: + %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] + %arrayidx = getelementptr inbounds i32, ptr %p1, i64 %index + %ld1 = load i32, ptr %arrayidx, align 1 + %div = udiv i32 %ld1, 20000 + %cmp = icmp eq i32 %div, 1 + br i1 %cmp, label %loop.inc, label %loop.end + +loop.inc: + %index.next = add i64 %index, 1 + %exitcond = icmp ne i64 %index.next, 67 + br i1 %exitcond, label %loop, label %loop.end + +loop.end: + %retval = phi i64 [ %index, %loop ], [ 67, %loop.inc ] + ret i64 %retval +} + + +define i64 @loop_contains_load_after_early_exit(ptr dereferenceable(1024) align(8) %p2) { +; DEBUG-LABEL: LV: Checking a loop in 'loop_contains_load_after_early_exit' +; DEBUG: LV: Found an early exit loop with symbolic max backedge taken count: 63 +; DEBUG-NEXT: LV: We can vectorize this loop! +; DEBUG-NEXT: LV: Not vectorizing: Auto-vectorization of loops with uncountable early exit is not yet supported. 
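+; The load of %ld2 in loop.inc runs only when the early exit is not taken, yet it is safe to speculate: %p2 is dereferenceable(1024) with align 8, and indices 3..66 of i64 touch at most 536 bytes. That is what lets the legality check report "We can vectorize this loop!" above.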
+; CHECK-LABEL: define i64 @loop_contains_load_after_early_exit( +; CHECK-SAME: ptr align 8 dereferenceable(1024) [[P2:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 4 +; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) +; CHECK-NEXT: br label [[LOOP1:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[P1]], i64 [[INDEX]] +; CHECK-NEXT: [[LD1:%.*]] = load i32, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[LD1]], 1 +; CHECK-NEXT: br i1 [[CMP]], label [[LOOP_INC]], label [[LOOP_END:%.*]] +; CHECK: loop.inc: +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i64, ptr [[P2]], i64 [[INDEX]] +; CHECK-NEXT: [[LD2:%.*]] = load i64, ptr [[ARRAYIDX2]], align 8 +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP1]], label [[LOOP_END]] +; CHECK: loop.end: +; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP1]] ], [ [[LD2]], [[LOOP_INC]] ] +; CHECK-NEXT: ret i64 [[RETVAL]] +; +entry: + %p1 = alloca [1024 x i8] + call void @init_mem(ptr %p1, i64 1024) + br label %loop + +loop: + %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] + %arrayidx = getelementptr inbounds i32, ptr %p1, i64 %index + %ld1 = load i32, ptr %arrayidx, align 1 + %cmp = icmp eq i32 %ld1, 1 + br i1 %cmp, label %loop.inc, label %loop.end + +loop.inc: + %arrayidx2 = getelementptr inbounds i64, ptr %p2, i64 %index + %ld2 = load i64, ptr %arrayidx2, align 8 + %index.next = add i64 %index, 1 + %exitcond = icmp ne i64 %index.next, 67 + br i1 %exitcond, label %loop, label %loop.end + +loop.end: + %retval = phi i64 [ %index, %loop ], [ %ld2, %loop.inc ] + ret i64 %retval +} + + +; The form of the induction variables requires SCEV predicates. +define i32 @diff_exit_block_needs_scev_check(i32 %end) { +; DEBUG-LABEL: LV: Checking a loop in 'diff_exit_block_needs_scev_check' +; DEBUG: Found an early exit loop with symbolic max backedge taken count: (-1 + (1 umax (zext i10 (trunc i32 %end to i10) to i32))) +; DEBUG-NEXT: LV: We can vectorize this loop! +; DEBUG-NEXT: LV: Not vectorizing: Auto-vectorization of loops with uncountable early exit is not yet supported. 
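+; %ind is only an i8 while %end.clamped is a 10-bit value, so SCEV needs wrap predicates on the narrow IV (note the i10 trunc/zext in the symbolic max backedge-taken count above) before a trip count can be formed.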
+; CHECK-LABEL: define i32 @diff_exit_block_needs_scev_check( +; CHECK-SAME: i32 [[END:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i32], align 4 +; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i32], align 4 +; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) +; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) +; CHECK-NEXT: [[END_CLAMPED:%.*]] = and i32 [[END]], 1023 +; CHECK-NEXT: br label [[FOR_BODY1:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[IND:%.*]] = phi i8 [ [[IND_NEXT:%.*]], [[FOR_INC:%.*]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[GEP_IND:%.*]] = phi i64 [ [[GEP_IND_NEXT:%.*]], [[FOR_INC]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[P1]], i64 [[GEP_IND]] +; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX1]], align 4 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[P2]], i64 [[GEP_IND]] +; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[CMP_EARLY:%.*]] = icmp eq i32 [[TMP0]], [[TMP1]] +; CHECK-NEXT: br i1 [[CMP_EARLY]], label [[FOUND:%.*]], label [[FOR_INC]] +; CHECK: for.inc: +; CHECK-NEXT: [[IND_NEXT]] = add i8 [[IND]], 1 +; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[IND_NEXT]] to i32 +; CHECK-NEXT: [[GEP_IND_NEXT]] = add i64 [[GEP_IND]], 1 +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[CONV]], [[END_CLAMPED]] +; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY1]], label [[EXIT:%.*]] +; CHECK: found: +; CHECK-NEXT: ret i32 1 +; CHECK: exit: +; CHECK-NEXT: ret i32 0 +; +entry: + %p1 = alloca [1024 x i32] + %p2 = alloca [1024 x i32] + call void @init_mem(ptr %p1, i64 1024) + call void @init_mem(ptr %p2, i64 1024) + %end.clamped = and i32 %end, 1023 + br label %for.body + +for.body: + %ind = phi i8 [ %ind.next, %for.inc ], [ 0, %entry ] + %gep.ind = phi i64 [ %gep.ind.next, %for.inc ], [ 0, %entry ] + %arrayidx1 = getelementptr inbounds i32, ptr %p1, i64 %gep.ind + %0 = load i32, ptr %arrayidx1, align 4 + %arrayidx2 = getelementptr inbounds i32, ptr %p2, i64 %gep.ind + %1 = load i32, ptr %arrayidx2, align 4 + %cmp.early = icmp eq i32 %0, %1 + br i1 %cmp.early, label %found, label %for.inc + +for.inc: + %ind.next = add i8 %ind, 1 + %conv = zext i8 %ind.next to i32 + %gep.ind.next = add i64 %gep.ind, 1 + %cmp = icmp ult i32 %conv, %end.clamped + br i1 %cmp, label %for.body, label %exit + +found: + ret i32 1 + +exit: + ret i32 0 +} + + +declare i32 @foo(i32) readonly +declare @foo_vec() + +attributes #0 = { "vector-function-abi-variant"="_ZGVsNxv_foo(foo_vec)" } +attributes #1 = { "target-features"="+sve" vscale_range(1,16) } diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/store-costs-sve.ll b/llvm/test/Transforms/LoopVectorize/AArch64/store-costs-sve.ll index 7d2fc348480a0966c71bfc5cd514c0590b6e3384..a4861ad0b2619686af15c965fabe441d6af76d1e 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/store-costs-sve.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/store-costs-sve.ll @@ -145,7 +145,7 @@ define void @trunc_store(ptr %dst, ptr %src, i16 %x) #1 { ; DEFAULT-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], i16 [[X:%.*]]) #[[ATTR1:[0-9]+]] { ; DEFAULT-NEXT: iter.check: ; DEFAULT-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; DEFAULT-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 +; DEFAULT-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8 ; DEFAULT-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 0, [[TMP1]] ; DEFAULT-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; DEFAULT: vector.memcheck: @@ -155,59 +155,72 @@ define 
void @trunc_store(ptr %dst, ptr %src, i16 %x) #1 { ; DEFAULT-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] ; DEFAULT-NEXT: br i1 [[FOUND_CONFLICT]], label [[VEC_EPILOG_SCALAR_PH]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] ; DEFAULT: vector.main.loop.iter.check: -; DEFAULT-NEXT: br i1 true, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] +; DEFAULT-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() +; DEFAULT-NEXT: [[TMP3:%.*]] = mul i64 [[TMP9]], 32 +; DEFAULT-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 0, [[TMP3]] +; DEFAULT-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] ; DEFAULT: vector.ph: -; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <16 x i16> poison, i16 [[X]], i64 0 -; DEFAULT-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <16 x i16> [[BROADCAST_SPLATINSERT3]], <16 x i16> poison, <16 x i32> zeroinitializer -; DEFAULT-NEXT: [[TMP7:%.*]] = trunc <16 x i16> [[BROADCAST_SPLAT4]] to <16 x i8> +; DEFAULT-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; DEFAULT-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 32 +; DEFAULT-NEXT: [[N_MOD_VF1:%.*]] = urem i64 0, [[TMP5]] +; DEFAULT-NEXT: [[N_VEC1:%.*]] = sub i64 0, [[N_MOD_VF1]] +; DEFAULT-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() +; DEFAULT-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 32 +; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i16 [[X]], i64 0 +; DEFAULT-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; DEFAULT-NEXT: [[TMP8:%.*]] = trunc [[BROADCAST_SPLAT]] to ; DEFAULT-NEXT: br label [[VECTOR_BODY:%.*]] ; DEFAULT: vector.body: ; DEFAULT-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; DEFAULT-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0 -; DEFAULT-NEXT: [[TMP4:%.*]] = load i64, ptr [[SRC]], align 8, !alias.scope [[META5:![0-9]+]] -; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <16 x i64> poison, i64 [[TMP4]], i64 0 -; DEFAULT-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <16 x i64> [[BROADCAST_SPLATINSERT1]], <16 x i64> poison, <16 x i32> zeroinitializer -; DEFAULT-NEXT: [[TMP5:%.*]] = trunc <16 x i64> [[BROADCAST_SPLAT2]] to <16 x i8> -; DEFAULT-NEXT: [[TMP8:%.*]] = and <16 x i8> [[TMP5]], [[TMP7]] -; DEFAULT-NEXT: [[TMP9:%.*]] = and <16 x i8> [[TMP5]], [[TMP7]] +; DEFAULT-NEXT: [[TMP14:%.*]] = load i64, ptr [[SRC]], align 8, !alias.scope [[META5:![0-9]+]] +; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement poison, i64 [[TMP14]], i64 0 +; DEFAULT-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector [[BROADCAST_SPLATINSERT2]], poison, zeroinitializer +; DEFAULT-NEXT: [[TMP11:%.*]] = trunc [[BROADCAST_SPLAT3]] to +; DEFAULT-NEXT: [[TMP22:%.*]] = and [[TMP11]], [[TMP8]] +; DEFAULT-NEXT: [[TMP13:%.*]] = and [[TMP11]], [[TMP8]] ; DEFAULT-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP2]] ; DEFAULT-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[TMP10]], i32 0 -; DEFAULT-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[TMP10]], i32 16 -; DEFAULT-NEXT: store <16 x i8> [[TMP8]], ptr [[TMP12]], align 1, !alias.scope [[META8:![0-9]+]], !noalias [[META5]] -; DEFAULT-NEXT: store <16 x i8> [[TMP9]], ptr [[TMP13]], align 1, !alias.scope [[META8]], !noalias [[META5]] -; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 -; DEFAULT-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 -; DEFAULT-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; DEFAULT-NEXT: 
[[TMP16:%.*]] = call i64 @llvm.vscale.i64() +; DEFAULT-NEXT: [[TMP23:%.*]] = mul i64 [[TMP16]], 16 +; DEFAULT-NEXT: [[TMP24:%.*]] = getelementptr i8, ptr [[TMP10]], i64 [[TMP23]] +; DEFAULT-NEXT: store [[TMP22]], ptr [[TMP12]], align 1, !alias.scope [[META8:![0-9]+]], !noalias [[META5]] +; DEFAULT-NEXT: store [[TMP13]], ptr [[TMP24]], align 1, !alias.scope [[META8]], !noalias [[META5]] +; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP7]] +; DEFAULT-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC1]] +; DEFAULT-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; DEFAULT: middle.block: -; DEFAULT-NEXT: br i1 true, label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] +; DEFAULT-NEXT: [[CMP_N1:%.*]] = icmp eq i64 0, [[N_VEC1]] +; DEFAULT-NEXT: br i1 [[CMP_N1]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] ; DEFAULT: vec.epilog.iter.check: +; DEFAULT-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 0, [[N_VEC1]] ; DEFAULT-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() -; DEFAULT-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 2 -; DEFAULT-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 0, [[TMP16]] +; DEFAULT-NEXT: [[TMP31:%.*]] = mul i64 [[TMP15]], 8 +; DEFAULT-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], [[TMP31]] ; DEFAULT-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] ; DEFAULT: vec.epilog.ph: -; DEFAULT-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 0, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; DEFAULT-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC1]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] ; DEFAULT-NEXT: [[TMP17:%.*]] = call i64 @llvm.vscale.i64() -; DEFAULT-NEXT: [[TMP18:%.*]] = mul i64 [[TMP17]], 2 +; DEFAULT-NEXT: [[TMP18:%.*]] = mul i64 [[TMP17]], 8 ; DEFAULT-NEXT: [[N_MOD_VF:%.*]] = urem i64 0, [[TMP18]] ; DEFAULT-NEXT: [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]] ; DEFAULT-NEXT: [[TMP19:%.*]] = call i64 @llvm.vscale.i64() -; DEFAULT-NEXT: [[TMP20:%.*]] = mul i64 [[TMP19]], 2 -; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT6:%.*]] = insertelement poison, i16 [[X]], i64 0 -; DEFAULT-NEXT: [[BROADCAST_SPLAT7:%.*]] = shufflevector [[BROADCAST_SPLATINSERT6]], poison, zeroinitializer -; DEFAULT-NEXT: [[TMP24:%.*]] = trunc [[BROADCAST_SPLAT7]] to +; DEFAULT-NEXT: [[TMP20:%.*]] = mul i64 [[TMP19]], 8 +; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT6:%.*]] = insertelement poison, i16 [[X]], i64 0 +; DEFAULT-NEXT: [[BROADCAST_SPLAT7:%.*]] = shufflevector [[BROADCAST_SPLATINSERT6]], poison, zeroinitializer +; DEFAULT-NEXT: [[TMP32:%.*]] = trunc [[BROADCAST_SPLAT7]] to ; DEFAULT-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] ; DEFAULT: vec.epilog.vector.body: ; DEFAULT-NEXT: [[INDEX5:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT8:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] ; DEFAULT-NEXT: [[TMP21:%.*]] = add i64 [[INDEX5]], 0 -; DEFAULT-NEXT: [[TMP22:%.*]] = load i64, ptr [[SRC]], align 8, !alias.scope [[META11:![0-9]+]] -; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[TMP22]], i64 0 -; DEFAULT-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; DEFAULT-NEXT: [[TMP23:%.*]] = trunc [[BROADCAST_SPLAT]] to -; DEFAULT-NEXT: [[TMP25:%.*]] = and [[TMP23]], [[TMP24]] +; DEFAULT-NEXT: [[TMP33:%.*]] = load i64, ptr [[SRC]], align 8, !alias.scope [[META11:![0-9]+]] +; DEFAULT-NEXT: 
[[BROADCAST_SPLATINSERT9:%.*]] = insertelement poison, i64 [[TMP33]], i64 0 +; DEFAULT-NEXT: [[BROADCAST_SPLAT10:%.*]] = shufflevector [[BROADCAST_SPLATINSERT9]], poison, zeroinitializer +; DEFAULT-NEXT: [[TMP29:%.*]] = trunc [[BROADCAST_SPLAT10]] to +; DEFAULT-NEXT: [[TMP30:%.*]] = and [[TMP29]], [[TMP32]] ; DEFAULT-NEXT: [[TMP26:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP21]] ; DEFAULT-NEXT: [[TMP27:%.*]] = getelementptr i8, ptr [[TMP26]], i32 0 -; DEFAULT-NEXT: store [[TMP25]], ptr [[TMP27]], align 1, !alias.scope [[META14:![0-9]+]], !noalias [[META11]] +; DEFAULT-NEXT: store [[TMP30]], ptr [[TMP27]], align 1, !alias.scope [[META14:![0-9]+]], !noalias [[META11]] ; DEFAULT-NEXT: [[INDEX_NEXT8]] = add nuw i64 [[INDEX5]], [[TMP20]] ; DEFAULT-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT8]], [[N_VEC]] ; DEFAULT-NEXT: br i1 [[TMP28]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] @@ -215,7 +228,7 @@ define void @trunc_store(ptr %dst, ptr %src, i16 %x) #1 { ; DEFAULT-NEXT: [[CMP_N:%.*]] = icmp eq i64 0, [[N_VEC]] ; DEFAULT-NEXT: br i1 [[CMP_N]], label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]] ; DEFAULT: vec.epilog.scalar.ph: -; DEFAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] +; DEFAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC1]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] ; DEFAULT-NEXT: br label [[LOOP:%.*]] ; DEFAULT: loop: ; DEFAULT-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] @@ -234,7 +247,10 @@ define void @trunc_store(ptr %dst, ptr %src, i16 %x) #1 { ; PRED-LABEL: define void @trunc_store( ; PRED-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], i16 [[X:%.*]]) #[[ATTR1:[0-9]+]] { ; PRED-NEXT: entry: -; PRED-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] +; PRED-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; PRED-NEXT: [[TMP1:%.*]] = mul i64 [[TMP7]], 16 +; PRED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 0, [[TMP1]] +; PRED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; PRED: vector.memcheck: ; PRED-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[SRC]], i64 8 ; PRED-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP]] @@ -242,28 +258,35 @@ define void @trunc_store(ptr %dst, ptr %src, i16 %x) #1 { ; PRED-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] ; PRED-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; PRED: vector.ph: -; PRED-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <16 x i16> poison, i16 [[X]], i64 0 -; PRED-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <16 x i16> [[BROADCAST_SPLATINSERT1]], <16 x i16> poison, <16 x i32> zeroinitializer -; PRED-NEXT: [[TMP3:%.*]] = trunc <16 x i16> [[BROADCAST_SPLAT2]] to <16 x i8> +; PRED-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; PRED-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 16 +; PRED-NEXT: [[N_MOD_VF:%.*]] = urem i64 0, [[TMP3]] +; PRED-NEXT: [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]] +; PRED-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; PRED-NEXT: [[TMP11:%.*]] = mul i64 [[TMP4]], 16 +; PRED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i16 [[X]], i64 0 +; PRED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; 
PRED-NEXT: [[TMP12:%.*]] = trunc [[BROADCAST_SPLAT]] to ; PRED-NEXT: br label [[VECTOR_BODY:%.*]] ; PRED: vector.body: ; PRED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; PRED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; PRED-NEXT: [[TMP1:%.*]] = load i64, ptr [[SRC]], align 8, !alias.scope [[META4:![0-9]+]] -; PRED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i64> poison, i64 [[TMP1]], i64 0 -; PRED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i64> [[BROADCAST_SPLATINSERT]], <16 x i64> poison, <16 x i32> zeroinitializer -; PRED-NEXT: [[TMP2:%.*]] = trunc <16 x i64> [[BROADCAST_SPLAT]] to <16 x i8> -; PRED-NEXT: [[TMP4:%.*]] = and <16 x i8> [[TMP2]], [[TMP3]] +; PRED-NEXT: [[TMP8:%.*]] = load i64, ptr [[SRC]], align 8, !alias.scope [[META4:![0-9]+]] +; PRED-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i64 [[TMP8]], i64 0 +; PRED-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer +; PRED-NEXT: [[TMP9:%.*]] = trunc [[BROADCAST_SPLAT2]] to +; PRED-NEXT: [[TMP10:%.*]] = and [[TMP9]], [[TMP12]] ; PRED-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP0]] ; PRED-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[TMP5]], i32 0 -; PRED-NEXT: store <16 x i8> [[TMP4]], ptr [[TMP6]], align 1, !alias.scope [[META7:![0-9]+]], !noalias [[META4]] -; PRED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; PRED-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 -; PRED-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; PRED-NEXT: store [[TMP10]], ptr [[TMP6]], align 1, !alias.scope [[META7:![0-9]+]], !noalias [[META4]] +; PRED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP11]] +; PRED-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; PRED-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; PRED: middle.block: -; PRED-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] +; PRED-NEXT: [[CMP_N:%.*]] = icmp eq i64 0, [[N_VEC]] +; PRED-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; PRED: scalar.ph: -; PRED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] +; PRED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] ; PRED-NEXT: br label [[LOOP:%.*]] ; PRED: loop: ; PRED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll index c6fb1c25274d9f8d0e8eeb7f244e41383cb66f5b..2c8271cf978d10a9c914d152ac93ed97d3bb7521 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll @@ -119,7 +119,7 @@ define void @test_array_load2_i16_store2(i32 %C, i32 %D) #1 { ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP3]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[OFFSET_IDX:%.+]] = shl i64 [[INDEX]], 1 +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 1 ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1024 x i16], ptr @AB_i16, i64 0, [[VEC_IND]] ; CHECK-NEXT: 
[[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4i16.nxv4p0( [[TMP6]], i32 2, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison) ; CHECK-NEXT: [[TMP7:%.*]] = or disjoint [[VEC_IND]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) @@ -542,7 +542,7 @@ define void @even_load_dynamic_tc(ptr noalias nocapture readonly %A, ptr noalias ; CHECK-NEXT: [[TMP1:%.*]] = lshr i64 [[TMP0]], 1 ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP3:%.*]] = shl nuw nsw i64 [[TMP2]], 2 -; CHECK-NEXT: [[MIN_ITERS_CHECK_NOT_NOT:%.*]] = icmp ult i64 [[TMP1]], [[TMP3]] +; CHECK-NEXT: [[MIN_ITERS_CHECK_NOT_NOT:%.*]] = icmp samesign ult i64 [[TMP1]], [[TMP3]] ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK_NOT_NOT]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP4:%.*]] = add nuw i64 [[TMP1]], 1 @@ -872,7 +872,7 @@ define void @PR27626_0(ptr %p, i32 %z, i64 %n) #1 { ; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 1) ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2 -; CHECK-NEXT: [[MIN_ITERS_CHECK_NOT:%.*]] = icmp ugt i64 [[SMAX]], [[TMP1]] +; CHECK-NEXT: [[MIN_ITERS_CHECK_NOT:%.*]] = icmp samesign ugt i64 [[SMAX]], [[TMP1]] ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK_NOT]], label [[VECTOR_PH:%.*]], label [[SCALAR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() @@ -957,7 +957,7 @@ define i32 @PR27626_1(ptr %p, i64 %n) #1 { ; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 1) ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2 -; CHECK-NEXT: [[MIN_ITERS_CHECK_NOT:%.*]] = icmp ugt i64 [[SMAX]], [[TMP1]] +; CHECK-NEXT: [[MIN_ITERS_CHECK_NOT:%.*]] = icmp samesign ugt i64 [[SMAX]], [[TMP1]] ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK_NOT]], label [[VECTOR_PH:%.*]], label [[SCALAR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() @@ -1050,7 +1050,7 @@ define void @PR27626_2(ptr %p, i64 %n, i32 %z) #1 { ; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 1) ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2 -; CHECK-NEXT: [[MIN_ITERS_CHECK_NOT:%.*]] = icmp ugt i64 [[SMAX]], [[TMP1]] +; CHECK-NEXT: [[MIN_ITERS_CHECK_NOT:%.*]] = icmp samesign ugt i64 [[SMAX]], [[TMP1]] ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK_NOT]], label [[VECTOR_PH:%.*]], label [[SCALAR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() @@ -1138,7 +1138,7 @@ define i32 @PR27626_3(ptr %p, i64 %n, i32 %z) #1 { ; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 1) ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2 -; CHECK-NEXT: [[MIN_ITERS_CHECK_NOT:%.*]] = icmp ugt i64 [[SMAX]], [[TMP1]] +; CHECK-NEXT: [[MIN_ITERS_CHECK_NOT:%.*]] = icmp samesign ugt i64 [[SMAX]], [[TMP1]] ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK_NOT]], label [[VECTOR_PH:%.*]], label [[SCALAR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() @@ -1242,7 +1242,7 @@ define void @PR27626_4(ptr %a, i32 %x, i32 %y, i32 %z, i64 %n) #1 { ; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1 ; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 2 
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], [[TMP4]] +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp samesign ult i64 [[TMP2]], [[TMP4]] ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() @@ -1268,7 +1268,7 @@ define void @PR27626_4(ptr %a, i32 %x, i32 %y, i32 %z, i64 %n) #1 { ; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP9]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], [[VEC_IND]] ; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0( [[BROADCAST_SPLAT]], [[TMP13]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) -; CHECK-NEXT: [[P:%.+]] = extractelement [[TMP13]], i64 0 +; CHECK-NEXT: [[P:%.*]] = extractelement [[TMP13]], i64 0 ; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv8i32( [[BROADCAST_SPLAT2]], [[BROADCAST_SPLAT4]]) ; CHECK-NEXT: store [[INTERLEAVED_VEC]], ptr [[P]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP7]] @@ -1333,7 +1333,7 @@ define void @PR27626_5(ptr %a, i32 %x, i32 %y, i32 %z, i64 %n) #1 { ; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1 ; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 2 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], [[TMP4]] +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp samesign ult i64 [[TMP2]], [[TMP4]] ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll index baec7daa463d13ac4997d1948bab18aa4936e0ef..26ac9c3dead75acd9e6174f13fa4f97e4e8b526d 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll @@ -35,17 +35,17 @@ define dso_local void @masked_strided1(ptr noalias nocapture readonly %p, ptr no ; SCALAR_TAIL_FOLDING-NEXT: [[TMP3:%.*]] = shl i32 [[TMP2]], 4 ; SCALAR_TAIL_FOLDING-NEXT: [[N_MOD_VF:%.*]] = urem i32 1024, [[TMP3]] ; SCALAR_TAIL_FOLDING-NEXT: [[N_VEC:%.*]] = sub nuw nsw i32 1024, [[N_MOD_VF]] -; SCALAR_TAIL_FOLDING-NEXT: [[TMP19:%.*]] = call i32 @llvm.vscale.i32() -; SCALAR_TAIL_FOLDING-NEXT: [[TMP20:%.*]] = shl i32 [[TMP19]], 4 -; SCALAR_TAIL_FOLDING-NEXT: [[TMP4:%.*]] = call @llvm.stepvector.nxv16i32() -; SCALAR_TAIL_FOLDING-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i32 [[TMP20]], i64 0 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP4:%.*]] = call i32 @llvm.vscale.i32() +; SCALAR_TAIL_FOLDING-NEXT: [[TMP5:%.*]] = shl i32 [[TMP4]], 4 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP6:%.*]] = call @llvm.stepvector.nxv16i32() +; SCALAR_TAIL_FOLDING-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i32 [[TMP5]], i64 0 ; SCALAR_TAIL_FOLDING-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer ; SCALAR_TAIL_FOLDING-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[CONV]], i64 0 ; SCALAR_TAIL_FOLDING-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; SCALAR_TAIL_FOLDING-NEXT: br label [[VECTOR_BODY:%.*]] ; SCALAR_TAIL_FOLDING: vector.body: ; SCALAR_TAIL_FOLDING-NEXT: 
[[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; SCALAR_TAIL_FOLDING-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP4]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; SCALAR_TAIL_FOLDING-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP6]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; SCALAR_TAIL_FOLDING-NEXT: [[TMP7:%.*]] = icmp ugt [[VEC_IND]], [[BROADCAST_SPLAT]] ; SCALAR_TAIL_FOLDING-NEXT: [[TMP8:%.*]] = shl i32 [[INDEX]], 1 ; SCALAR_TAIL_FOLDING-NEXT: [[TMP9:%.*]] = sext i32 [[TMP8]] to i64 @@ -55,17 +55,17 @@ define dso_local void @masked_strided1(ptr noalias nocapture readonly %p, ptr no ; SCALAR_TAIL_FOLDING-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[WIDE_MASKED_VEC]]) ; SCALAR_TAIL_FOLDING-NEXT: [[TMP11:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 ; SCALAR_TAIL_FOLDING-NEXT: [[TMP12:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 -; SCALAR_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = call @llvm.smax.nxv16i8( [[TMP11]], [[TMP12]]) -; SCALAR_TAIL_FOLDING-NEXT: [[TMP16:%.*]] = sext i32 [[TMP8]] to i64 -; SCALAR_TAIL_FOLDING-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP16]] -; SCALAR_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = sub zeroinitializer, [[TMP14]] -; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv32i8( [[TMP14]], [[TMP15]]) +; SCALAR_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = call @llvm.smax.nxv16i8( [[TMP11]], [[TMP12]]) +; SCALAR_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = sext i32 [[TMP8]] to i64 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP14]] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP16:%.*]] = sub zeroinitializer, [[TMP13]] +; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv32i8( [[TMP13]], [[TMP16]]) ; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK1:%.*]] = call @llvm.vector.interleave2.nxv32i1( [[TMP7]], [[TMP7]]) -; SCALAR_TAIL_FOLDING-NEXT: call void @llvm.masked.store.nxv32i8.p0( [[INTERLEAVED_VEC]], ptr [[TMP17]], i32 1, [[INTERLEAVED_MASK1]]) -; SCALAR_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP20]] +; SCALAR_TAIL_FOLDING-NEXT: call void @llvm.masked.store.nxv32i8.p0( [[INTERLEAVED_VEC]], ptr [[TMP15]], i32 1, [[INTERLEAVED_MASK1]]) +; SCALAR_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP5]] ; SCALAR_TAIL_FOLDING-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] -; SCALAR_TAIL_FOLDING-NEXT: [[TMP21:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; SCALAR_TAIL_FOLDING-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP17:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; SCALAR_TAIL_FOLDING-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; SCALAR_TAIL_FOLDING: middle.block: ; SCALAR_TAIL_FOLDING-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N_MOD_VF]], 0 ; SCALAR_TAIL_FOLDING-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -74,24 +74,24 @@ define dso_local void @masked_strided1(ptr noalias nocapture readonly %p, ptr no ; SCALAR_TAIL_FOLDING-NEXT: br label [[FOR_BODY:%.*]] ; SCALAR_TAIL_FOLDING: for.body: ; SCALAR_TAIL_FOLDING-NEXT: [[IX_024:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_INC:%.*]] ] -; SCALAR_TAIL_FOLDING-NEXT: [[CMP1:%.*]] = icmp ugt i32 [[IX_024]], [[CONV]] +; SCALAR_TAIL_FOLDING-NEXT: [[CMP1:%.*]] = icmp samesign ugt i32 [[IX_024]], [[CONV]] ; SCALAR_TAIL_FOLDING-NEXT: 
br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]] ; SCALAR_TAIL_FOLDING: if.then: ; SCALAR_TAIL_FOLDING-NEXT: [[MUL:%.*]] = shl nuw nsw i32 [[IX_024]], 1 -; SCALAR_TAIL_FOLDING-NEXT: [[TMP22:%.*]] = zext nneg i32 [[MUL]] to i64 -; SCALAR_TAIL_FOLDING-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP22]] -; SCALAR_TAIL_FOLDING-NEXT: [[TMP23:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP18:%.*]] = zext nneg i32 [[MUL]] to i64 +; SCALAR_TAIL_FOLDING-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP18]] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP19:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 ; SCALAR_TAIL_FOLDING-NEXT: [[ADD:%.*]] = or disjoint i32 [[MUL]], 1 -; SCALAR_TAIL_FOLDING-NEXT: [[TMP24:%.*]] = zext nneg i32 [[ADD]] to i64 -; SCALAR_TAIL_FOLDING-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP24]] -; SCALAR_TAIL_FOLDING-NEXT: [[TMP25:%.*]] = load i8, ptr [[ARRAYIDX4]], align 1 -; SCALAR_TAIL_FOLDING-NEXT: [[SPEC_SELECT_I:%.*]] = call i8 @llvm.smax.i8(i8 [[TMP23]], i8 [[TMP25]]) -; SCALAR_TAIL_FOLDING-NEXT: [[TMP26:%.*]] = zext nneg i32 [[MUL]] to i64 -; SCALAR_TAIL_FOLDING-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i8, ptr [[Q]], i64 [[TMP26]] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP20:%.*]] = zext nneg i32 [[ADD]] to i64 +; SCALAR_TAIL_FOLDING-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP20]] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP21:%.*]] = load i8, ptr [[ARRAYIDX4]], align 1 +; SCALAR_TAIL_FOLDING-NEXT: [[SPEC_SELECT_I:%.*]] = call i8 @llvm.smax.i8(i8 [[TMP19]], i8 [[TMP21]]) +; SCALAR_TAIL_FOLDING-NEXT: [[TMP22:%.*]] = zext nneg i32 [[MUL]] to i64 +; SCALAR_TAIL_FOLDING-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i8, ptr [[Q]], i64 [[TMP22]] ; SCALAR_TAIL_FOLDING-NEXT: store i8 [[SPEC_SELECT_I]], ptr [[ARRAYIDX6]], align 1 ; SCALAR_TAIL_FOLDING-NEXT: [[SUB:%.*]] = sub i8 0, [[SPEC_SELECT_I]] -; SCALAR_TAIL_FOLDING-NEXT: [[TMP27:%.*]] = zext nneg i32 [[ADD]] to i64 -; SCALAR_TAIL_FOLDING-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds i8, ptr [[Q]], i64 [[TMP27]] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP23:%.*]] = zext nneg i32 [[ADD]] to i64 +; SCALAR_TAIL_FOLDING-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds i8, ptr [[Q]], i64 [[TMP23]] ; SCALAR_TAIL_FOLDING-NEXT: store i8 [[SUB]], ptr [[ARRAYIDX11]], align 1 ; SCALAR_TAIL_FOLDING-NEXT: br label [[FOR_INC]] ; SCALAR_TAIL_FOLDING: for.inc: @@ -107,14 +107,14 @@ define dso_local void @masked_strided1(ptr noalias nocapture readonly %p, ptr no ; PREDICATED_TAIL_FOLDING-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; PREDICATED_TAIL_FOLDING: vector.ph: ; PREDICATED_TAIL_FOLDING-NEXT: [[CONV:%.*]] = zext i8 [[GUARD]] to i32 -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP19:%.*]] = call i32 @llvm.vscale.i32() -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP20:%.*]] = shl i32 [[TMP19]], 4 ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32() ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP1:%.*]] = shl i32 [[TMP0]], 4 -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP2:%.*]] = call i32 @llvm.usub.sat.i32(i32 1024, i32 [[TMP1]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP2:%.*]] = call i32 @llvm.vscale.i32() +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP3:%.*]] = shl i32 [[TMP2]], 4 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP4:%.*]] = call i32 @llvm.usub.sat.i32(i32 1024, i32 [[TMP3]]) ; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv16i1.i32(i32 0, i32 1024) -; 
PREDICATED_TAIL_FOLDING-NEXT: [[TMP3:%.*]] = call @llvm.stepvector.nxv16i32() -; PREDICATED_TAIL_FOLDING-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i32 [[TMP20]], i64 0 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP5:%.*]] = call @llvm.stepvector.nxv16i32() +; PREDICATED_TAIL_FOLDING-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i32 [[TMP1]], i64 0 ; PREDICATED_TAIL_FOLDING-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer ; PREDICATED_TAIL_FOLDING-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[CONV]], i64 0 ; PREDICATED_TAIL_FOLDING-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer @@ -122,29 +122,29 @@ define dso_local void @masked_strided1(ptr noalias nocapture readonly %p, ptr no ; PREDICATED_TAIL_FOLDING: vector.body: ; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] -; PREDICATED_TAIL_FOLDING-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP3]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; PREDICATED_TAIL_FOLDING-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP5]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP6:%.*]] = icmp ugt [[VEC_IND]], [[BROADCAST_SPLAT]] -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP10:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP6]], zeroinitializer -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP7:%.*]] = shl i32 [[INDEX]], 1 -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP8:%.*]] = sext i32 [[TMP7]] to i64 -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP8]] -; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK:%.*]] = call @llvm.vector.interleave2.nxv32i1( [[TMP10]], [[TMP10]]) -; PREDICATED_TAIL_FOLDING-NEXT: [[WIDE_MASKED_VEC:%.*]] = call @llvm.masked.load.nxv32i8.p0(ptr [[TMP9]], i32 1, [[INTERLEAVED_MASK]], poison) +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP7:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP6]], zeroinitializer +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP8:%.*]] = shl i32 [[INDEX]], 1 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP9:%.*]] = sext i32 [[TMP8]] to i64 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP9]] +; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK:%.*]] = call @llvm.vector.interleave2.nxv32i1( [[TMP7]], [[TMP7]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[WIDE_MASKED_VEC:%.*]] = call @llvm.masked.load.nxv32i8.p0(ptr [[TMP10]], i32 1, [[INTERLEAVED_MASK]], poison) ; PREDICATED_TAIL_FOLDING-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[WIDE_MASKED_VEC]]) ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP11:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP12:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = call @llvm.smax.nxv16i8( [[TMP11]], [[TMP12]]) -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP16:%.*]] = sext i32 [[TMP7]] to i64 -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP16]] -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = sub zeroinitializer, [[TMP14]] -; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv32i8( [[TMP14]], [[TMP15]]) -; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK1:%.*]] = call @llvm.vector.interleave2.nxv32i1( [[TMP10]], [[TMP10]]) -; 
PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.store.nxv32i8.p0( [[INTERLEAVED_VEC]], ptr [[TMP17]], i32 1, [[INTERLEAVED_MASK1]]) -; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP20]] -; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv16i1.i32(i32 [[INDEX]], i32 [[TMP2]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = call @llvm.smax.nxv16i8( [[TMP11]], [[TMP12]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = sext i32 [[TMP8]] to i64 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP14]] +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP16:%.*]] = sub zeroinitializer, [[TMP13]] +; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv32i8( [[TMP13]], [[TMP16]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK1:%.*]] = call @llvm.vector.interleave2.nxv32i1( [[TMP7]], [[TMP7]]) +; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.store.nxv32i8.p0( [[INTERLEAVED_VEC]], ptr [[TMP15]], i32 1, [[INTERLEAVED_MASK1]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP1]] +; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv16i1.i32(i32 [[INDEX]], i32 [[TMP4]]) ; PREDICATED_TAIL_FOLDING-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP21:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i64 0 -; PREDICATED_TAIL_FOLDING-NEXT: br i1 [[TMP21]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP0:![0-9]+]] +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP17:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i64 0 +; PREDICATED_TAIL_FOLDING-NEXT: br i1 [[TMP17]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP0:![0-9]+]] ; PREDICATED_TAIL_FOLDING: middle.block: ; PREDICATED_TAIL_FOLDING-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; PREDICATED_TAIL_FOLDING: scalar.ph: @@ -219,17 +219,17 @@ define dso_local void @masked_strided2(ptr noalias nocapture readnone %p, ptr no ; SCALAR_TAIL_FOLDING-NEXT: [[TMP3:%.*]] = shl i32 [[TMP2]], 4 ; SCALAR_TAIL_FOLDING-NEXT: [[N_MOD_VF:%.*]] = urem i32 1024, [[TMP3]] ; SCALAR_TAIL_FOLDING-NEXT: [[N_VEC:%.*]] = sub nuw nsw i32 1024, [[N_MOD_VF]] -; SCALAR_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = call i32 @llvm.vscale.i32() -; SCALAR_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = shl i32 [[TMP14]], 4 -; SCALAR_TAIL_FOLDING-NEXT: [[TMP4:%.*]] = call @llvm.stepvector.nxv16i32() -; SCALAR_TAIL_FOLDING-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i32 [[TMP15]], i64 0 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP4:%.*]] = call i32 @llvm.vscale.i32() +; SCALAR_TAIL_FOLDING-NEXT: [[TMP5:%.*]] = shl i32 [[TMP4]], 4 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP6:%.*]] = call @llvm.stepvector.nxv16i32() +; SCALAR_TAIL_FOLDING-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i32 [[TMP5]], i64 0 ; SCALAR_TAIL_FOLDING-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer ; SCALAR_TAIL_FOLDING-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[CONV]], i64 0 ; SCALAR_TAIL_FOLDING-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; SCALAR_TAIL_FOLDING-NEXT: br label [[VECTOR_BODY:%.*]] ; SCALAR_TAIL_FOLDING: vector.body: ; SCALAR_TAIL_FOLDING-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; SCALAR_TAIL_FOLDING-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP4]], [[VECTOR_PH]] ], [ 
[[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; SCALAR_TAIL_FOLDING-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP6]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; SCALAR_TAIL_FOLDING-NEXT: [[TMP7:%.*]] = shl nuw nsw [[VEC_IND]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) ; SCALAR_TAIL_FOLDING-NEXT: [[TMP8:%.*]] = zext nneg [[TMP7]] to ; SCALAR_TAIL_FOLDING-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[Q]], [[TMP8]] @@ -239,10 +239,10 @@ define dso_local void @masked_strided2(ptr noalias nocapture readnone %p, ptr no ; SCALAR_TAIL_FOLDING-NEXT: [[TMP12:%.*]] = zext nneg [[TMP11]] to ; SCALAR_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[Q]], [[TMP12]] ; SCALAR_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0( shufflevector ( insertelement ( poison, i8 2, i64 0), poison, zeroinitializer), [[TMP13]], i32 1, [[TMP10]]) -; SCALAR_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP15]] +; SCALAR_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP5]] ; SCALAR_TAIL_FOLDING-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] -; SCALAR_TAIL_FOLDING-NEXT: [[TMP16:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; SCALAR_TAIL_FOLDING-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; SCALAR_TAIL_FOLDING-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; SCALAR_TAIL_FOLDING: middle.block: ; SCALAR_TAIL_FOLDING-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N_MOD_VF]], 0 ; SCALAR_TAIL_FOLDING-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -252,15 +252,15 @@ define dso_local void @masked_strided2(ptr noalias nocapture readnone %p, ptr no ; SCALAR_TAIL_FOLDING: for.body: ; SCALAR_TAIL_FOLDING-NEXT: [[IX_012:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_INC:%.*]] ] ; SCALAR_TAIL_FOLDING-NEXT: [[MUL:%.*]] = shl nuw nsw i32 [[IX_012]], 1 -; SCALAR_TAIL_FOLDING-NEXT: [[TMP17:%.*]] = zext nneg i32 [[MUL]] to i64 -; SCALAR_TAIL_FOLDING-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[Q]], i64 [[TMP17]] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = zext nneg i32 [[MUL]] to i64 +; SCALAR_TAIL_FOLDING-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[Q]], i64 [[TMP15]] ; SCALAR_TAIL_FOLDING-NEXT: store i8 1, ptr [[ARRAYIDX]], align 1 -; SCALAR_TAIL_FOLDING-NEXT: [[CMP1:%.*]] = icmp ugt i32 [[IX_012]], [[CONV]] +; SCALAR_TAIL_FOLDING-NEXT: [[CMP1:%.*]] = icmp samesign ugt i32 [[IX_012]], [[CONV]] ; SCALAR_TAIL_FOLDING-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]] ; SCALAR_TAIL_FOLDING: if.then: ; SCALAR_TAIL_FOLDING-NEXT: [[ADD:%.*]] = or disjoint i32 [[MUL]], 1 -; SCALAR_TAIL_FOLDING-NEXT: [[TMP18:%.*]] = zext nneg i32 [[ADD]] to i64 -; SCALAR_TAIL_FOLDING-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i8, ptr [[Q]], i64 [[TMP18]] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP16:%.*]] = zext nneg i32 [[ADD]] to i64 +; SCALAR_TAIL_FOLDING-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i8, ptr [[Q]], i64 [[TMP16]] ; SCALAR_TAIL_FOLDING-NEXT: store i8 2, ptr [[ARRAYIDX3]], align 1 ; SCALAR_TAIL_FOLDING-NEXT: br label [[FOR_INC]] ; SCALAR_TAIL_FOLDING: for.inc: @@ -276,14 +276,14 @@ define dso_local void @masked_strided2(ptr noalias nocapture readnone %p, ptr no ; PREDICATED_TAIL_FOLDING-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; 
PREDICATED_TAIL_FOLDING: vector.ph: ; PREDICATED_TAIL_FOLDING-NEXT: [[CONV:%.*]] = zext i8 [[GUARD]] to i32 -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = call i32 @llvm.vscale.i32() -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = shl i32 [[TMP14]], 4 ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32() ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP1:%.*]] = shl i32 [[TMP0]], 4 -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP2:%.*]] = call i32 @llvm.usub.sat.i32(i32 1024, i32 [[TMP1]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP2:%.*]] = call i32 @llvm.vscale.i32() +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP3:%.*]] = shl i32 [[TMP2]], 4 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP4:%.*]] = call i32 @llvm.usub.sat.i32(i32 1024, i32 [[TMP3]]) ; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv16i1.i32(i32 0, i32 1024) -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP3:%.*]] = call @llvm.stepvector.nxv16i32() -; PREDICATED_TAIL_FOLDING-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i32 [[TMP15]], i64 0 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP5:%.*]] = call @llvm.stepvector.nxv16i32() +; PREDICATED_TAIL_FOLDING-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i32 [[TMP1]], i64 0 ; PREDICATED_TAIL_FOLDING-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer ; PREDICATED_TAIL_FOLDING-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[CONV]], i64 0 ; PREDICATED_TAIL_FOLDING-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer @@ -291,22 +291,22 @@ define dso_local void @masked_strided2(ptr noalias nocapture readnone %p, ptr no ; PREDICATED_TAIL_FOLDING: vector.body: ; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] -; PREDICATED_TAIL_FOLDING-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP3]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; PREDICATED_TAIL_FOLDING-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP5]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP6:%.*]] = shl nuw nsw [[VEC_IND]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP7:%.*]] = zext nneg [[TMP6]] to ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[Q]], [[TMP7]] ; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0( shufflevector ( insertelement ( poison, i8 1, i64 0), poison, zeroinitializer), [[TMP8]], i32 1, [[ACTIVE_LANE_MASK]]) ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP9:%.*]] = icmp ugt [[VEC_IND]], [[BROADCAST_SPLAT]] -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP9]], zeroinitializer -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP10:%.*]] = or disjoint [[TMP6]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP11:%.*]] = zext nneg [[TMP10]] to -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[Q]], [[TMP11]] -; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0( shufflevector ( insertelement ( poison, i8 2, i64 0), poison, zeroinitializer), [[TMP12]], i32 1, [[TMP13]]) -; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP15]] -; 
PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv16i1.i32(i32 [[INDEX]], i32 [[TMP2]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP10:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP9]], zeroinitializer +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP11:%.*]] = or disjoint [[TMP6]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP12:%.*]] = zext nneg [[TMP11]] to +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[Q]], [[TMP12]] +; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0( shufflevector ( insertelement ( poison, i8 2, i64 0), poison, zeroinitializer), [[TMP13]], i32 1, [[TMP10]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP1]] +; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv16i1.i32(i32 [[INDEX]], i32 [[TMP4]]) ; PREDICATED_TAIL_FOLDING-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP16:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i64 0 -; PREDICATED_TAIL_FOLDING-NEXT: br i1 [[TMP16]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP4:![0-9]+]] +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i64 0 +; PREDICATED_TAIL_FOLDING-NEXT: br i1 [[TMP14]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP4:![0-9]+]] ; PREDICATED_TAIL_FOLDING: middle.block: ; PREDICATED_TAIL_FOLDING-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; PREDICATED_TAIL_FOLDING: scalar.ph: @@ -377,10 +377,10 @@ define dso_local void @masked_strided3(ptr noalias nocapture readnone %p, ptr no ; SCALAR_TAIL_FOLDING-NEXT: [[TMP3:%.*]] = shl i32 [[TMP2]], 4 ; SCALAR_TAIL_FOLDING-NEXT: [[N_MOD_VF:%.*]] = urem i32 1024, [[TMP3]] ; SCALAR_TAIL_FOLDING-NEXT: [[N_VEC:%.*]] = sub nuw nsw i32 1024, [[N_MOD_VF]] -; SCALAR_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = call i32 @llvm.vscale.i32() -; SCALAR_TAIL_FOLDING-NEXT: [[TMP16:%.*]] = shl i32 [[TMP15]], 4 -; SCALAR_TAIL_FOLDING-NEXT: [[TMP4:%.*]] = call @llvm.stepvector.nxv16i32() -; SCALAR_TAIL_FOLDING-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i32 [[TMP16]], i64 0 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP4:%.*]] = call i32 @llvm.vscale.i32() +; SCALAR_TAIL_FOLDING-NEXT: [[TMP5:%.*]] = shl i32 [[TMP4]], 4 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP6:%.*]] = call @llvm.stepvector.nxv16i32() +; SCALAR_TAIL_FOLDING-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i32 [[TMP5]], i64 0 ; SCALAR_TAIL_FOLDING-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer ; SCALAR_TAIL_FOLDING-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[CONV]], i64 0 ; SCALAR_TAIL_FOLDING-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer @@ -389,7 +389,7 @@ define dso_local void @masked_strided3(ptr noalias nocapture readnone %p, ptr no ; SCALAR_TAIL_FOLDING-NEXT: br label [[VECTOR_BODY:%.*]] ; SCALAR_TAIL_FOLDING: vector.body: ; SCALAR_TAIL_FOLDING-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; SCALAR_TAIL_FOLDING-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP4]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; SCALAR_TAIL_FOLDING-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP6]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; SCALAR_TAIL_FOLDING-NEXT: [[TMP7:%.*]] = shl nuw nsw [[VEC_IND]], 
shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) ; SCALAR_TAIL_FOLDING-NEXT: [[TMP8:%.*]] = icmp ugt [[VEC_IND]], [[BROADCAST_SPLAT]] ; SCALAR_TAIL_FOLDING-NEXT: [[TMP9:%.*]] = zext nneg [[TMP7]] to @@ -400,10 +400,10 @@ define dso_local void @masked_strided3(ptr noalias nocapture readnone %p, ptr no ; SCALAR_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = zext nneg [[TMP12]] to ; SCALAR_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[Q]], [[TMP13]] ; SCALAR_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0( shufflevector ( insertelement ( poison, i8 2, i64 0), poison, zeroinitializer), [[TMP14]], i32 1, [[TMP11]]) -; SCALAR_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP16]] +; SCALAR_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP5]] ; SCALAR_TAIL_FOLDING-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] -; SCALAR_TAIL_FOLDING-NEXT: [[TMP17:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; SCALAR_TAIL_FOLDING-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; SCALAR_TAIL_FOLDING-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; SCALAR_TAIL_FOLDING: middle.block: ; SCALAR_TAIL_FOLDING-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N_MOD_VF]], 0 ; SCALAR_TAIL_FOLDING-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -413,20 +413,20 @@ define dso_local void @masked_strided3(ptr noalias nocapture readnone %p, ptr no ; SCALAR_TAIL_FOLDING: for.body: ; SCALAR_TAIL_FOLDING-NEXT: [[IX_018:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_INC:%.*]] ] ; SCALAR_TAIL_FOLDING-NEXT: [[MUL:%.*]] = shl nuw nsw i32 [[IX_018]], 1 -; SCALAR_TAIL_FOLDING-NEXT: [[CMP1:%.*]] = icmp ugt i32 [[IX_018]], [[CONV]] +; SCALAR_TAIL_FOLDING-NEXT: [[CMP1:%.*]] = icmp samesign ugt i32 [[IX_018]], [[CONV]] ; SCALAR_TAIL_FOLDING-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[IF_END:%.*]] ; SCALAR_TAIL_FOLDING: if.then: -; SCALAR_TAIL_FOLDING-NEXT: [[TMP18:%.*]] = zext nneg i32 [[MUL]] to i64 -; SCALAR_TAIL_FOLDING-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[Q]], i64 [[TMP18]] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP16:%.*]] = zext nneg i32 [[MUL]] to i64 +; SCALAR_TAIL_FOLDING-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[Q]], i64 [[TMP16]] ; SCALAR_TAIL_FOLDING-NEXT: store i8 1, ptr [[ARRAYIDX]], align 1 ; SCALAR_TAIL_FOLDING-NEXT: br label [[IF_END]] ; SCALAR_TAIL_FOLDING: if.end: -; SCALAR_TAIL_FOLDING-NEXT: [[CMP4:%.*]] = icmp ugt i32 [[IX_018]], [[CONV3]] +; SCALAR_TAIL_FOLDING-NEXT: [[CMP4:%.*]] = icmp samesign ugt i32 [[IX_018]], [[CONV3]] ; SCALAR_TAIL_FOLDING-NEXT: br i1 [[CMP4]], label [[IF_THEN6:%.*]], label [[FOR_INC]] ; SCALAR_TAIL_FOLDING: if.then6: ; SCALAR_TAIL_FOLDING-NEXT: [[ADD:%.*]] = or disjoint i32 [[MUL]], 1 -; SCALAR_TAIL_FOLDING-NEXT: [[TMP19:%.*]] = zext nneg i32 [[ADD]] to i64 -; SCALAR_TAIL_FOLDING-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i8, ptr [[Q]], i64 [[TMP19]] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP17:%.*]] = zext nneg i32 [[ADD]] to i64 +; SCALAR_TAIL_FOLDING-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i8, ptr [[Q]], i64 [[TMP17]] ; SCALAR_TAIL_FOLDING-NEXT: store i8 2, ptr [[ARRAYIDX7]], align 1 ; SCALAR_TAIL_FOLDING-NEXT: br label [[FOR_INC]] ; SCALAR_TAIL_FOLDING: for.inc: @@ -443,14 +443,14 @@ define dso_local void @masked_strided3(ptr 
noalias nocapture readnone %p, ptr no ; PREDICATED_TAIL_FOLDING: vector.ph: ; PREDICATED_TAIL_FOLDING-NEXT: [[CONV3:%.*]] = zext i8 [[GUARD2]] to i32 ; PREDICATED_TAIL_FOLDING-NEXT: [[CONV:%.*]] = zext i8 [[GUARD1]] to i32 -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP16:%.*]] = call i32 @llvm.vscale.i32() -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP17:%.*]] = shl i32 [[TMP16]], 4 ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32() ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP1:%.*]] = shl i32 [[TMP0]], 4 -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP2:%.*]] = call i32 @llvm.usub.sat.i32(i32 1024, i32 [[TMP1]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP2:%.*]] = call i32 @llvm.vscale.i32() +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP3:%.*]] = shl i32 [[TMP2]], 4 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP4:%.*]] = call i32 @llvm.usub.sat.i32(i32 1024, i32 [[TMP3]]) ; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv16i1.i32(i32 0, i32 1024) -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP3:%.*]] = call @llvm.stepvector.nxv16i32() -; PREDICATED_TAIL_FOLDING-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i32 [[TMP17]], i64 0 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP5:%.*]] = call @llvm.stepvector.nxv16i32() +; PREDICATED_TAIL_FOLDING-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i32 [[TMP1]], i64 0 ; PREDICATED_TAIL_FOLDING-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer ; PREDICATED_TAIL_FOLDING-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[CONV]], i64 0 ; PREDICATED_TAIL_FOLDING-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer @@ -460,24 +460,24 @@ define dso_local void @masked_strided3(ptr noalias nocapture readnone %p, ptr no ; PREDICATED_TAIL_FOLDING: vector.body: ; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] -; PREDICATED_TAIL_FOLDING-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP3]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; PREDICATED_TAIL_FOLDING-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP5]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP6:%.*]] = shl nuw nsw [[VEC_IND]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP7:%.*]] = icmp ugt [[VEC_IND]], [[BROADCAST_SPLAT]] -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP10:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP7]], zeroinitializer -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP8:%.*]] = zext nneg [[TMP6]] to -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[Q]], [[TMP8]] -; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0( shufflevector ( insertelement ( poison, i8 1, i64 0), poison, zeroinitializer), [[TMP9]], i32 1, [[TMP10]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP8:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP7]], zeroinitializer +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP9:%.*]] = zext nneg [[TMP6]] to +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[Q]], [[TMP9]] +; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0( shufflevector ( insertelement ( poison, i8 1, i64 0), poison, zeroinitializer), [[TMP10]], i32 1, [[TMP8]]) ; PREDICATED_TAIL_FOLDING-NEXT: 
[[TMP11:%.*]] = icmp ugt [[VEC_IND]], [[BROADCAST_SPLAT2]] -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP11]], zeroinitializer -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP12:%.*]] = or disjoint [[TMP6]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = zext nneg [[TMP12]] to -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[Q]], [[TMP13]] -; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0( shufflevector ( insertelement ( poison, i8 2, i64 0), poison, zeroinitializer), [[TMP14]], i32 1, [[TMP15]]) -; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP17]] -; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv16i1.i32(i32 [[INDEX]], i32 [[TMP2]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP12:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP11]], zeroinitializer +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = or disjoint [[TMP6]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = zext nneg [[TMP13]] to +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[Q]], [[TMP14]] +; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0( shufflevector ( insertelement ( poison, i8 2, i64 0), poison, zeroinitializer), [[TMP15]], i32 1, [[TMP12]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP1]] +; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv16i1.i32(i32 [[INDEX]], i32 [[TMP4]]) ; PREDICATED_TAIL_FOLDING-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP18:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i64 0 -; PREDICATED_TAIL_FOLDING-NEXT: br i1 [[TMP18]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP6:![0-9]+]] +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP16:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i64 0 +; PREDICATED_TAIL_FOLDING-NEXT: br i1 [[TMP16]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP6:![0-9]+]] ; PREDICATED_TAIL_FOLDING: middle.block: ; PREDICATED_TAIL_FOLDING-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; PREDICATED_TAIL_FOLDING: scalar.ph: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll index 6a7263d6498535fdac6f4ea486b51d324eb718d1..0b3f28e8db5c4da1da2b0e2d24bfbb11f79470f5 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll @@ -19,19 +19,19 @@ target triple = "aarch64-unknown-linux-gnu" ; VPLANS-EMPTY: ; VPLANS-NEXT: vector.ph: ; VPLANS-NEXT: EMIT vp<[[NEWTC:%[0-9]+]]> = TC > VF ? 
TC - VF : 0 vp<[[TC]]> -; VPLANS-NEXT: EMIT vp<[[VF:%[0-9]+]]> = VF * Part + ir<0> -; VPLANS-NEXT: EMIT vp<[[LANEMASK_ENTRY:%[0-9]+]]> = active lane mask vp<[[VF]]>, vp<[[TC]]> +; VPLANS-NEXT: EMIT vp<[[VF:%.+]]> = VF * Part + ir<0> +; VPLANS-NEXT: EMIT vp<[[LANEMASK_ENTRY:%.+]]> = active lane mask vp<[[VF]]>, vp<[[TC]]> ; VPLANS-NEXT: Successor(s): vector loop ; VPLANS-EMPTY: ; VPLANS-NEXT: vector loop: { ; VPLANS-NEXT: vector.body: ; VPLANS-NEXT: EMIT vp<[[INDV:%[0-9]+]]> = CANONICAL-INDUCTION -; VPLANS-NEXT: ACTIVE-LANE-MASK-PHI vp<[[LANEMASK_PHI:%[0-9]+]]> = phi vp<[[LANEMASK_ENTRY]]>, vp<[[LANEMASK_LOOP:%[0-9]+]]> +; VPLANS-NEXT: ACTIVE-LANE-MASK-PHI vp<[[LANEMASK_PHI:%[0-9]+]]> = phi vp<[[LANEMASK_ENTRY]]>, vp<[[LANEMASK_LOOP:%.+]]> ; VPLANS-NEXT: vp<[[STEP:%[0-9]+]]> = SCALAR-STEPS vp<[[INDV]]>, ir<1> ; VPLANS-NEXT: CLONE ir<%gep> = getelementptr ir<%ptr>, vp<[[STEP]]> ; VPLANS-NEXT: vp<[[VEC_PTR:%[0-9]+]]> = vector-pointer ir<%gep> ; VPLANS-NEXT: WIDEN store vp<[[VEC_PTR]]>, ir<%val>, vp<[[LANEMASK_PHI]]> -; VPLANS-NEXT: EMIT vp<[[INDV_UPDATE:%[0-9]+]]> = add vp<[[INDV]]>, vp<[[VFxUF]]> +; VPLANS-NEXT: EMIT vp<[[INDV_UPDATE:%.+]]> = add vp<[[INDV]]>, vp<[[VFxUF]]> ; VPLANS-NEXT: EMIT vp<[[INC:%[0-9]+]]> = VF * Part + vp<[[INDV]]> ; VPLANS-NEXT: EMIT vp<[[LANEMASK_LOOP]]> = active lane mask vp<[[INC]]>, vp<[[NEWTC]]> ; VPLANS-NEXT: EMIT vp<[[NOT:%[0-9]+]]> = not vp<[[LANEMASK_LOOP]]> diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll index 15819070f1e137db5511c602cfa9bdc9cdb51a93..fd35fdee16136c1e9de560a21ce70700565c5b73 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll @@ -223,7 +223,7 @@ define i32 @pointer_iv_mixed(ptr noalias %a, ptr noalias %b, i64 %n) #0 { ; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 1) ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 1 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX]], [[TMP1]] +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp samesign ult i64 [[SMAX]], [[TMP1]] ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt.ll index 0c41477f285d0a9d0ad4d16bed65e022fe30e683..04ac89518502aa55205a6e5a8e88bf2a43b818e8 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt.ll @@ -313,36 +313,68 @@ for.exit: define void @histogram_8bit(ptr noalias %buckets, ptr readonly %indices, i64 %N) #0 { ; CHECK-LABEL: define void @histogram_8bit( ; CHECK-SAME: ptr noalias [[BUCKETS:%.*]], ptr readonly [[INDICES:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: entry: +; CHECK-NEXT: iter.check: ; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP9:%.*]] = shl nuw nsw i64 [[TMP5]], 2 +; CHECK-NEXT: [[TMP9:%.*]] = shl nuw nsw i64 [[TMP5]], 3 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP9]] ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] +; CHECK: vector.main.loop.iter.check: +; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP7:%.*]] = shl nuw nsw i64 [[TMP6]], 4 +; CHECK-NEXT: 
[[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[N]], [[TMP7]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[DOTNEG:%.*]] = mul nsw i64 [[TMP2]], -4 +; CHECK-NEXT: [[DOTNEG:%.*]] = mul nsw i64 [[TMP2]], -16 ; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[N]], [[DOTNEG]] ; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 2 +; CHECK-NEXT: [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 4 ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[INDICES]], i64 [[IV]] -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[TMP6:%.*]] = zext [[WIDE_LOAD]] to -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[BUCKETS]], [[TMP6]] -; CHECK-NEXT: call void @llvm.experimental.vector.histogram.add.nxv4p0.i8( [[TMP7]], i8 1, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[TMP10:%.*]] = zext [[WIDE_LOAD]] to +; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[BUCKETS]], [[TMP10]] +; CHECK-NEXT: call void @llvm.experimental.vector.histogram.add.nxv16p0.i8( [[TMP20]], i8 1, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) ; CHECK-NEXT: [[IV_NEXT]] = add nuw i64 [[IV]], [[TMP4]] ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[IV_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] -; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] -; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] +; CHECK: vec.epilog.iter.check: +; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[N]], [[N_VEC]] +; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP12:%.*]] = shl nuw nsw i64 [[TMP11]], 3 +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], [[TMP12]] +; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[SCALAR_PH]], label [[VEC_EPILOG_PH]] +; CHECK: vec.epilog.ph: +; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[DOTNEG8:%.*]] = mul nsw i64 [[TMP13]], -8 +; CHECK-NEXT: [[N_VEC3:%.*]] = and i64 [[N]], [[DOTNEG8]] +; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP15:%.*]] = shl nuw nsw i64 [[TMP14]], 3 ; CHECK-NEXT: br label [[FOR_BODY1:%.*]] +; CHECK: vec.epilog.vector.body: +; CHECK-NEXT: [[INDEX4:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT6:%.*]], [[FOR_BODY1]] ] +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[INDICES]], i64 [[INDEX4]] +; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load , ptr [[TMP16]], align 4 +; CHECK-NEXT: [[TMP17:%.*]] = zext [[WIDE_LOAD5]] to +; CHECK-NEXT: [[TMP18:%.*]] = 
getelementptr inbounds i8, ptr [[BUCKETS]], [[TMP17]] +; CHECK-NEXT: call void @llvm.experimental.vector.histogram.add.nxv8p0.i8( [[TMP18]], i8 1, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; CHECK-NEXT: [[INDEX_NEXT6]] = add nuw i64 [[INDEX4]], [[TMP15]] +; CHECK-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT6]], [[N_VEC3]] +; CHECK-NEXT: br i1 [[TMP19]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[FOR_BODY1]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK: vec.epilog.middle.block: +; CHECK-NEXT: [[CMP_N7:%.*]] = icmp eq i64 [[N]], [[N_VEC3]] +; CHECK-NEXT: br i1 [[CMP_N7]], label [[FOR_EXIT]], label [[SCALAR_PH]] +; CHECK: vec.epilog.scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC3]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] +; CHECK-NEXT: br label [[FOR_BODY2:%.*]] ; CHECK: for.body: -; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT1:%.*]], [[FOR_BODY1]] ] +; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT1:%.*]], [[FOR_BODY2]] ] ; CHECK-NEXT: [[GEP_INDICES:%.*]] = getelementptr inbounds i32, ptr [[INDICES]], i64 [[IV1]] ; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[GEP_INDICES]], align 4 ; CHECK-NEXT: [[IDXPROM1:%.*]] = zext i32 [[TMP0]] to i64 @@ -352,7 +384,7 @@ define void @histogram_8bit(ptr noalias %buckets, ptr readonly %indices, i64 %N) ; CHECK-NEXT: store i8 [[INC]], ptr [[ARRAYIDX2]], align 4 ; CHECK-NEXT: [[IV_NEXT1]] = add nuw nsw i64 [[IV1]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT1]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_EXIT]], label [[FOR_BODY1]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_EXIT]], label [[FOR_BODY2]], !llvm.loop [[LOOP12:![0-9]+]] ; CHECK: for.exit: ; CHECK-NEXT: ret void ; @@ -393,7 +425,7 @@ define void @histogram_float(ptr noalias %buckets, ptr readonly %indices, i64 %N ; CHECK-NEXT: store float [[INC]], ptr [[ARRAYIDX2]], align 4 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_EXIT:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_EXIT:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] ; CHECK: for.exit: ; CHECK-NEXT: ret void ; @@ -436,7 +468,7 @@ define void @histogram_varying_increment(ptr noalias %buckets, ptr readonly %ind ; CHECK-NEXT: store i32 [[INC]], ptr [[ARRAYIDX2]], align 4 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_EXIT:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP12]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_EXIT:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP13]] ; CHECK: for.exit: ; CHECK-NEXT: ret void ; @@ -494,7 +526,7 @@ define void @simple_histogram_user_interleave(ptr noalias %buckets, ptr readonly ; CHECK-NEXT: call void @llvm.experimental.vector.histogram.add.nxv4p0.i32( [[TMP21]], i32 1, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] ; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label 
[[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] @@ -512,7 +544,7 @@ define void @simple_histogram_user_interleave(ptr noalias %buckets, ptr readonly ; CHECK-NEXT: store i32 [[INC]], ptr [[ARRAYIDX2]], align 4 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] ; CHECK: for.exit: ; CHECK-NEXT: ret void ; @@ -564,7 +596,7 @@ define void @histogram_array_3op_gep(i64 noundef %N) #0 { ; CHECK-NEXT: call void @llvm.experimental.vector.histogram.add.nxv4p0.i32( [[TMP11]], i32 1, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]] ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] @@ -582,7 +614,7 @@ define void @histogram_array_3op_gep(i64 noundef %N) #0 { ; CHECK-NEXT: store i32 [[INC]], ptr [[ARRAYIDX6]], align 4 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] ; CHECK: for.exit: ; CHECK-NEXT: ret void ; @@ -634,7 +666,7 @@ define void @histogram_array_4op_gep_nonzero_const_idx(i64 noundef %N, ptr reado ; CHECK-NEXT: call void @llvm.experimental.vector.histogram.add.nxv4p0.i32( [[TMP7]], i32 1, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) ; CHECK-NEXT: [[IV_NEXT]] = add nuw i64 [[IV]], [[TMP4]] ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[IV_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] @@ -652,7 +684,7 @@ define void @histogram_array_4op_gep_nonzero_const_idx(i64 noundef %N, ptr reado ; CHECK-NEXT: store i32 [[INC]], ptr [[GEP_BUCKET]], align 4 ; CHECK-NEXT: [[IV_NEXT1]] = add nuw nsw i64 [[IV1]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT1]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_EXIT]], label [[FOR_BODY1]], !llvm.loop [[LOOP19:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_EXIT]], label [[FOR_BODY1]], !llvm.loop [[LOOP20:![0-9]+]] ; CHECK: for.exit: ; CHECK-NEXT: ret void ; @@ -701,13 +733,13 @@ define void @simple_histogram_tailfold(ptr noalias %buckets, ptr readonly %indic ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]] ; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call 
@llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP6]]) ; CHECK-NEXT: [[TMP11:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i64 0 -; CHECK-NEXT: br i1 [[TMP11]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP20:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP11]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP21:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: -; CHECK-NEXT: br i1 poison, label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]] +; CHECK-NEXT: br i1 poison, label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] ; CHECK: for.exit: ; CHECK-NEXT: ret void ; @@ -774,7 +806,7 @@ define void @simple_histogram_rtdepcheck(ptr noalias %buckets, ptr %array, ptr % ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]] ; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] ; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] @@ -795,7 +827,7 @@ define void @simple_histogram_rtdepcheck(ptr noalias %buckets, ptr %array, ptr % ; CHECK-NEXT: store i32 [[IV_TRUNC]], ptr [[IDX_ADDR]], align 4 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] ; CHECK: for.exit: ; CHECK-NEXT: ret void ; @@ -887,7 +919,7 @@ define void @simple_histogram_64b(ptr noalias %buckets, ptr readonly %indices, i ; CHECK-NEXT: call void @llvm.experimental.vector.histogram.add.nxv2p0.i64( [[TMP6]], i64 1, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]] ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] @@ -904,7 +936,7 @@ define void @simple_histogram_64b(ptr noalias %buckets, ptr readonly %indices, i ; CHECK-NEXT: store i64 [[INC]], ptr [[GEP_BUCKET]], align 4 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]] ; CHECK: for.exit: ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/type-shrinkage-zext-costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/type-shrinkage-zext-costs.ll index 
dec3c286345adfc51beb3134a36c15d7ced9c14e..691c743be7d74dbf2dfb341eb440631f0cceea1e 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/type-shrinkage-zext-costs.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/type-shrinkage-zext-costs.ll @@ -24,25 +24,25 @@ define void @zext_i8_i16(ptr noalias nocapture readonly %p, ptr noalias nocaptur ; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 ; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1 ; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 8 +; CHECK-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 16 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], [[TMP4]] ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 8 +; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 16 ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], [[TMP6]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]] ; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 8 +; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 16 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[INDEX]] -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP9]], align 1 -; CHECK-NEXT: [[TMP10:%.*]] = zext [[WIDE_LOAD]] to -; CHECK-NEXT: [[TMP11:%.*]] = add [[TMP10]], trunc ( shufflevector ( insertelement ( poison, i32 2, i64 0), poison, zeroinitializer) to ) +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP9]], align 1 +; CHECK-NEXT: [[TMP10:%.*]] = zext [[WIDE_LOAD]] to +; CHECK-NEXT: [[TMP11:%.*]] = add [[TMP10]], trunc ( shufflevector ( insertelement ( poison, i32 2, i64 0), poison, zeroinitializer) to ) ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i16, ptr [[Q]], i64 [[INDEX]] -; CHECK-NEXT: store [[TMP11]], ptr [[TMP12]], align 2 +; CHECK-NEXT: store [[TMP11]], ptr [[TMP12]], align 2 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]] ; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] @@ -107,25 +107,25 @@ define void @sext_i8_i16(ptr noalias nocapture readonly %p, ptr noalias nocaptur ; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 ; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1 ; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 8 +; CHECK-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 16 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], [[TMP4]] ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 8 +; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 16 ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], [[TMP6]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]] ; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 8 +; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 16 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ 
[[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[INDEX]] -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP9]], align 1 -; CHECK-NEXT: [[TMP10:%.*]] = sext [[WIDE_LOAD]] to -; CHECK-NEXT: [[TMP11:%.*]] = add [[TMP10]], trunc ( shufflevector ( insertelement ( poison, i32 2, i64 0), poison, zeroinitializer) to ) +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP9]], align 1 +; CHECK-NEXT: [[TMP10:%.*]] = sext [[WIDE_LOAD]] to +; CHECK-NEXT: [[TMP11:%.*]] = add [[TMP10]], trunc ( shufflevector ( insertelement ( poison, i32 2, i64 0), poison, zeroinitializer) to ) ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i16, ptr [[Q]], i64 [[INDEX]] -; CHECK-NEXT: store [[TMP11]], ptr [[TMP12]], align 2 +; CHECK-NEXT: store [[TMP11]], ptr [[TMP12]], align 2 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]] ; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/wider-VF-for-callinst.ll b/llvm/test/Transforms/LoopVectorize/AArch64/wider-VF-for-callinst.ll index 4a2f9d07ed91c640dbeac87a68265dde87c8990a..a1a13f1e0c377b21037cdb077575a1175cb0463d 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/wider-VF-for-callinst.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/wider-VF-for-callinst.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -passes=loop-vectorize,instsimplify -force-vector-interleave=1 -S | FileCheck %s --check-prefixes=WIDE -; RUN: opt < %s -passes=loop-vectorize,instsimplify -force-vector-interleave=1 -vectorizer-maximize-bandwidth-for-vector-calls=false -S | FileCheck %s --check-prefixes=NARROW +; RUN: opt < %s -passes=loop-vectorize,instsimplify -force-vector-interleave=1 -vectorizer-maximize-bandwidth=false -vectorizer-maximize-bandwidth-for-vector-calls=false -S | FileCheck %s --check-prefixes=NARROW target triple = "aarch64-unknown-linux-gnu" diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/first-order-recurrence-scalable-vf1.ll b/llvm/test/Transforms/LoopVectorize/RISCV/first-order-recurrence-scalable-vf1.ll index 2fd00d67a43e6cc1eb058c84d18be20d44e87bef..f4dfdacac1b321df2e4d3626544c2071ea21850c 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/first-order-recurrence-scalable-vf1.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/first-order-recurrence-scalable-vf1.ll @@ -18,7 +18,6 @@ define i64 @pr97452_scalable_vf1_for(ptr %src) #0 { ; CHECK-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x i64> [ , %[[VECTOR_PH]] ], [ [[WIDE_LOAD1:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0 ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 4 ; CHECK-NEXT: [[WIDE_LOAD1]] = load <4 x i64>, ptr [[TMP5]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cond-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cond-reduction.ll index 41796e848632e4f5091818d6e0e9d86f1f22a046..fc12dd54f88dfab101b8106cb1a4331361e813ab 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cond-reduction.ll +++ 
b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cond-reduction.ll @@ -70,7 +70,7 @@ define i32 @cond_add(ptr %a, i64 %n, i32 %start) { ; IF-EVL-INLOOP-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i32 0 ; IF-EVL-INLOOP-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP18]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP12]]) ; IF-EVL-INLOOP-NEXT: [[TMP19:%.*]] = icmp sgt <vscale x 4 x i32> [[VP_OP_LOAD]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer) -; IF-EVL-INLOOP-NEXT: [[TMP20:%.*]] = select <vscale x 4 x i1> [[TMP19]], <vscale x 4 x i32> [[VP_OP_LOAD]], <vscale x 4 x i32> zeroinitializer +; IF-EVL-INLOOP-NEXT: [[TMP20:%.*]] = call <vscale x 4 x i32> @llvm.vp.select.nxv4i32(<vscale x 4 x i1> [[TMP19]], <vscale x 4 x i32> [[VP_OP_LOAD]], <vscale x 4 x i32> zeroinitializer, i32 [[TMP12]]) ; IF-EVL-INLOOP-NEXT: [[TMP21:%.*]] = call i32 @llvm.vp.reduce.add.nxv4i32(i32 0, <vscale x 4 x i32> [[TMP20]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP12]]) ; IF-EVL-INLOOP-NEXT: [[TMP22]] = add i32 [[TMP21]], [[VEC_PHI]] ; IF-EVL-INLOOP-NEXT: [[TMP23:%.*]] = zext i32 [[TMP12]] to i64 diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll index 90c209cf3f518603e66ec61fe34f4e295f3e5144..1326751a847d7d5f2bdb25e4d745dea58e96d27b 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll @@ -37,7 +37,7 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) { ; IF-EVL-INLOOP-NEXT: vector loop: { ; IF-EVL-INLOOP-NEXT: vector.body: ; IF-EVL-INLOOP-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION -; IF-EVL-INLOOP-NEXT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI vp<[[EVL_PHI:%[0-9]+]]> = phi ir<0>, vp<[[IV_NEXT:%[0-9]+]]> +; IF-EVL-INLOOP-NEXT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI vp<[[EVL_PHI:%[0-9]+]]> = phi ir<0>, vp<[[IV_NEXT:%.+]]> ; IF-EVL-INLOOP-NEXT: WIDEN-REDUCTION-PHI ir<[[RDX_PHI:%.+]]> = phi ir<%start>, ir<[[RDX_NEXT:%.+]]> ; IF-EVL-INLOOP-NEXT: EMIT vp<[[AVL:%.+]]> = sub ir<%n>, vp<[[EVL_PHI]]> ; IF-EVL-INLOOP-NEXT: EMIT vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]> @@ -48,7 +48,7 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) { ; IF-EVL-INLOOP-NEXT: REDUCE ir<[[ADD:%.+]]> = ir<[[RDX_PHI]]> + vp.reduce.add (ir<[[LD1]]>, vp<[[EVL]]>) ; IF-EVL-INLOOP-NEXT: SCALAR-CAST vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64 ; IF-EVL-INLOOP-NEXT: EMIT vp<[[IV_NEXT]]> = add vp<[[CAST]]>, vp<[[EVL_PHI]]> -; IF-EVL-INLOOP-NEXT: EMIT vp<[[IV_NEXT_EXIT:%[0-9]+]]> = add vp<[[IV]]>, vp<[[VFUF]]> +; IF-EVL-INLOOP-NEXT: EMIT vp<[[IV_NEXT_EXIT:%.+]]> = add vp<[[IV]]>, vp<[[VFUF]]> ; IF-EVL-INLOOP-NEXT: EMIT branch-on-count vp<[[IV_NEXT_EXIT]]>, vp<[[VTC]]> ; IF-EVL-INLOOP-NEXT: No successors ; IF-EVL-INLOOP-NEXT: } @@ -86,7 +86,7 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) { ; NO-VP-OUTLOOP-NEXT: vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]> ; NO-VP-OUTLOOP-NEXT: WIDEN ir<[[LD1:%.+]]> = load vp<[[PTR1]]> ; NO-VP-OUTLOOP-NEXT: WIDEN ir<[[ADD:%.+]]> = add ir<[[LD1]]>, ir<[[RDX_PHI]]> -; NO-VP-OUTLOOP-NEXT: EMIT vp<[[IV_NEXT_EXIT:%[0-9]+]]> = add nuw vp<[[IV]]>, vp<[[VFUF]]> +; NO-VP-OUTLOOP-NEXT: EMIT vp<[[IV_NEXT_EXIT:%.+]]> = add nuw vp<[[IV]]>, vp<[[VFUF]]> ; NO-VP-OUTLOOP-NEXT: EMIT branch-on-count vp<[[IV_NEXT_EXIT]]>, vp<[[VTC]]> ; NO-VP-OUTLOOP-NEXT: No successors ; NO-VP-OUTLOOP-NEXT: } @@ -125,7 +125,7 @@ define i32 @reduction(ptr %a, i64 %n, i32
%start) { ; NO-VP-INLOOP-NEXT: vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]> ; NO-VP-INLOOP-NEXT: WIDEN ir<[[LD1:%.+]]> = load vp<[[PTR1]]> ; NO-VP-INLOOP-NEXT: REDUCE ir<[[ADD:%.+]]> = ir<[[RDX_PHI]]> + reduce.add (ir<[[LD1]]>) -; NO-VP-INLOOP-NEXT: EMIT vp<[[IV_NEXT_EXIT:%[0-9]+]]> = add nuw vp<[[IV]]>, vp<[[VFUF]]> +; NO-VP-INLOOP-NEXT: EMIT vp<[[IV_NEXT_EXIT:%.+]]> = add nuw vp<[[IV]]>, vp<[[VFUF]]> ; NO-VP-INLOOP-NEXT: EMIT branch-on-count vp<[[IV_NEXT_EXIT]]>, vp<[[VTC]]> ; NO-VP-INLOOP-NEXT: No successors ; NO-VP-INLOOP-NEXT: } diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics.ll index c14a8bce8f48d8222d1513afcf9dbb7b4cfac9c8..706b6f888298486482e23b3a1410a3aa3bf3db5d 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics.ll @@ -22,7 +22,7 @@ define void @foo(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) { ; IF-EVL-NEXT: vector loop: { ; IF-EVL-NEXT: vector.body: ; IF-EVL-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION -; IF-EVL-NEXT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI vp<[[EVL_PHI:%[0-9]+]]> = phi ir<0>, vp<[[IV_NEXT:%[0-9]+]]> +; IF-EVL-NEXT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI vp<[[EVL_PHI:%[0-9]+]]> = phi ir<0>, vp<[[IV_NEXT:%.+]]> ; IF-EVL-NEXT: EMIT vp<[[AVL:%.+]]> = sub ir<%N>, vp<[[EVL_PHI]]> ; IF-EVL-NEXT: EMIT vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]> ; IF-EVL-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[EVL_PHI]]>, ir<1> @@ -38,7 +38,7 @@ define void @foo(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) { ; IF-EVL-NEXT: WIDEN vp.store vp<[[PTR3]]>, ir<[[ADD]]>, vp<[[EVL]]> ; IF-EVL-NEXT: SCALAR-CAST vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64 ; IF-EVL-NEXT: EMIT vp<[[IV_NEXT]]> = add vp<[[CAST]]>, vp<[[EVL_PHI]]> -; IF-EVL-NEXT: EMIT vp<[[IV_NEXT_EXIT:%[0-9]+]]> = add vp<[[IV]]>, vp<[[VFUF]]> +; IF-EVL-NEXT: EMIT vp<[[IV_NEXT_EXIT:%.+]]> = add vp<[[IV]]>, vp<[[VFUF]]> ; IF-EVL-NEXT: EMIT branch-on-count vp<[[IV_NEXT_EXIT]]>, vp<[[VTC]]> ; IF-EVL-NEXT: No successors ; IF-EVL-NEXT: } @@ -65,7 +65,7 @@ define void @foo(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) { ; NO-VP-NEXT: CLONE ir<[[GEP3:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]> ; NO-VP-NEXT: vp<[[PTR3:%[0-9]+]]> = vector-pointer ir<[[GEP3]]> ; NO-VP-NEXT: WIDEN store vp<[[PTR3]]>, ir<[[ADD]]> -; NO-VP-NEXT: EMIT vp<[[IV_NEXT:%[0-9]+]]> = add nuw vp<[[IV]]>, vp<[[VFUF]]> +; NO-VP-NEXT: EMIT vp<[[IV_NEXT:%.+]]> = add nuw vp<[[IV]]>, vp<[[VFUF]]> ; NO-VP-NEXT: EMIT branch-on-count vp<[[IV_NEXT]]>, vp<[[VTC]]> ; NO-VP-NEXT: No successors ; NO-VP-NEXT: } @@ -110,7 +110,7 @@ define void @safe_dep(ptr %p) { ; CHECK-NEXT: CLONE ir<[[GEP2:%.+]]> = getelementptr ir<%p>, ir<[[OFFSET]]> ; CHECK-NEXT: vp<[[PTR2:%[0-9]+]]> = vector-pointer ir<[[GEP2]]> ; CHECK-NEXT: WIDEN store vp<[[PTR2]]>, ir<[[V]]> -; CHECK-NEXT: EMIT vp<[[IV_NEXT:%[0-9]+]]> = add nuw vp<[[IV]]>, vp<[[VFUF]]> +; CHECK-NEXT: EMIT vp<[[IV_NEXT:%.+]]> = add nuw vp<[[IV]]>, vp<[[VFUF]]> ; CHECK-NEXT: EMIT branch-on-count vp<[[IV_NEXT]]>, vp<[[VTC]]> ; CHECK-NEXT: No successors ; CHECK-NEXT: } diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-select-intrinsics.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-select-intrinsics.ll new file mode 100644 index 0000000000000000000000000000000000000000..6d6cfb5e9d18edd62427d94fec098a4ca27cb42c --- /dev/null +++ 
b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-select-intrinsics.ll @@ -0,0 +1,65 @@ +; REQUIRES: asserts + + ; RUN: opt -passes=loop-vectorize -debug-only=loop-vectorize \ + ; RUN: -force-tail-folding-style=data-with-evl \ + ; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \ + ; RUN: -mtriple=riscv64 -mattr=+v -riscv-v-vector-bits-max=128 -disable-output < %s 2>&1 | FileCheck --check-prefix=IF-EVL %s + + define void @vp_select(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) { + ; IF-EVL: VPlan 'Final VPlan for VF={vscale x 1,vscale x 2,vscale x 4},UF={1}' { + ; IF-EVL-NEXT: Live-in vp<[[VFUF:%[0-9]+]]> = VF * UF + ; IF-EVL-NEXT: Live-in vp<[[VTC:%[0-9]+]]> = vector-trip-count + ; IF-EVL-NEXT: Live-in ir<%N> = original trip-count + + ; IF-EVL: vector.ph: + ; IF-EVL-NEXT: Successor(s): vector loop + + ; IF-EVL: vector loop: { + ; IF-EVL-NEXT: vector.body: + ; IF-EVL-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION + ; IF-EVL-NEXT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI vp<[[EVL_PHI:%[0-9]+]]> = phi ir<0>, vp<[[IV_NEX:%.+]]> + ; IF-EVL-NEXT: EMIT vp<[[AVL:%.+]]> = sub ir<%N>, vp<[[EVL_PHI]]> + ; IF-EVL-NEXT: EMIT vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]> + ; IF-EVL-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[EVL_PHI]]>, ir<1> + ; IF-EVL-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%b>, vp<[[ST]]> + ; IF-EVL-NEXT: vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]> + ; IF-EVL-NEXT: WIDEN ir<[[LD1:%.+]]> = vp.load vp<[[PTR1]]>, vp<[[EVL]]> + ; IF-EVL-NEXT: CLONE ir<[[GEP2:%.+]]> = getelementptr inbounds ir<%c>, vp<[[ST]]> + ; IF-EVL-NEXT: vp<[[PTR2:%[0-9]+]]> = vector-pointer ir<[[GEP2]]> + ; IF-EVL-NEXT: WIDEN ir<[[LD2:%.+]]> = vp.load vp<[[PTR2]]>, vp<[[EVL]]> + ; IF-EVL-NEXT: WIDEN ir<[[CMP:%.+]]> = icmp sgt ir<[[LD1]]>, ir<[[LD2]]> + ; IF-EVL-NEXT: WIDEN ir<[[SUB:%.+]]> = vp.sub ir<0>, ir<[[LD2]]>, vp<[[EVL]]> + ; IF-EVL-NEXT: WIDEN-INTRINSIC vp<[[SELECT:%.+]]> = call llvm.vp.select(ir<[[CMP]]>, ir<%1>, ir<%2>, vp<[[EVL]]>) + ; IF-EVL-NEXT: WIDEN ir<[[ADD:%.+]]> = vp.add vp<[[SELECT]]>, ir<[[LD1]]>, vp<[[EVL]]> + ; IF-EVL-NEXT: CLONE ir<[[GEP3:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]> + ; IF-EVL-NEXT: vp<[[PTR3:%.+]]> = vector-pointer ir<[[GEP3]]> + ; IF-EVL-NEXT: WIDEN vp.store vp<[[PTR3]]>, ir<[[ADD]]>, vp<[[EVL]]> + ; IF-EVL-NEXT: SCALAR-CAST vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64 + ; IF-EVL-NEXT: EMIT vp<[[IV_NEX]]> = add vp<[[CAST]]>, vp<[[EVL_PHI]]> + ; IF-EVL-NEXT: EMIT vp<[[IV_NEXT_EXIT:%.+]]> = add vp<[[IV]]>, vp<[[VFUF]]> + ; IF-EVL-NEXT: EMIT branch-on-count vp<[[IV_NEXT_EXIT]]>, vp<[[VTC]]> + ; IF-EVL-NEXT: No successors + ; IF-EVL-NEXT: } + + entry: + br label %for.body + + for.body: + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds i32, ptr %b, i64 %indvars.iv + %0 = load i32, ptr %arrayidx, align 4 + %arrayidx3 = getelementptr inbounds i32, ptr %c, i64 %indvars.iv + %1 = load i32, ptr %arrayidx3, align 4 + %cmp4 = icmp sgt i32 %0, %1 + %2 = sub i32 0, %1 + %cond.p = select i1 %cmp4, i32 %1, i32 %2 + %cond = add i32 %cond.p, %0 + %arrayidx15 = getelementptr inbounds i32, ptr %a, i64 %indvars.iv + store i32 %cond, ptr %arrayidx15, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %N + br i1 %exitcond.not, label %exit, label %for.body + + exit: + ret void + } diff --git a/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll b/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll index 
8e5bf27acc64f1b1b92d15cb1fe2aa4708a37f33..73647919aac3602daa81152eae929f19a3968b5a 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll @@ -858,10 +858,6 @@ define void @reduction_store(ptr noalias %src, ptr %dst, i1 %x) #2 { ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[INDEX]], 0 -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i32 [[TMP5]] -; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0 -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 4 ; CHECK-NEXT: [[TMP11]] = and <4 x i32> [[VEC_PHI]], [[TMP2]] ; CHECK-NEXT: [[TMP12]] = and <4 x i32> [[VEC_PHI1]], [[TMP2]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8 @@ -879,7 +875,7 @@ define void @reduction_store(ptr noalias %src, ptr %dst, i1 %x) #2 { ; CHECK: loop: ; CHECK-NEXT: [[RED:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[RED_NEXT:%.*]], [[LOOP]] ] ; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[IV]] +; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i32 [[IV]] ; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[GEP_SRC]], align 4 ; CHECK-NEXT: [[L_AND:%.*]] = and i32 [[L]], 3 ; CHECK-NEXT: store i32 [[L_AND]], ptr [[DST]], align 4 diff --git a/llvm/test/Transforms/LoopVectorize/X86/vplan-vp-intrinsics.ll b/llvm/test/Transforms/LoopVectorize/X86/vplan-vp-intrinsics.ll index 9b49d44141db39afb48bfe04c2d5c2dd82a8baaa..1af03e740ef1abb6f58efeb0e98cfa3254318fd5 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/vplan-vp-intrinsics.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/vplan-vp-intrinsics.ll @@ -36,7 +36,7 @@ define void @foo(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) { ; IF-EVL-NEXT: CLONE ir<[[GEP3:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]> ; IF-EVL-NEXT: vp<[[PTR3:%[0-9]+]]> = vector-pointer ir<[[GEP3]]> ; IF-EVL-NEXT: WIDEN store vp<[[PTR3]]>, ir<[[ADD]]>, vp<[[MASK]]> -; IF-EVL-NEXT: EMIT vp<[[IV_NEXT:%[0-9]+]]> = add vp<[[IV]]>, vp<[[VFUF]]> +; IF-EVL-NEXT: EMIT vp<[[IV_NEXT:%.+]]> = add vp<[[IV]]>, vp<[[VFUF]]> ; IF-EVL-NEXT: EMIT branch-on-count vp<[[IV_NEXT]]>, vp<[[VTC]]> ; IF-EVL-NEXT: No successors ; IF-EVL-NEXT: } @@ -63,7 +63,7 @@ define void @foo(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) { ; NO-VP-NEXT: CLONE ir<[[GEP3:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]> ; NO-VP-NEXT: vp<[[PTR3:%[0-9]+]]> = vector-pointer ir<[[GEP3]]> ; NO-VP-NEXT: WIDEN store vp<[[PTR3]]>, ir<[[ADD]]> -; NO-VP-NEXT: EMIT vp<[[IV_NEXT:%[0-9]+]]> = add nuw vp<[[IV]]>, vp<[[VFUF]]> +; NO-VP-NEXT: EMIT vp<[[IV_NEXT:%.+]]> = add nuw vp<[[IV]]>, vp<[[VFUF]]> ; NO-VP-NEXT: EMIT branch-on-count vp<[[IV_NEXT]]>, vp<[[VTC]]> ; NO-VP-NEXT: No successors ; NO-VP-NEXT: } diff --git a/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll b/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll index 968058134690bcc68f0990ca3a31646198a71742..d25b03943fa217a531269cec97b90326ead37fb0 100644 --- 
a/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll @@ -153,7 +153,7 @@ define dso_local void @masked_strided1(ptr noalias nocapture readonly %p, ptr no ; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP4]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; ENABLED_MASKED_STRIDED: for.body: ; ENABLED_MASKED_STRIDED-NEXT: [[IX_09:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_INC:%.*]] ], [ 1016, [[VECTOR_BODY]] ] -; ENABLED_MASKED_STRIDED-NEXT: [[CMP1:%.*]] = icmp ugt i32 [[IX_09]], [[CONV]] +; ENABLED_MASKED_STRIDED-NEXT: [[CMP1:%.*]] = icmp samesign ugt i32 [[IX_09]], [[CONV]] ; ENABLED_MASKED_STRIDED-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]] ; ENABLED_MASKED_STRIDED: if.then: ; ENABLED_MASKED_STRIDED-NEXT: [[MUL:%.*]] = shl nuw nsw i32 [[IX_09]], 1 diff --git a/llvm/test/Transforms/LoopVectorize/dead_instructions.ll b/llvm/test/Transforms/LoopVectorize/dead_instructions.ll index aae9393bbe0dd2c28fca077e2affc0d50b3bdd7f..8b5dd4211d8505157d62de52ca422a390f911a59 100644 --- a/llvm/test/Transforms/LoopVectorize/dead_instructions.ll +++ b/llvm/test/Transforms/LoopVectorize/dead_instructions.ll @@ -159,9 +159,6 @@ define void @dead_load_and_vector_pointer(ptr %a, ptr %b) { ; CHECK-NEXT: [[TMP7:%.*]] = add <2 x i32> [[WIDE_LOAD2]], ; CHECK-NEXT: store <2 x i32> [[TMP6]], ptr [[TMP4]], align 4, !alias.scope [[META6]], !noalias [[META9]] ; CHECK-NEXT: store <2 x i32> [[TMP7]], ptr [[TMP5]], align 4, !alias.scope [[META6]], !noalias [[META9]] -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[TMP0]] -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 0 -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 2 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 ; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i32 [[INDEX_NEXT]], 128 ; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/early_exit_legality.ll b/llvm/test/Transforms/LoopVectorize/early_exit_legality.ll new file mode 100644 index 0000000000000000000000000000000000000000..21433477c1d7a301b4284350b047679fc76daadf --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/early_exit_legality.ll @@ -0,0 +1,542 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; REQUIRES: asserts +; RUN: opt -S < %s -p loop-vectorize -debug-only=loop-vectorize -disable-output 2>&1 | FileCheck %s + +declare void @init_mem(ptr, i64); + +; == SOME LEGAL EXAMPLES == + +; The form of the induction variables requires SCEV predicates. +define i32 @diff_exit_block_needs_scev_check(i32 %end) { +; CHECK-LABEL: LV: Checking a loop in 'diff_exit_block_needs_scev_check' +; CHECK: Found an early exit loop with symbolic max backedge taken count: (-1 + (1 umax (zext i10 (trunc i32 %end to i10) to i32))) +; CHECK-NEXT: LV: We can vectorize this loop! +; CHECK-NEXT: LV: Not vectorizing: Auto-vectorization of loops with uncountable early exit is not yet supported. 
+entry: + %p1 = alloca [1024 x i32] + %p2 = alloca [1024 x i32] + call void @init_mem(ptr %p1, i64 1024) + call void @init_mem(ptr %p2, i64 1024) + %end.clamped = and i32 %end, 1023 + br label %for.body + +for.body: + %ind = phi i8 [ %ind.next, %for.inc ], [ 0, %entry ] + %gep.ind = phi i64 [ %gep.ind.next, %for.inc ], [ 0, %entry ] + %arrayidx1 = getelementptr inbounds i32, ptr %p1, i64 %gep.ind + %0 = load i32, ptr %arrayidx1, align 4 + %arrayidx2 = getelementptr inbounds i32, ptr %p2, i64 %gep.ind + %1 = load i32, ptr %arrayidx2, align 4 + %cmp.early = icmp eq i32 %0, %1 + br i1 %cmp.early, label %found, label %for.inc + +for.inc: + %ind.next = add i8 %ind, 1 + %conv = zext i8 %ind.next to i32 + %gep.ind.next = add i64 %gep.ind, 1 + %cmp = icmp ult i32 %conv, %end.clamped + br i1 %cmp, label %for.body, label %exit + +found: + ret i32 1 + +exit: + ret i32 0 +} + + +define i64 @same_exit_block_pre_inc_use1() { +; CHECK-LABEL: LV: Checking a loop in 'same_exit_block_pre_inc_use1' +; CHECK: LV: Found an early exit loop with symbolic max backedge taken count: 63 +; CHECK-NEXT: LV: We can vectorize this loop! +; CHECK-NEXT: LV: Not vectorizing: Auto-vectorization of loops with uncountable early exit is not yet supported. +entry: + %p1 = alloca [1024 x i8] + %p2 = alloca [1024 x i8] + call void @init_mem(ptr %p1, i64 1024) + call void @init_mem(ptr %p2, i64 1024) + br label %loop + +loop: + %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] + %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index + %ld1 = load i8, ptr %arrayidx, align 1 + %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index + %ld2 = load i8, ptr %arrayidx1, align 1 + %cmp3 = icmp eq i8 %ld1, %ld2 + br i1 %cmp3, label %loop.inc, label %loop.end + +loop.inc: + %index.next = add i64 %index, 1 + %exitcond = icmp ne i64 %index.next, 67 + br i1 %exitcond, label %loop, label %loop.end + +loop.end: + %retval = phi i64 [ %index, %loop ], [ 67, %loop.inc ] + ret i64 %retval +} + + +define i64 @loop_contains_safe_call() { +; CHECK-LABEL: LV: Checking a loop in 'loop_contains_safe_call' +; CHECK: LV: Found an early exit loop with symbolic max backedge taken count: 63 +; CHECK-NEXT: LV: We can vectorize this loop! +entry: + %p1 = alloca [1024 x i8] + %p2 = alloca [1024 x i8] + call void @init_mem(ptr %p1, i64 1024) + call void @init_mem(ptr %p2, i64 1024) + br label %loop + +loop: + %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] + %arrayidx = getelementptr inbounds float, ptr %p1, i64 %index + %ld1 = load float, ptr %arrayidx, align 1 + %sqrt = tail call fast float @llvm.sqrt.f32(float %ld1) + %cmp = fcmp fast ult float %sqrt, 3.0e+00 + br i1 %cmp, label %loop.inc, label %loop.end + +loop.inc: + %index.next = add i64 %index, 1 + %exitcond = icmp ne i64 %index.next, 67 + br i1 %exitcond, label %loop, label %loop.end + +loop.end: + %retval = phi i64 [ %index, %loop ], [ 67, %loop.inc ] + ret i64 %retval +} + + +define i64 @loop_contains_safe_div() { +; CHECK-LABEL: LV: Checking a loop in 'loop_contains_safe_div' +; CHECK: LV: Found an early exit loop with symbolic max backedge taken count: 63 +; CHECK-NEXT: LV: We can vectorize this loop! 
+entry: + %p1 = alloca [1024 x i8] + %p2 = alloca [1024 x i8] + call void @init_mem(ptr %p1, i64 1024) + call void @init_mem(ptr %p2, i64 1024) + br label %loop + +loop: + %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] + %arrayidx = getelementptr inbounds i32, ptr %p1, i64 %index + %ld1 = load i32, ptr %arrayidx, align 1 + %div = udiv i32 %ld1, 20000 + %cmp = icmp eq i32 %div, 1 + br i1 %cmp, label %loop.inc, label %loop.end + +loop.inc: + %index.next = add i64 %index, 1 + %exitcond = icmp ne i64 %index.next, 67 + br i1 %exitcond, label %loop, label %loop.end + +loop.end: + %retval = phi i64 [ %index, %loop ], [ 67, %loop.inc ] + ret i64 %retval +} + + +define i64 @loop_contains_load_after_early_exit(ptr dereferenceable(1024) align(8) %p2) { +; CHECK-LABEL: LV: Checking a loop in 'loop_contains_load_after_early_exit' +; CHECK: LV: Found an early exit loop with symbolic max backedge taken count: 63 +; CHECK-NEXT: LV: We can vectorize this loop! +; CHECK-NEXT: LV: Not vectorizing: Auto-vectorization of loops with uncountable early exit is not yet supported. +entry: + %p1 = alloca [1024 x i8] + call void @init_mem(ptr %p1, i64 1024) + br label %loop + +loop: + %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] + %arrayidx = getelementptr inbounds i32, ptr %p1, i64 %index + %ld1 = load i32, ptr %arrayidx, align 1 + %cmp = icmp eq i32 %ld1, 1 + br i1 %cmp, label %loop.inc, label %loop.end + +loop.inc: + %arrayidx2 = getelementptr inbounds i64, ptr %p2, i64 %index + %ld2 = load i64, ptr %arrayidx2, align 8 + %index.next = add i64 %index, 1 + %exitcond = icmp ne i64 %index.next, 67 + br i1 %exitcond, label %loop, label %loop.end + +loop.end: + %retval = phi i64 [ %index, %loop ], [ %ld2, %loop.inc ] + ret i64 %retval +} + + +; == SOME ILLEGAL EXAMPLES == + + +define i64 @same_exit_block_pre_inc_use1_too_small_allocas() { +; CHECK-LABEL: LV: Checking a loop in 'same_exit_block_pre_inc_use1_too_small_allocas' +; CHECK: LV: Not vectorizing: Loop may fault. +entry: + %p1 = alloca [42 x i8] + %p2 = alloca [42 x i8] + call void @init_mem(ptr %p1, i64 1024) + call void @init_mem(ptr %p2, i64 1024) + br label %loop + +loop: + %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] + %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index + %ld1 = load i8, ptr %arrayidx, align 1 + %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index + %ld2 = load i8, ptr %arrayidx1, align 1 + %cmp3 = icmp eq i8 %ld1, %ld2 + br i1 %cmp3, label %loop.inc, label %loop.end + +loop.inc: + %index.next = add i64 %index, 1 + %exitcond = icmp ne i64 %index.next, 67 + br i1 %exitcond, label %loop, label %loop.end + +loop.end: + %retval = phi i64 [ %index, %loop ], [ 67, %loop.inc ] + ret i64 %retval +} + + +define i64 @same_exit_block_pre_inc_use1_too_small_deref_ptrs(ptr dereferenceable(42) %p1, ptr dereferenceable(42) %p2) { +; CHECK-LABEL: LV: Checking a loop in 'same_exit_block_pre_inc_use1_too_small_deref_ptrs' +; CHECK: LV: Not vectorizing: Loop may fault. 
+entry: + br label %loop + +loop: + %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] + %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index + %ld1 = load i8, ptr %arrayidx, align 1 + %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index + %ld2 = load i8, ptr %arrayidx1, align 1 + %cmp3 = icmp eq i8 %ld1, %ld2 + br i1 %cmp3, label %loop.inc, label %loop.end + +loop.inc: + %index.next = add i64 %index, 1 + %exitcond = icmp ne i64 %index.next, 67 + br i1 %exitcond, label %loop, label %loop.end + +loop.end: + %retval = phi i64 [ %index, %loop ], [ 67, %loop.inc ] + ret i64 %retval +} + + +define i64 @same_exit_block_pre_inc_use1_unknown_ptrs(ptr %p1, ptr %p2) { +; CHECK-LABEL: LV: Checking a loop in 'same_exit_block_pre_inc_use1_unknown_ptrs' +; CHECK: LV: Not vectorizing: Loop may fault. +entry: + br label %loop + +loop: + %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] + %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index + %ld1 = load i8, ptr %arrayidx, align 1 + %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index + %ld2 = load i8, ptr %arrayidx1, align 1 + %cmp3 = icmp eq i8 %ld1, %ld2 + br i1 %cmp3, label %loop.inc, label %loop.end + +loop.inc: + %index.next = add i64 %index, 1 + %exitcond = icmp ne i64 %index.next, 67 + br i1 %exitcond, label %loop, label %loop.end + +loop.end: + %retval = phi i64 [ %index, %loop ], [ 67, %loop.inc ] + ret i64 %retval +} + + +; The early exit (i.e. unknown exit-not-taken count) is the latch - we don't +; support this yet. +define i64 @uncountable_exit_on_last_block() { +; CHECK-LABEL: LV: Checking a loop in 'uncountable_exit_on_last_block' +; CHECK: LV: Not vectorizing: Early exit is not the latch predecessor. +entry: + %p1 = alloca [1024 x i8] + %p2 = alloca [1024 x i8] + call void @init_mem(ptr %p1, i64 1024) + call void @init_mem(ptr %p2, i64 1024) + br label %loop + +loop: + %index = phi i64 [ %index.next, %search ], [ 3, %entry ] + %index.next = add i64 %index, 1 + %exitcond = icmp ne i64 %index.next, 67 + br i1 %exitcond, label %search, label %loop.end + +search: + %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index + %ld1 = load i8, ptr %arrayidx, align 1 + %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index + %ld2 = load i8, ptr %arrayidx1, align 1 + %cmp3 = icmp eq i8 %ld1, %ld2 + br i1 %cmp3, label %loop.end, label %loop + +loop.end: + %retval = phi i64 [ 64, %loop ], [ %index, %search ] + ret i64 %retval +} + + +; We don't currently support multiple uncountable early exits. +define i64 @multiple_uncountable_exits() { +; CHECK-LABEL: LV: Checking a loop in 'multiple_uncountable_exits' +; CHECK: LV: Not vectorizing: Loop has too many uncountable exits. 
+entry: + %p1 = alloca [1024 x i8] + %p2 = alloca [1024 x i8] + call void @init_mem(ptr %p1, i64 1024) + call void @init_mem(ptr %p2, i64 1024) + br label %search1 + +search1: + %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] + %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index + %ld1 = load i8, ptr %arrayidx, align 1 + %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index + %ld2 = load i8, ptr %arrayidx1, align 1 + %cmp1 = icmp eq i8 %ld1, %ld2 + br i1 %cmp1, label %loop.end, label %search2 + +search2: + %cmp2 = icmp ult i8 %ld1, 34 + br i1 %cmp2, label %loop.end, label %loop.inc + +loop.inc: + %index.next = add i64 %index, 1 + %exitcond = icmp ne i64 %index.next, 67 + br i1 %exitcond, label %search1, label %loop.end + +loop.end: + %retval = phi i64 [ %index, %search1 ], [ 100, %search2 ], [ 43, %loop.inc ] + ret i64 %retval +} + + +define i64 @uncountable_exit_infinite_loop() { +; CHECK-LABEL: LV: Checking a loop in 'uncountable_exit_infinite_loop' +; CHECK: LV: Not vectorizing: Cannot determine exact exit count for latch block. +entry: + %p1 = alloca [1024 x i8] + %p2 = alloca [1024 x i8] + call void @init_mem(ptr %p1, i64 1024) + call void @init_mem(ptr %p2, i64 1024) + br label %loop + +loop: + %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] + %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index + %ld1 = load i8, ptr %arrayidx, align 1 + %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index + %ld2 = load i8, ptr %arrayidx1, align 1 + %cmp3 = icmp eq i8 %ld1, %ld2 + br i1 %cmp3, label %loop.inc, label %loop.end + +loop.inc: + %index.next = add i64 %index, 1 + %exitcond = icmp ne i64 %index.next, 67 + br label %loop + +loop.end: + %retval = phi i64 [ %index, %loop ] + ret i64 %retval +} + + +define i64 @loop_contains_unsafe_call() { +; CHECK-LABEL: LV: Checking a loop in 'loop_contains_unsafe_call' +; CHECK: LV: Not vectorizing: Early exit loop contains operations that cannot be speculatively executed. +entry: + %p1 = alloca [1024 x i8] + %p2 = alloca [1024 x i8] + call void @init_mem(ptr %p1, i64 1024) + call void @init_mem(ptr %p2, i64 1024) + br label %loop + +loop: + %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] + %arrayidx = getelementptr inbounds i32, ptr %p1, i64 %index + %ld1 = load i32, ptr %arrayidx, align 1 + %bad_call = call i32 @foo(i32 %ld1) #0 + %cmp = icmp eq i32 %bad_call, 34 + br i1 %cmp, label %loop.inc, label %loop.end + +loop.inc: + %index.next = add i64 %index, 1 + %exitcond = icmp ne i64 %index.next, 67 + br i1 %exitcond, label %loop, label %loop.end + +loop.end: + %retval = phi i64 [ %index, %loop ], [ 67, %loop.inc ] + ret i64 %retval +} + + +define i64 @loop_contains_unsafe_div() { +; CHECK-LABEL: LV: Checking a loop in 'loop_contains_unsafe_div' +; CHECK: LV: Not vectorizing: Early exit loop contains operations that cannot be speculatively executed. 
+entry: + %p1 = alloca [1024 x i8] + %p2 = alloca [1024 x i8] + call void @init_mem(ptr %p1, i64 1024) + call void @init_mem(ptr %p2, i64 1024) + br label %loop + +loop: + %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] + %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index + %ld1 = load i32, ptr %arrayidx, align 1 + %div = udiv i32 20000, %ld1 + %cmp = icmp eq i32 %div, 1 + br i1 %cmp, label %loop.inc, label %loop.end + +loop.inc: + %index.next = add i64 %index, 1 + %exitcond = icmp ne i64 %index.next, 67 + br i1 %exitcond, label %loop, label %loop.end + +loop.end: + %retval = phi i64 [ %index, %loop ], [ 67, %loop.inc ] + ret i64 %retval +} + + +define i64 @loop_contains_store(ptr %dest) { +; CHECK-LABEL: LV: Checking a loop in 'loop_contains_store' +; CHECK: LV: Not vectorizing: Writes to memory unsupported in early exit loops +entry: + %p1 = alloca [1024 x i8] + call void @init_mem(ptr %p1, i64 1024) + br label %loop + +loop: + %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] + %arrayidx = getelementptr inbounds i32, ptr %p1, i64 %index + %ld1 = load i32, ptr %arrayidx, align 1 + %arrayidx2 = getelementptr inbounds i32, ptr %dest, i64 %index + store i32 %ld1, ptr %arrayidx2, align 4 + %cmp = icmp eq i32 %ld1, 1 + br i1 %cmp, label %loop.inc, label %loop.end + +loop.inc: + %index.next = add i64 %index, 1 + %exitcond = icmp ne i64 %index.next, 67 + br i1 %exitcond, label %loop, label %loop.end + +loop.end: + %retval = phi i64 [ %index, %loop ], [ 67, %loop.inc ] + ret i64 %retval +} + + +define i64 @uncountable_exit_in_conditional_block(ptr %mask) { +; CHECK-LABEL: LV: Checking a loop in 'uncountable_exit_in_conditional_block' +; CHECK: LV: Not vectorizing: Early exit is not the latch predecessor. +entry: + %p1 = alloca [1024 x i8] + %p2 = alloca [1024 x i8] + call void @init_mem(ptr %p1, i64 1024) + call void @init_mem(ptr %p2, i64 1024) + br label %loop + +loop: + %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] + %arrayidx1 = getelementptr inbounds i8, ptr %mask, i64 %index + %ld1 = load i8, ptr %arrayidx1, align 1 + %cmp1 = icmp ne i8 %ld1, 0 + br i1 %cmp1, label %loop.search, label %loop.inc + +loop.search: + %arrayidx2 = getelementptr inbounds i8, ptr %p1, i64 %index + %ld2 = load i8, ptr %arrayidx2, align 1 + %arrayidx3 = getelementptr inbounds i8, ptr %p2, i64 %index + %ld3 = load i8, ptr %arrayidx3, align 1 + %cmp2 = icmp eq i8 %ld2, %ld3 + br i1 %cmp2, label %loop.inc, label %loop.end + +loop.inc: + %index.next = add i64 %index, 1 + %exitcond = icmp ne i64 %index.next, 67 + br i1 %exitcond, label %loop, label %loop.end + +loop.end: + %retval = phi i64 [ %index, %loop.search ], [ 67, %loop.inc ] + ret i64 %retval +} + + +define i64 @same_exit_block_pre_inc_use1_with_reduction() { +; CHECK-LABEL: LV: Checking a loop in 'same_exit_block_pre_inc_use1_with_reduction' +; CHECK: LV: Not vectorizing: Found reductions or recurrences in early-exit loop. 
+entry: + %p1 = alloca [1024 x i8] + %p2 = alloca [1024 x i8] + call void @init_mem(ptr %p1, i64 1024) + call void @init_mem(ptr %p2, i64 1024) + br label %loop + +loop: + %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] + %red = phi i64 [ %red.next, %loop.inc ], [ 0, %entry ] + %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index + %ld1 = load i8, ptr %arrayidx, align 1 + %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index + %ld2 = load i8, ptr %arrayidx1, align 1 + %ld2.zext = zext i8 %ld2 to i64 + %red.next = add i64 %red, %ld2.zext + %cmp3 = icmp eq i8 %ld1, %ld2 + br i1 %cmp3, label %loop.inc, label %loop.end + +loop.inc: + %index.next = add i64 %index, 1 + %exitcond = icmp ne i64 %index.next, 67 + br i1 %exitcond, label %loop, label %loop.end + +loop.end: + %final.ind = phi i64 [ %index, %loop ], [ 67, %loop.inc ] + %retval = add i64 %red.next, %final.ind + ret i64 %retval +} + + +define i64 @uncountable_exit_has_multiple_outside_successors() { +; CHECK-LABEL: LV: Checking a loop in 'uncountable_exit_has_multiple_outside_successors' +; CHECK: LV: Not vectorizing: Loop contains an unsupported switch +entry: + %p1 = alloca [1024 x i8] + call void @init_mem(ptr %p1, i64 1024) + br label %loop + +loop: + %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] + %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index + %ld1 = load i8, ptr %arrayidx, align 1 + switch i8 %ld1, label %loop.inc [ + i8 2, label %loop.end + i8 3, label %loop.surprise + ] + +loop.inc: + %index.next = add i64 %index, 1 + %exitcond = icmp ne i64 %index.next, 67 + br i1 %exitcond, label %loop, label %loop.end + +loop.surprise: + ret i64 3 + +loop.end: + %retval = phi i64 [ %index, %loop ], [ 67, %loop.inc ] + ret i64 %retval +} + + +declare i32 @foo(i32) readonly +declare <vscale x 4 x i32> @foo_vec(<vscale x 4 x i32>) + +attributes #0 = { "vector-function-abi-variant"="_ZGVsNxv_foo(foo_vec)" } diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains-vplan.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains-vplan.ll index 5e4ea2c0bfc508597a7c8e753c316af093261c33..9de675b2853097325e7eeceed4ed522dc1083cb9 100644 --- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains-vplan.ll +++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains-vplan.ll @@ -34,7 +34,7 @@ define void @test_chained_first_order_recurrences_1(ptr %ptr) { ; CHECK-EMPTY: ; CHECK-NEXT: middle.block: ; CHECK-NEXT: EMIT vp<[[RESUME_1:%.+]]> = extract-from-end ir<%for.1.next>, ir<1> -; CHECK-NEXT: EMIT vp<[[RESUME_2:%.+]]> = extract-from-end vp<[[FOR1_SPLICE]]>, ir<1> +; CHECK-NEXT: EMIT vp<[[RESUME_2:%.+]]>.1 = extract-from-end vp<[[FOR1_SPLICE]]>, ir<1> ; CHECK-NEXT: EMIT vp<[[CMP:%.+]]> = icmp eq ir<1000>, vp<[[VTC]]> ; CHECK-NEXT: EMIT branch-on-cond vp<[[CMP]]> ; CHECK-NEXT: Successor(s): ir-bb<exit>, scalar.ph @@ -44,11 +44,11 @@ define void @test_chained_first_order_recurrences_1(ptr %ptr) { ; CHECK-EMPTY: ; CHECK-NEXT: scalar.ph ; CHECK-NEXT: EMIT vp<[[RESUME_1_P:%.*]]> = resume-phi vp<[[RESUME_1]]>, ir<22> -; CHECK-NEXT: EMIT vp<[[RESUME_2_P:%.*]]> = resume-phi vp<[[RESUME_2]]>, ir<33> +; CHECK-NEXT: EMIT vp<[[RESUME_2_P:%.*]]>.1 = resume-phi vp<[[RESUME_2]]>.1, ir<33> ; CHECK-NEXT: No successors ; CHECK-EMPTY: ; CHECK-NEXT: Live-out i16 %for.1 = vp<[[RESUME_1_P]]> -; CHECK-NEXT: Live-out i16 %for.2 = vp<[[RESUME_2_P]]> +; CHECK-NEXT: Live-out i16 %for.2 = vp<[[RESUME_2_P]]>.1 ; CHECK-NEXT: } ; entry: @@ -105,8 +105,8 @@ define void @test_chained_first_order_recurrences_3(ptr %ptr) { ;
CHECK-EMPTY: ; CHECK-NEXT: middle.block: ; CHECK-NEXT: EMIT vp<[[RESUME_1:%.+]]> = extract-from-end ir<%for.1.next>, ir<1> -; CHECK-NEXT: EMIT vp<[[RESUME_2:%.+]]> = extract-from-end vp<[[FOR1_SPLICE]]>, ir<1> -; CHECK-NEXT: EMIT vp<[[RESUME_3:%.+]]> = extract-from-end vp<[[FOR2_SPLICE]]>, ir<1> +; CHECK-NEXT: EMIT vp<[[RESUME_2:%.+]]>.1 = extract-from-end vp<[[FOR1_SPLICE]]>, ir<1> +; CHECK-NEXT: EMIT vp<[[RESUME_3:%.+]]>.2 = extract-from-end vp<[[FOR2_SPLICE]]>, ir<1> ; CHECK-NEXT: EMIT vp<[[CMP:%.+]]> = icmp eq ir<1000>, vp<[[VTC]]> ; CHECK-NEXT: EMIT branch-on-cond vp<[[CMP]]> ; CHECK-NEXT: Successor(s): ir-bb<exit>, scalar.ph @@ -116,13 +116,13 @@ define void @test_chained_first_order_recurrences_3(ptr %ptr) { ; CHECK-EMPTY: ; CHECK-NEXT: scalar.ph ; CHECK-NEXT: EMIT vp<[[RESUME_1_P:%.*]]> = resume-phi vp<[[RESUME_1]]>, ir<22> -; CHECK-NEXT: EMIT vp<[[RESUME_2_P:%.*]]> = resume-phi vp<[[RESUME_2]]>, ir<33> -; CHECK-NEXT: EMIT vp<[[RESUME_3_P:%.*]]> = resume-phi vp<[[RESUME_3]]>, ir<33> +; CHECK-NEXT: EMIT vp<[[RESUME_2_P:%.*]]>.1 = resume-phi vp<[[RESUME_2]]>.1, ir<33> +; CHECK-NEXT: EMIT vp<[[RESUME_3_P:%.*]]>.2 = resume-phi vp<[[RESUME_3]]>.2, ir<33> ; CHECK-NEXT: No successors ; CHECK-EMPTY: ; CHECK-NEXT: Live-out i16 %for.1 = vp<[[RESUME_1_P]]> -; CHECK-NEXT: Live-out i16 %for.2 = vp<[[RESUME_2_P]]> -; CHECK-NEXT: Live-out i16 %for.3 = vp<[[RESUME_3_P]]> +; CHECK-NEXT: Live-out i16 %for.2 = vp<[[RESUME_2_P]]>.1 +; CHECK-NEXT: Live-out i16 %for.3 = vp<[[RESUME_3_P]]>.2 ; CHECK-NEXT: } ; entry: diff --git a/llvm/test/Transforms/LoopVectorize/interleaved-accesses-different-insert-position.ll b/llvm/test/Transforms/LoopVectorize/interleaved-accesses-different-insert-position.ll index 953c93756fef45c1a4c188dab93de1f6995abd83..5913bae082f1dab4cdabc09f23e9db766377176e 100644 --- a/llvm/test/Transforms/LoopVectorize/interleaved-accesses-different-insert-position.ll +++ b/llvm/test/Transforms/LoopVectorize/interleaved-accesses-different-insert-position.ll @@ -154,6 +154,92 @@ loop.latch: exit: ret void } + +; FIXME: Currently the start address of the interleave group is computed +incorrectly.
+define i64 @interleave_group_load_pointer_type(ptr %start, ptr %end) { +; CHECK-LABEL: define i64 @interleave_group_load_pointer_type( +; CHECK-SAME: ptr [[START:%.*]], ptr [[END:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[START2:%.*]] = ptrtoint ptr [[START]] to i64 +; CHECK-NEXT: [[END1:%.*]] = ptrtoint ptr [[END]] to i64 +; CHECK-NEXT: [[TMP0:%.*]] = sub i64 [[END1]], [[START2]] +; CHECK-NEXT: [[TMP1:%.*]] = udiv i64 [[TMP0]], 24 +; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[TMP2]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 4 +; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = select i1 [[TMP3]], i64 4, i64 [[N_MOD_VF]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP2]], [[TMP4]] +; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[N_VEC]], 24 +; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP5]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP12:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 24 +; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[OFFSET_IDX]], 0 +; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i64 16 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[TMP7]], i32 -8 +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <12 x ptr>, ptr [[TMP8]], align 8 +; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <12 x ptr> [[WIDE_VEC]], <12 x ptr> poison, <4 x i32> <i32 0, i32 3, i32 6, i32 9> +; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <12 x ptr> [[WIDE_VEC]], <12 x ptr> poison, <4 x i32> <i32 1, i32 4, i32 7, i32 10> +; CHECK-NEXT: [[TMP9:%.*]] = ptrtoint <4 x ptr> [[STRIDED_VEC3]] to <4 x i64> +; CHECK-NEXT: [[TMP10:%.*]] = ptrtoint <4 x ptr> [[STRIDED_VEC]] to <4 x i64> +; CHECK-NEXT: [[TMP11:%.*]] = or <4 x i64> [[TMP9]], [[TMP10]] +; CHECK-NEXT: [[TMP12]] = or <4 x i64> [[TMP11]], [[VEC_PHI]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> [[TMP12]]) +; CHECK-NEXT: br label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ [[START]], %[[ENTRY]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP14]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[RED:%.*]] = phi i64 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[RED_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[GEP_16:%.*]] = getelementptr i8, ptr [[PTR_IV]], i64 16 +; CHECK-NEXT: [[L_16:%.*]] = load ptr, ptr [[GEP_16]], align 8 +; CHECK-NEXT: [[P_16:%.*]] = ptrtoint ptr [[L_16]] to i64 +; CHECK-NEXT: [[GEP_8:%.*]] = getelementptr i8, ptr [[PTR_IV]], i64 8 +; CHECK-NEXT: [[L_8:%.*]] = load ptr, ptr [[GEP_8]], align 8 +; CHECK-NEXT: [[P_8:%.*]] = ptrtoint ptr [[L_8]] to
i64 +; CHECK-NEXT: [[OR_1:%.*]] = or i64 [[P_16]], [[P_8]] +; CHECK-NEXT: [[RED_NEXT]] = or i64 [[OR_1]], [[RED]] +; CHECK-NEXT: [[PTR_IV_NEXT]] = getelementptr nusw i8, ptr [[PTR_IV]], i64 24 +; CHECK-NEXT: [[EC:%.*]] = icmp eq ptr [[PTR_IV]], [[END]] +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi i64 [ [[RED_NEXT]], %[[LOOP]] ] +; CHECK-NEXT: ret i64 [[RED_NEXT_LCSSA]] +; +entry: + br label %loop + +loop: + %ptr.iv = phi ptr [ %start, %entry ], [ %ptr.iv.next, %loop ] + %red = phi i64 [ 0, %entry ], [ %red.next, %loop ] + %gep.16 = getelementptr i8, ptr %ptr.iv, i64 16 + %l.16 = load ptr, ptr %gep.16, align 8 + %p.16 = ptrtoint ptr %l.16 to i64 + %gep.8 = getelementptr i8, ptr %ptr.iv, i64 8 + %l.8 = load ptr, ptr %gep.8, align 8 + %p.8 = ptrtoint ptr %l.8 to i64 + %or.1 = or i64 %p.16, %p.8 + %red.next = or i64 %or.1, %red + %ptr.iv.next = getelementptr nusw i8, ptr %ptr.iv, i64 24 + %ec = icmp eq ptr %ptr.iv, %end + br i1 %ec, label %exit, label %loop + +exit: + ret i64 %red.next +} ;. ; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} ; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} @@ -161,4 +247,6 @@ exit: ; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} ; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} ; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]} +; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]} +; CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]} ;. diff --git a/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll index 2878786cb972a5186224661d5ce2caf7b5b4e1a4..54c08b47598e0fa376abcd087ba6101572672b44 100644 --- a/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll +++ b/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll @@ -113,7 +113,7 @@ define void @test_struct_array_load3_store3() { ; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <12 x i32> [[WIDE_VEC]], <12 x i32> poison, <4 x i32> ; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <12 x i32> [[WIDE_VEC]], <12 x i32> poison, <4 x i32> ; CHECK-NEXT: [[TMP0:%.*]] = add nsw <4 x i32> [[STRIDED_VEC]], -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [1024 x %struct.ST3], ptr @S, i64 0, i64 [[INDEX]] +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [1024 x %struct.ST3], ptr @S, i64 0, i64 [[INDEX]], i32 0 ; CHECK-NEXT: [[TMP1:%.*]] = add nsw <4 x i32> [[STRIDED_VEC2]], ; CHECK-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> [[STRIDED_VEC3]], ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> [[TMP1]], <8 x i32> @@ -268,7 +268,7 @@ define void @test_struct_store4(ptr noalias nocapture readonly %A, ptr noalias n ; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 [[OFFSET_IDX]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[NEXT_GEP]], align 4 ; CHECK-NEXT: [[TMP0:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_ST4:%.*]], ptr [[B:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_ST4:%.*]], ptr [[B:%.*]], i64 [[INDEX]], i32 0 ; CHECK-NEXT: [[TMP1:%.*]] = shl nsw <4 x i32> [[WIDE_LOAD]], ; CHECK-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], ; CHECK-NEXT: [[TMP3:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], @@ -443,7 +443,7 @@ define void @even_load_static_tc(ptr noalias nocapture readonly 
%A, ptr noalias ; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP1]] ; CHECK-NEXT: store i32 [[MUL]], ptr [[ARRAYIDX2]], align 4 ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 2 -; CHECK-NEXT: [[CMP:%.*]] = icmp ult i64 [[INDVARS_IV]], 1022 +; CHECK-NEXT: [[CMP:%.*]] = icmp samesign ult i64 [[INDVARS_IV]], 1022 ; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]], !llvm.loop [[LOOP13:![0-9]+]] ; entry: diff --git a/llvm/test/Transforms/LoopVectorize/multi_early_exit.ll b/llvm/test/Transforms/LoopVectorize/multi_early_exit.ll new file mode 100644 index 0000000000000000000000000000000000000000..94af5b7c7607dab635d4923042177bf6b2773e27 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/multi_early_exit.ll @@ -0,0 +1,122 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt -S < %s -p loop-vectorize | FileCheck %s + +declare void @init_mem(ptr, i64); + +define i64 @one_uncountable_two_countable_same_exit_phi_of_consts() { +; CHECK-LABEL: define i64 @one_uncountable_two_countable_same_exit_phi_of_consts() { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 1 +; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1 +; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) +; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[CMP1:%.*]] = icmp ne i64 [[INDEX]], 64 +; CHECK-NEXT: br i1 [[CMP1]], label [[SEARCH:%.*]], label [[LOOP_END:%.*]] +; CHECK: search: +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] +; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] +; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 +; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]] +; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_END]], label [[LOOP_INC]] +; CHECK: loop.inc: +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 128 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] +; CHECK: loop.end: +; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ 0, [[LOOP]] ], [ 1, [[SEARCH]] ], [ 0, [[LOOP_INC]] ] +; CHECK-NEXT: ret i64 [[RETVAL]] +; +entry: + %p1 = alloca [1024 x i8] + %p2 = alloca [1024 x i8] + call void @init_mem(ptr %p1, i64 1024) + call void @init_mem(ptr %p2, i64 1024) + br label %loop + +loop: + %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] + %cmp1 = icmp ne i64 %index, 64 + br i1 %cmp1, label %search, label %loop.end + +search: + %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index + %ld1 = load i8, ptr %arrayidx, align 1 + %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index + %ld2 = load i8, ptr %arrayidx1, align 1 + %cmp3 = icmp eq i8 %ld1, %ld2 + br i1 %cmp3, label %loop.end, label %loop.inc + +loop.inc: + %index.next = add i64 %index, 1 + %exitcond = icmp ne i64 %index.next, 128 + br i1 %exitcond, label %loop, label %loop.end + +loop.end: + %retval = phi i64 [ 0, %loop ], [ 1, %search ], [ 0, %loop.inc ] + ret i64 %retval +} + + +define i64 @one_uncountable_two_countable_diff_exit_no_phis() { +; CHECK-LABEL: define i64 @one_uncountable_two_countable_diff_exit_no_phis() { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[P1:%.*]] = 
alloca [1024 x i8], align 1 +; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1 +; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) +; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[CMP1:%.*]] = icmp ne i64 [[INDEX]], 64 +; CHECK-NEXT: br i1 [[CMP1]], label [[SEARCH:%.*]], label [[LOOP_END:%.*]] +; CHECK: search: +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] +; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] +; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 +; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]] +; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_END_EARLY:%.*]], label [[LOOP_INC]] +; CHECK: loop.inc: +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 128 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] +; CHECK: loop.end.early: +; CHECK-NEXT: ret i64 1 +; CHECK: loop.end: +; CHECK-NEXT: ret i64 0 +; +entry: + %p1 = alloca [1024 x i8] + %p2 = alloca [1024 x i8] + call void @init_mem(ptr %p1, i64 1024) + call void @init_mem(ptr %p2, i64 1024) + br label %loop + +loop: + %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] + %cmp1 = icmp ne i64 %index, 64 + br i1 %cmp1, label %search, label %loop.end + +search: + %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index + %ld1 = load i8, ptr %arrayidx, align 1 + %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index + %ld2 = load i8, ptr %arrayidx1, align 1 + %cmp3 = icmp eq i8 %ld1, %ld2 + br i1 %cmp3, label %loop.end.early, label %loop.inc + +loop.inc: + %index.next = add i64 %index, 1 + %exitcond = icmp ne i64 %index.next, 128 + br i1 %exitcond, label %loop, label %loop.end + +loop.end.early: + ret i64 1 + +loop.end: + ret i64 0 +} diff --git a/llvm/test/Transforms/LoopVectorize/multi_early_exit_live_outs.ll b/llvm/test/Transforms/LoopVectorize/multi_early_exit_live_outs.ll new file mode 100644 index 0000000000000000000000000000000000000000..7759c10032e9bd0ba26b55f9724c18525d5d9241 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/multi_early_exit_live_outs.ll @@ -0,0 +1,128 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt -S < %s -p loop-vectorize | FileCheck %s + +declare void @init_mem(ptr, i64); + +; There are multiple exit blocks - two of them have an exact representation for the +; exit-not-taken counts and the other is unknown, i.e. the "early exit". 
+define i64 @one_uncountable_two_countable_same_exit() { +; CHECK-LABEL: define i64 @one_uncountable_two_countable_same_exit() { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 1 +; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1 +; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) +; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[CMP1:%.*]] = icmp ne i64 [[INDEX]], 64 +; CHECK-NEXT: br i1 [[CMP1]], label [[SEARCH:%.*]], label [[LOOP_END:%.*]] +; CHECK: search: +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] +; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] +; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 +; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]] +; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_END]], label [[LOOP_INC]] +; CHECK: loop.inc: +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 128 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] +; CHECK: loop.end: +; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ 64, [[LOOP]] ], [ [[INDEX]], [[SEARCH]] ], [ 128, [[LOOP_INC]] ] +; CHECK-NEXT: ret i64 [[RETVAL]] +; +entry: + %p1 = alloca [1024 x i8] + %p2 = alloca [1024 x i8] + call void @init_mem(ptr %p1, i64 1024) + call void @init_mem(ptr %p2, i64 1024) + br label %loop + +loop: + %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] + %cmp1 = icmp ne i64 %index, 64 + br i1 %cmp1, label %search, label %loop.end + +search: + %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index + %ld1 = load i8, ptr %arrayidx, align 1 + %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index + %ld2 = load i8, ptr %arrayidx1, align 1 + %cmp3 = icmp eq i8 %ld1, %ld2 + br i1 %cmp3, label %loop.end, label %loop.inc + +loop.inc: + %index.next = add i64 %index, 1 + %exitcond = icmp ne i64 %index.next, 128 + br i1 %exitcond, label %loop, label %loop.end + +loop.end: + %retval = phi i64 [ 64, %loop ], [ %index, %search ], [ 128, %loop.inc ] + ret i64 %retval +} + + +define i64 @one_uncountable_two_countable_diff_exit() { +; CHECK-LABEL: define i64 @one_uncountable_two_countable_diff_exit() { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 1 +; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1 +; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) +; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[CMP1:%.*]] = icmp ne i64 [[INDEX]], 64 +; CHECK-NEXT: br i1 [[CMP1]], label [[SEARCH:%.*]], label [[LOOP_END:%.*]] +; CHECK: search: +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] +; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] +; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 +; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]] +; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_END_EARLY:%.*]], label [[LOOP_INC]] +; CHECK: loop.inc: +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 +; 
CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 128 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] +; CHECK: loop.end.early: +; CHECK-NEXT: [[RET_EARLY:%.*]] = phi i64 [ [[INDEX]], [[SEARCH]] ] +; CHECK-NEXT: ret i64 [[RET_EARLY]] +; CHECK: loop.end: +; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ 64, [[LOOP]] ], [ 128, [[LOOP_INC]] ] +; CHECK-NEXT: ret i64 [[RETVAL]] +; +entry: + %p1 = alloca [1024 x i8] + %p2 = alloca [1024 x i8] + call void @init_mem(ptr %p1, i64 1024) + call void @init_mem(ptr %p2, i64 1024) + br label %loop + +loop: + %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] + %cmp1 = icmp ne i64 %index, 64 + br i1 %cmp1, label %search, label %loop.end + +search: + %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index + %ld1 = load i8, ptr %arrayidx, align 1 + %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index + %ld2 = load i8, ptr %arrayidx1, align 1 + %cmp3 = icmp eq i8 %ld1, %ld2 + br i1 %cmp3, label %loop.end.early, label %loop.inc + +loop.inc: + %index.next = add i64 %index, 1 + %exitcond = icmp ne i64 %index.next, 128 + br i1 %exitcond, label %loop, label %loop.end + +loop.end.early: + %ret.early = phi i64 [ %index, %search ] + ret i64 %ret.early + +loop.end: + %retval = phi i64 [ 64, %loop ], [ 128, %loop.inc ] + ret i64 %retval +} diff --git a/llvm/test/Transforms/LoopVectorize/single_early_exit.ll b/llvm/test/Transforms/LoopVectorize/single_early_exit.ll new file mode 100644 index 0000000000000000000000000000000000000000..52f82d007de4dfa24808ac79104a8a2238cd82fd --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/single_early_exit.ll @@ -0,0 +1,220 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt -S < %s -p loop-vectorize | FileCheck %s + +declare void @init_mem(ptr, i64); + + +define i64 @same_exit_block_phi_of_consts() { +; CHECK-LABEL: define i64 @same_exit_block_phi_of_consts() { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 1 +; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1 +; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) +; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] +; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] +; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 +; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]] +; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END:%.*]] +; CHECK: loop.inc: +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] +; CHECK: loop.end: +; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ 0, [[LOOP]] ], [ 1, [[LOOP_INC]] ] +; CHECK-NEXT: ret i64 [[RETVAL]] +; +entry: + %p1 = alloca [1024 x i8] + %p2 = alloca [1024 x i8] + call void @init_mem(ptr %p1, i64 1024) + call void @init_mem(ptr %p2, i64 1024) + br label %loop + +loop: + %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] + %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index + %ld1 = load i8, ptr %arrayidx, align 1 + %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index + %ld2 = load i8, ptr 
%arrayidx1, align 1 + %cmp3 = icmp eq i8 %ld1, %ld2 + br i1 %cmp3, label %loop.inc, label %loop.end + +loop.inc: + %index.next = add i64 %index, 1 + %exitcond = icmp ne i64 %index.next, 67 + br i1 %exitcond, label %loop, label %loop.end + +loop.end: + %retval = phi i64 [ 0, %loop ], [ 1, %loop.inc ] + ret i64 %retval +} + + +define i64 @diff_exit_block_phi_of_consts() { +; CHECK-LABEL: define i64 @diff_exit_block_phi_of_consts() { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 1 +; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1 +; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) +; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] +; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] +; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 +; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]] +; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_EARLY_EXIT:%.*]] +; CHECK: loop.inc: +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END:%.*]] +; CHECK: loop.early.exit: +; CHECK-NEXT: ret i64 0 +; CHECK: loop.end: +; CHECK-NEXT: ret i64 1 +; +entry: + %p1 = alloca [1024 x i8] + %p2 = alloca [1024 x i8] + call void @init_mem(ptr %p1, i64 1024) + call void @init_mem(ptr %p2, i64 1024) + br label %loop + +loop: + %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] + %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index + %ld1 = load i8, ptr %arrayidx, align 1 + %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index + %ld2 = load i8, ptr %arrayidx1, align 1 + %cmp3 = icmp eq i8 %ld1, %ld2 + br i1 %cmp3, label %loop.inc, label %loop.early.exit + +loop.inc: + %index.next = add i64 %index, 1 + %exitcond = icmp ne i64 %index.next, 67 + br i1 %exitcond, label %loop, label %loop.end + +loop.early.exit: + ret i64 0 + +loop.end: + ret i64 1 +} + + +; The form of the induction variables requires SCEV predicates. 
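+; (Sketch of why: the i8 IV %ind is zero-extended before the compare against
+; %end.clamped, which can exceed 255, so SCEV can only bound the
+; backedge-taken count by versioning the loop on a no-wrap predicate for the
+; narrow IV; the old -debug output reported the resulting predicated i10
+; symbolic count.)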
+define i32 @diff_exit_block_needs_scev_check(i32 %end) { +; CHECK-LABEL: define i32 @diff_exit_block_needs_scev_check( +; CHECK-SAME: i32 [[END:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i32], align 4 +; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i32], align 4 +; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) +; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) +; CHECK-NEXT: [[END_CLAMPED:%.*]] = and i32 [[END]], 1023 +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[IND:%.*]] = phi i8 [ [[IND_NEXT:%.*]], [[FOR_INC:%.*]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[GEP_IND:%.*]] = phi i64 [ [[GEP_IND_NEXT:%.*]], [[FOR_INC]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[P1]], i64 [[GEP_IND]] +; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX1]], align 4 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[P2]], i64 [[GEP_IND]] +; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[CMP_EARLY:%.*]] = icmp eq i32 [[TMP0]], [[TMP1]] +; CHECK-NEXT: br i1 [[CMP_EARLY]], label [[FOUND:%.*]], label [[FOR_INC]] +; CHECK: for.inc: +; CHECK-NEXT: [[IND_NEXT]] = add i8 [[IND]], 1 +; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[IND_NEXT]] to i32 +; CHECK-NEXT: [[GEP_IND_NEXT]] = add i64 [[GEP_IND]], 1 +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[CONV]], [[END_CLAMPED]] +; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[EXIT:%.*]] +; CHECK: found: +; CHECK-NEXT: ret i32 1 +; CHECK: exit: +; CHECK-NEXT: ret i32 0 +; +entry: + %p1 = alloca [1024 x i32] + %p2 = alloca [1024 x i32] + call void @init_mem(ptr %p1, i64 1024) + call void @init_mem(ptr %p2, i64 1024) + %end.clamped = and i32 %end, 1023 + br label %for.body + +for.body: + %ind = phi i8 [ %ind.next, %for.inc ], [ 0, %entry ] + %gep.ind = phi i64 [ %gep.ind.next, %for.inc ], [ 0, %entry ] + %arrayidx1 = getelementptr inbounds i32, ptr %p1, i64 %gep.ind + %0 = load i32, ptr %arrayidx1, align 4 + %arrayidx2 = getelementptr inbounds i32, ptr %p2, i64 %gep.ind + %1 = load i32, ptr %arrayidx2, align 4 + %cmp.early = icmp eq i32 %0, %1 + br i1 %cmp.early, label %found, label %for.inc + +for.inc: + %ind.next = add i8 %ind, 1 + %conv = zext i8 %ind.next to i32 + %gep.ind.next = add i64 %gep.ind, 1 + %cmp = icmp ult i32 %conv, %end.clamped + br i1 %cmp, label %for.body, label %exit + +found: + ret i32 1 + +exit: + ret i32 0 +} + + +declare void @abort() + +; This is a variant of an early exit loop where the condition for leaving +; early is loop invariant. 
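+; (Note: %cond is computed once from a load in the entry block, so every
+; iteration takes the same branch; the loop either calls @abort on the first
+; iteration or runs the full countable range from -10 up to 266.)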
+define i32 @diff_blocks_invariant_early_exit_cond(ptr %s) { +; CHECK-LABEL: define i32 @diff_blocks_invariant_early_exit_cond( +; CHECK-SAME: ptr [[S:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SVAL:%.*]] = load i32, ptr [[S]], align 4 +; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[SVAL]], 0 +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[IND:%.*]] = phi i32 [ -10, [[ENTRY:%.*]] ], [ [[IND_NEXT:%.*]], [[FOR_INC:%.*]] ] +; CHECK-NEXT: br i1 [[COND]], label [[FOR_INC]], label [[EARLY_EXIT:%.*]] +; CHECK: for.inc: +; CHECK-NEXT: [[IND_NEXT]] = add nsw i32 [[IND]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[IND_NEXT]], 266 +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]] +; CHECK: early.exit: +; CHECK-NEXT: tail call void @abort() +; CHECK-NEXT: unreachable +; CHECK: for.end: +; CHECK-NEXT: ret i32 0 +; +entry: + %sval = load i32, ptr %s, align 4 + %cond = icmp eq i32 %sval, 0 + br label %for.body + +for.body: + %ind = phi i32 [ -10, %entry ], [ %ind.next, %for.inc ] + br i1 %cond, label %for.inc, label %early.exit + +for.inc: + %ind.next = add nsw i32 %ind, 1 + %exitcond.not = icmp eq i32 %ind.next, 266 + br i1 %exitcond.not, label %for.end, label %for.body + +early.exit: + tail call void @abort() + unreachable + +for.end: + ret i32 0 +} diff --git a/llvm/test/Transforms/LoopVectorize/simple_early_exit.ll b/llvm/test/Transforms/LoopVectorize/single_early_exit_live_outs.ll similarity index 50% rename from llvm/test/Transforms/LoopVectorize/simple_early_exit.ll rename to llvm/test/Transforms/LoopVectorize/single_early_exit_live_outs.ll index 49454ae18db79d21859b28ee3c0a3239b49be2d3..7889191c4b5baec5356b6493b871325ce3098a35 100644 --- a/llvm/test/Transforms/LoopVectorize/simple_early_exit.ll +++ b/llvm/test/Transforms/LoopVectorize/single_early_exit_live_outs.ll @@ -1,15 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 -; REQUIRES: asserts -; RUN: opt -S < %s -p loop-vectorize -debug-only=loop-vectorize 2>%t | FileCheck %s --check-prefixes=CHECK -; RUN: cat %t | FileCheck %s --check-prefix=DEBUG +; RUN: opt -S < %s -p loop-vectorize | FileCheck %s declare void @init_mem(ptr, i64); define i64 @same_exit_block_pre_inc_use1() { -; DEBUG-LABEL: LV: Checking a loop in 'same_exit_block_pre_inc_use1' -; DEBUG: LV: Found an early exit loop with symbolic max backedge taken count: 63 -; DEBUG-NEXT: LV: We can vectorize this loop! -; DEBUG-NEXT: LV: Not vectorizing: Auto-vectorization of loops with uncountable early exit is not yet supported. 
; CHECK-LABEL: define i64 @same_exit_block_pre_inc_use1() { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 1 @@ -60,6 +54,60 @@ loop.end: } +define i64 @same_exit_block_pre_inc1_use_inv_cond(i1 %cond) { +; CHECK-LABEL: define i64 @same_exit_block_pre_inc1_use_inv_cond( +; CHECK-SAME: i1 [[COND:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 1 +; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1 +; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) +; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] +; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] +; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 +; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]] +; CHECK-NEXT: [[CMP4:%.*]] = select i1 [[COND]], i1 [[CMP3]], i1 false +; CHECK-NEXT: br i1 [[CMP4]], label [[LOOP_INC]], label [[LOOP_END:%.*]] +; CHECK: loop.inc: +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] +; CHECK: loop.end: +; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ 67, [[LOOP_INC]] ] +; CHECK-NEXT: ret i64 [[RETVAL]] +; +entry: + %p1 = alloca [1024 x i8] + %p2 = alloca [1024 x i8] + call void @init_mem(ptr %p1, i64 1024) + call void @init_mem(ptr %p2, i64 1024) + br label %loop + +loop: + %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] + %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index + %ld1 = load i8, ptr %arrayidx, align 1 + %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index + %ld2 = load i8, ptr %arrayidx1, align 1 + %cmp3 = icmp eq i8 %ld1, %ld2 + %cmp4 = select i1 %cond, i1 %cmp3, i1 false + br i1 %cmp4, label %loop.inc, label %loop.end + +loop.inc: + %index.next = add i64 %index, 1 + %exitcond = icmp ne i64 %index.next, 67 + br i1 %exitcond, label %loop, label %loop.end + +loop.end: + %retval = phi i64 [ %index, %loop ], [ 67, %loop.inc ] + ret i64 %retval +} + + define i64 @same_exit_block_pre_inc_use1_gep_two_indices() { ; CHECK-LABEL: define i64 @same_exit_block_pre_inc_use1_gep_two_indices() { ; CHECK-NEXT: entry: @@ -212,6 +260,7 @@ loop.end: ret i64 %retval } + define i64 @same_exit_block_pre_inc_use3() { ; CHECK-LABEL: define i64 @same_exit_block_pre_inc_use3() { ; CHECK-NEXT: entry: @@ -311,7 +360,6 @@ loop.end: } - define i64 @same_exit_block_post_inc_use() { ; CHECK-LABEL: define i64 @same_exit_block_post_inc_use() { ; CHECK-NEXT: entry: @@ -362,6 +410,7 @@ loop.end: ret i64 %retval } + define i64 @same_exit_block_post_inc_use2() { ; CHECK-LABEL: define i64 @same_exit_block_post_inc_use2() { ; CHECK-NEXT: entry: @@ -412,56 +461,6 @@ loop.end: ret i64 %retval } -define i64 @same_exit_block_phi_of_consts() { -; CHECK-LABEL: define i64 @same_exit_block_phi_of_consts() { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 1 -; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1 -; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) -; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) -; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[INDEX:%.*]] = phi 
i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] -; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] -; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 -; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]] -; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END:%.*]] -; CHECK: loop.inc: -; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] -; CHECK: loop.end: -; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ 0, [[LOOP]] ], [ 1, [[LOOP_INC]] ] -; CHECK-NEXT: ret i64 [[RETVAL]] -; -entry: - %p1 = alloca [1024 x i8] - %p2 = alloca [1024 x i8] - call void @init_mem(ptr %p1, i64 1024) - call void @init_mem(ptr %p2, i64 1024) - br label %loop - -loop: - %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] - %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index - %ld1 = load i8, ptr %arrayidx, align 1 - %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index - %ld2 = load i8, ptr %arrayidx1, align 1 - %cmp3 = icmp eq i8 %ld1, %ld2 - br i1 %cmp3, label %loop.inc, label %loop.end - -loop.inc: - %index.next = add i64 %index, 1 - %exitcond = icmp ne i64 %index.next, 67 - br i1 %exitcond, label %loop, label %loop.end - -loop.end: - %retval = phi i64 [ 0, %loop ], [ 1, %loop.inc ] - ret i64 %retval -} - define i64 @diff_exit_block_pre_inc_use1() { ; CHECK-LABEL: define i64 @diff_exit_block_pre_inc_use1() { @@ -520,6 +519,7 @@ loop.end: ret i64 %retval2 } + define i64 @diff_exit_block_pre_inc_use2() { ; CHECK-LABEL: define i64 @diff_exit_block_pre_inc_use2() { ; CHECK-NEXT: entry: @@ -577,6 +577,7 @@ loop.end: ret i64 %retval2 } + define i64 @diff_exit_block_pre_inc_use3() { ; CHECK-LABEL: define i64 @diff_exit_block_pre_inc_use3() { ; CHECK-NEXT: entry: @@ -633,8 +634,8 @@ loop.end: } -define i64 @diff_exit_block_phi_of_consts() { -; CHECK-LABEL: define i64 @diff_exit_block_phi_of_consts() { +define i64 @diff_exit_block_post_inc_use1() { +; CHECK-LABEL: define i64 @diff_exit_block_post_inc_use1() { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 1 ; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1 @@ -654,9 +655,11 @@ define i64 @diff_exit_block_phi_of_consts() { ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 ; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END:%.*]] ; CHECK: loop.early.exit: -; CHECK-NEXT: ret i64 0 +; CHECK-NEXT: [[RETVAL1:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ] +; CHECK-NEXT: ret i64 [[RETVAL1]] ; CHECK: loop.end: -; CHECK-NEXT: ret i64 1 +; CHECK-NEXT: [[RETVAL2:%.*]] = phi i64 [ [[INDEX_NEXT]], [[LOOP_INC]] ] +; CHECK-NEXT: ret i64 [[RETVAL2]] ; entry: %p1 = alloca [1024 x i8] @@ -680,15 +683,17 @@ loop.inc: br i1 %exitcond, label %loop, label %loop.end loop.early.exit: - ret i64 0 + %retval1 = phi i64 [ %index, %loop ] + ret i64 %retval1 loop.end: - ret i64 1 + %retval2 = phi i64 [ %index.next, %loop.inc ] + ret i64 %retval2 } -define i64 @diff_exit_block_post_inc_use1() { -; CHECK-LABEL: define i64 @diff_exit_block_post_inc_use1() { +define i64 @diff_exit_block_post_inc_use2() { +; CHECK-LABEL: define i64 @diff_exit_block_post_inc_use2() { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 1 ; CHECK-NEXT: [[P2:%.*]] = alloca 
[1024 x i8], align 1 @@ -697,6 +702,7 @@ define i64 @diff_exit_block_post_inc_use1() { ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] ; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 ; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] @@ -704,14 +710,13 @@ define i64 @diff_exit_block_post_inc_use1() { ; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]] ; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_EARLY_EXIT:%.*]] ; CHECK: loop.inc: -; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 ; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END:%.*]] ; CHECK: loop.early.exit: -; CHECK-NEXT: [[RETVAL1:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ] +; CHECK-NEXT: [[RETVAL1:%.*]] = phi i64 [ [[INDEX_NEXT]], [[LOOP]] ] ; CHECK-NEXT: ret i64 [[RETVAL1]] ; CHECK: loop.end: -; CHECK-NEXT: [[RETVAL2:%.*]] = phi i64 [ [[INDEX_NEXT]], [[LOOP_INC]] ] +; CHECK-NEXT: [[RETVAL2:%.*]] = phi i64 [ [[INDEX]], [[LOOP_INC]] ] ; CHECK-NEXT: ret i64 [[RETVAL2]] ; entry: @@ -723,6 +728,7 @@ entry: loop: %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] + %index.next = add i64 %index, 1 %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index %ld1 = load i8, ptr %arrayidx, align 1 %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index @@ -731,22 +737,21 @@ loop: br i1 %cmp3, label %loop.inc, label %loop.early.exit loop.inc: - %index.next = add i64 %index, 1 %exitcond = icmp ne i64 %index.next, 67 br i1 %exitcond, label %loop, label %loop.end loop.early.exit: - %retval1 = phi i64 [ %index, %loop ] + %retval1 = phi i64 [ %index.next, %loop ] ret i64 %retval1 loop.end: - %retval2 = phi i64 [ %index.next, %loop.inc ] + %retval2 = phi i64 [ %index, %loop.inc ] ret i64 %retval2 } -define i64 @diff_exit_block_post_inc_use2() { -; CHECK-LABEL: define i64 @diff_exit_block_post_inc_use2() { +define i64 @loop_contains_safe_call() { +; CHECK-LABEL: define i64 @loop_contains_safe_call() { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 1 ; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1 @@ -755,22 +760,18 @@ define i64 @diff_exit_block_post_inc_use2() { ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] -; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] -; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 -; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]] -; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_EARLY_EXIT:%.*]] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[P1]], i64 [[INDEX]] +; CHECK-NEXT: [[LD1:%.*]] = load float, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[SQRT:%.*]] = tail call fast float @llvm.sqrt.f32(float [[LD1]]) +; CHECK-NEXT: [[CMP:%.*]] = fcmp fast ult float [[SQRT]], 3.000000e+00 +; CHECK-NEXT: br i1 [[CMP]], label [[LOOP_INC]], label [[LOOP_END:%.*]] ; CHECK: loop.inc: +; CHECK-NEXT: 
[[INDEX_NEXT]] = add i64 [[INDEX]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END:%.*]] -; CHECK: loop.early.exit: -; CHECK-NEXT: [[RETVAL1:%.*]] = phi i64 [ [[INDEX_NEXT]], [[LOOP]] ] -; CHECK-NEXT: ret i64 [[RETVAL1]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] ; CHECK: loop.end: -; CHECK-NEXT: [[RETVAL2:%.*]] = phi i64 [ [[INDEX]], [[LOOP_INC]] ] -; CHECK-NEXT: ret i64 [[RETVAL2]] +; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ 67, [[LOOP_INC]] ] +; CHECK-NEXT: ret i64 [[RETVAL]] ; entry: %p1 = alloca [1024 x i8] @@ -781,55 +782,45 @@ entry: loop: %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] - %index.next = add i64 %index, 1 - %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index - %ld1 = load i8, ptr %arrayidx, align 1 - %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index - %ld2 = load i8, ptr %arrayidx1, align 1 - %cmp3 = icmp eq i8 %ld1, %ld2 - br i1 %cmp3, label %loop.inc, label %loop.early.exit + %arrayidx = getelementptr inbounds float, ptr %p1, i64 %index + %ld1 = load float, ptr %arrayidx, align 1 + %sqrt = tail call fast float @llvm.sqrt.f32(float %ld1) + %cmp = fcmp fast ult float %sqrt, 3.0e+00 + br i1 %cmp, label %loop.inc, label %loop.end loop.inc: + %index.next = add i64 %index, 1 %exitcond = icmp ne i64 %index.next, 67 br i1 %exitcond, label %loop, label %loop.end -loop.early.exit: - %retval1 = phi i64 [ %index.next, %loop ] - ret i64 %retval1 - loop.end: - %retval2 = phi i64 [ %index, %loop.inc ] - ret i64 %retval2 + %retval = phi i64 [ %index, %loop ], [ 67, %loop.inc ] + ret i64 %retval } -; The early exit (i.e. unknown exit-not-taken count) is the latch - we don't -; support this yet. -define i64 @early_exit_on_last_block() { -; DEBUG-LABEL: LV: Checking a loop in 'early_exit_on_last_block' -; DEBUG: LV: Not vectorizing: Early exit is not the latch predecessor. 
-; CHECK-LABEL: define i64 @early_exit_on_last_block() { +define i64 @loop_contains_safe_div() { +; CHECK-LABEL: define i64 @loop_contains_safe_div() { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 1 ; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1 ; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) ; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) -; CHECK-NEXT: br label [[LAND_RHS:%.*]] +; CHECK-NEXT: br label [[LOOP1:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[SEARCH:%.*]] ], [ 3, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 -; CHECK-NEXT: [[CMP1:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 -; CHECK-NEXT: br i1 [[CMP1]], label [[SEARCH]], label [[FOR_END_LOOPEXIT:%.*]] -; CHECK: search: -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] -; CHECK-NEXT: [[TMP41:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] -; CHECK-NEXT: [[TMP42:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 -; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[TMP41]], [[TMP42]] -; CHECK-NEXT: br i1 [[CMP3]], label [[FOR_END_LOOPEXIT]], label [[LAND_RHS]] +; CHECK-NEXT: [[INDEX2:%.*]] = phi i64 [ [[INDEX_NEXT1:%.*]], [[LOOP_INC1:%.*]] ], [ 3, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[P1]], i64 [[INDEX2]] +; CHECK-NEXT: [[LD1:%.*]] = load i32, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[DIV:%.*]] = udiv i32 [[LD1]], 20000 +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[DIV]], 1 +; CHECK-NEXT: br i1 [[CMP]], label [[LOOP_INC1]], label [[LOOP_END:%.*]] +; CHECK: loop.inc: +; CHECK-NEXT: [[INDEX_NEXT1]] = add i64 [[INDEX2]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT1]], 67 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP1]], label [[LOOP_END]] ; CHECK: loop.end: -; CHECK-NEXT: [[START_0_LCSSA:%.*]] = phi i64 [ 64, [[LAND_RHS]] ], [ [[INDEX]], [[SEARCH]] ] -; CHECK-NEXT: ret i64 [[START_0_LCSSA]] +; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX2]], [[LOOP1]] ], [ 67, [[LOOP_INC1]] ] +; CHECK-NEXT: ret i64 [[RETVAL]] ; entry: %p1 = alloca [1024 x i8] @@ -839,642 +830,88 @@ entry: br label %loop loop: - %index = phi i64 [ %index.next, %search ], [ 3, %entry ] + %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] + %arrayidx = getelementptr inbounds i32, ptr %p1, i64 %index + %ld1 = load i32, ptr %arrayidx, align 1 + %div = udiv i32 %ld1, 20000 + %cmp = icmp eq i32 %div, 1 + br i1 %cmp, label %loop.inc, label %loop.end + +loop.inc: %index.next = add i64 %index, 1 %exitcond = icmp ne i64 %index.next, 67 - br i1 %exitcond, label %search, label %loop.end - -search: - %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index - %ld1 = load i8, ptr %arrayidx, align 1 - %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index - %ld2 = load i8, ptr %arrayidx1, align 1 - %cmp3 = icmp eq i8 %ld1, %ld2 - br i1 %cmp3, label %loop.end, label %loop + br i1 %exitcond, label %loop, label %loop.end loop.end: - %retval = phi i64 [ 64, %loop ], [ %index, %search ] + %retval = phi i64 [ %index, %loop ], [ 67, %loop.inc ] ret i64 %retval } -; There are multiple exit blocks - two of them have an exact representation for the -; exit-not-taken counts and the other is unknown, i.e. the "early exit". 
-define i64 @multiple_exits_one_early() { -; CHECK-LABEL: define i64 @multiple_exits_one_early() { +define i64 @loop_contains_load_after_early_exit(ptr dereferenceable(1024) align(8) %p2) { +; CHECK-LABEL: define i64 @loop_contains_load_after_early_exit( +; CHECK-SAME: ptr align 8 dereferenceable(1024) [[P2:%.*]]) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 1 -; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1 ; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) -; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[CMP1:%.*]] = icmp ne i64 [[INDEX]], 64 -; CHECK-NEXT: br i1 [[CMP1]], label [[SEARCH:%.*]], label [[LOOP_END:%.*]] -; CHECK: search: -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] -; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] -; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 -; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]] -; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_END]], label [[LOOP_INC]] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[P1]], i64 [[INDEX]] +; CHECK-NEXT: [[LD1:%.*]] = load i32, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[LD1]], 1 +; CHECK-NEXT: br i1 [[CMP]], label [[LOOP_INC]], label [[LOOP_END:%.*]] ; CHECK: loop.inc: +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i64, ptr [[P2]], i64 [[INDEX]] +; CHECK-NEXT: [[LD2:%.*]] = load i64, ptr [[ARRAYIDX2]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 128 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 ; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] ; CHECK: loop.end: -; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ 64, [[LOOP]] ], [ [[INDEX]], [[SEARCH]] ], [ 128, [[LOOP_INC]] ] +; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ [[LD2]], [[LOOP_INC]] ] ; CHECK-NEXT: ret i64 [[RETVAL]] ; entry: %p1 = alloca [1024 x i8] - %p2 = alloca [1024 x i8] call void @init_mem(ptr %p1, i64 1024) - call void @init_mem(ptr %p2, i64 1024) br label %loop loop: %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] - %cmp1 = icmp ne i64 %index, 64 - br i1 %cmp1, label %search, label %loop.end - -search: - %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index - %ld1 = load i8, ptr %arrayidx, align 1 - %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index - %ld2 = load i8, ptr %arrayidx1, align 1 - %cmp3 = icmp eq i8 %ld1, %ld2 - br i1 %cmp3, label %loop.end, label %loop.inc + %arrayidx = getelementptr inbounds i32, ptr %p1, i64 %index + %ld1 = load i32, ptr %arrayidx, align 1 + %cmp = icmp eq i32 %ld1, 1 + br i1 %cmp, label %loop.inc, label %loop.end loop.inc: + %arrayidx2 = getelementptr inbounds i64, ptr %p2, i64 %index + %ld2 = load i64, ptr %arrayidx2, align 8 %index.next = add i64 %index, 1 - %exitcond = icmp ne i64 %index.next, 128 + %exitcond = icmp ne i64 %index.next, 67 br i1 %exitcond, label %loop, label %loop.end loop.end: - %retval = phi i64 [ 64, %loop ], [ %index, %search ], [ 128, %loop.inc ] + %retval = phi i64 [ %index, %loop ], [ %ld2, %loop.inc ] ret i64 %retval } -; We don't currently support multiple early exits. 
-define i64 @multiple_early_exits() { -; DEBUG-LABEL: LV: Checking a loop in 'multiple_early_exits' -; DEBUG: LV: Not vectorizing: Loop has too many uncountable exits. -; CHECK-LABEL: define i64 @multiple_early_exits() { +define i64 @same_exit_block_pre_inc_use1_reverse() { +; CHECK-LABEL: define i64 @same_exit_block_pre_inc_use1_reverse() { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 1 ; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1 ; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) ; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) -; CHECK-NEXT: br label [[LAND_RHS:%.*]] -; CHECK: search1: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[FOR_INC1:%.*]] ], [ 3, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 1023, [[ENTRY:%.*]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] -; CHECK-NEXT: [[TMP41:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 ; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] -; CHECK-NEXT: [[TMP42:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 -; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[TMP41]], [[TMP42]] -; CHECK-NEXT: br i1 [[CMP3]], label [[FOR_END_LOOPEXIT:%.*]], label [[FOR_INC:%.*]] -; CHECK: search2: -; CHECK-NEXT: [[CMP2:%.*]] = icmp ult i8 [[TMP41]], 34 -; CHECK-NEXT: br i1 [[CMP2]], label [[FOR_END_LOOPEXIT]], label [[FOR_INC1]] -; CHECK: loop.inc: -; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[LAND_RHS]], label [[FOR_END_LOOPEXIT]] -; CHECK: loop.end: -; CHECK-NEXT: [[START_0_LCSSA:%.*]] = phi i64 [ [[INDEX]], [[LAND_RHS]] ], [ 100, [[FOR_INC]] ], [ 43, [[FOR_INC1]] ] -; CHECK-NEXT: ret i64 [[START_0_LCSSA]] -; -entry: - %p1 = alloca [1024 x i8] - %p2 = alloca [1024 x i8] - call void @init_mem(ptr %p1, i64 1024) - call void @init_mem(ptr %p2, i64 1024) - br label %search1 - -search1: - %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] - %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index - %ld1 = load i8, ptr %arrayidx, align 1 - %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index - %ld2 = load i8, ptr %arrayidx1, align 1 - %cmp1 = icmp eq i8 %ld1, %ld2 - br i1 %cmp1, label %loop.end, label %search2 - -search2: - %cmp2 = icmp ult i8 %ld1, 34 - br i1 %cmp2, label %loop.end, label %loop.inc - -loop.inc: - %index.next = add i64 %index, 1 - %exitcond = icmp ne i64 %index.next, 67 - br i1 %exitcond, label %search1, label %loop.end - -loop.end: - %retval = phi i64 [ %index, %search1 ], [ 100, %search2 ], [ 43, %loop.inc ] - ret i64 %retval -} - - -define i64 @early_exit_infinite_loop() { -; DEBUG-LABEL: LV: Checking a loop in 'early_exit_infinite_loop' -; DEBUG: LV: Not vectorizing: Cannot determine exact exit count for latch block. 
-; CHECK-LABEL: define i64 @early_exit_infinite_loop() { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 1 -; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1 -; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) -; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) -; CHECK-NEXT: br label [[LAND_RHS:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[FOR_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] -; CHECK-NEXT: [[TMP38:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] -; CHECK-NEXT: [[TMP39:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 -; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[TMP38]], [[TMP39]] -; CHECK-NEXT: br i1 [[CMP3]], label [[FOR_INC]], label [[FOR_END_LOOPEXIT:%.*]] -; CHECK: loop.inc: -; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 -; CHECK-NEXT: br label [[LAND_RHS]] -; CHECK: loop.end: -; CHECK-NEXT: [[START_0_LCSSA:%.*]] = phi i64 [ [[INDEX]], [[LAND_RHS]] ] -; CHECK-NEXT: ret i64 [[START_0_LCSSA]] -; -entry: - %p1 = alloca [1024 x i8] - %p2 = alloca [1024 x i8] - call void @init_mem(ptr %p1, i64 1024) - call void @init_mem(ptr %p2, i64 1024) - br label %loop - -loop: - %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] - %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index - %ld1 = load i8, ptr %arrayidx, align 1 - %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index - %ld2 = load i8, ptr %arrayidx1, align 1 - %cmp3 = icmp eq i8 %ld1, %ld2 - br i1 %cmp3, label %loop.inc, label %loop.end - -loop.inc: - %index.next = add i64 %index, 1 - %exitcond = icmp ne i64 %index.next, 67 - br label %loop - -loop.end: - %retval = phi i64 [ %index, %loop ] - ret i64 %retval -} - - -define i64 @same_exit_block_pre_inc_use_inv_cond(i1 %cond) { -; CHECK-LABEL: define i64 @same_exit_block_pre_inc_use_inv_cond( -; CHECK-SAME: i1 [[COND:%.*]]) { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 1 -; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1 -; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) -; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) -; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] -; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] -; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 -; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]] -; CHECK-NEXT: [[CMP4:%.*]] = select i1 [[COND]], i1 [[CMP3]], i1 false -; CHECK-NEXT: br i1 [[CMP4]], label [[LOOP_INC]], label [[LOOP_END:%.*]] -; CHECK: loop.inc: -; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] -; CHECK: loop.end: -; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ 67, [[LOOP_INC]] ] -; CHECK-NEXT: ret i64 [[RETVAL]] -; -entry: - %p1 = alloca [1024 x i8] - %p2 = alloca [1024 x i8] - call void @init_mem(ptr %p1, i64 1024) - call void @init_mem(ptr %p2, i64 1024) - br label %loop - -loop: - %index = phi i64 [ %index.next, 
%loop.inc ], [ 3, %entry ] - %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index - %ld1 = load i8, ptr %arrayidx, align 1 - %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index - %ld2 = load i8, ptr %arrayidx1, align 1 - %cmp3 = icmp eq i8 %ld1, %ld2 - %cmp4 = select i1 %cond, i1 %cmp3, i1 false - br i1 %cmp4, label %loop.inc, label %loop.end - -loop.inc: - %index.next = add i64 %index, 1 - %exitcond = icmp ne i64 %index.next, 67 - br i1 %exitcond, label %loop, label %loop.end - -loop.end: - %retval = phi i64 [ %index, %loop ], [ 67, %loop.inc ] - ret i64 %retval -} - - -define i64 @loop_contains_safe_call() { -; DEBUG-LABEL: LV: Checking a loop in 'loop_contains_safe_call' -; DEBUG: LV: Found an early exit loop with symbolic max backedge taken count: 63 -; DEBUG-NEXT: LV: We can vectorize this loop! -; CHECK-LABEL: define i64 @loop_contains_safe_call() { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 1 -; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1 -; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) -; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) -; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[P1]], i64 [[INDEX]] -; CHECK-NEXT: [[LD1:%.*]] = load float, ptr [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[SQRT:%.*]] = tail call fast float @llvm.sqrt.f32(float [[LD1]]) -; CHECK-NEXT: [[CMP:%.*]] = fcmp fast ult float [[SQRT]], 3.000000e+00 -; CHECK-NEXT: br i1 [[CMP]], label [[LOOP_INC]], label [[LOOP_END:%.*]] -; CHECK: loop.inc: -; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] -; CHECK: loop.end: -; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ 67, [[LOOP_INC]] ] -; CHECK-NEXT: ret i64 [[RETVAL]] -; -entry: - %p1 = alloca [1024 x i8] - %p2 = alloca [1024 x i8] - call void @init_mem(ptr %p1, i64 1024) - call void @init_mem(ptr %p2, i64 1024) - br label %loop - -loop: - %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] - %arrayidx = getelementptr inbounds float, ptr %p1, i64 %index - %ld1 = load float, ptr %arrayidx, align 1 - %sqrt = tail call fast float @llvm.sqrt.f32(float %ld1) - %cmp = fcmp fast ult float %sqrt, 3.0e+00 - br i1 %cmp, label %loop.inc, label %loop.end - -loop.inc: - %index.next = add i64 %index, 1 - %exitcond = icmp ne i64 %index.next, 67 - br i1 %exitcond, label %loop, label %loop.end - -loop.end: - %retval = phi i64 [ %index, %loop ], [ 67, %loop.inc ] - ret i64 %retval -} - - -define i64 @loop_contains_unsafe_call() { -; DEBUG-LABEL: LV: Checking a loop in 'loop_contains_unsafe_call' -; DEBUG: LV: Not vectorizing: Early exit loop contains operations that cannot be speculatively executed. 
-; CHECK-LABEL: define i64 @loop_contains_unsafe_call() { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 1 -; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1 -; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) -; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) -; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[P1]], i64 [[INDEX]] -; CHECK-NEXT: [[LD1:%.*]] = load i32, ptr [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[BAD_CALL:%.*]] = call i32 @foo(i32 [[LD1]]) #[[ATTR2:[0-9]+]] -; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[BAD_CALL]], 34 -; CHECK-NEXT: br i1 [[CMP]], label [[LOOP_INC]], label [[LOOP_END:%.*]] -; CHECK: loop.inc: -; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] -; CHECK: loop.end: -; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ 67, [[LOOP_INC]] ] -; CHECK-NEXT: ret i64 [[RETVAL]] -; -entry: - %p1 = alloca [1024 x i8] - %p2 = alloca [1024 x i8] - call void @init_mem(ptr %p1, i64 1024) - call void @init_mem(ptr %p2, i64 1024) - br label %loop - -loop: - %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] - %arrayidx = getelementptr inbounds i32, ptr %p1, i64 %index - %ld1 = load i32, ptr %arrayidx, align 1 - %bad_call = call i32 @foo(i32 %ld1) #0 - %cmp = icmp eq i32 %bad_call, 34 - br i1 %cmp, label %loop.inc, label %loop.end - -loop.inc: - %index.next = add i64 %index, 1 - %exitcond = icmp ne i64 %index.next, 67 - br i1 %exitcond, label %loop, label %loop.end - -loop.end: - %retval = phi i64 [ %index, %loop ], [ 67, %loop.inc ] - ret i64 %retval -} - - -define i64 @loop_contains_safe_div() { -; DEBUG-LABEL: LV: Checking a loop in 'loop_contains_safe_div' -; DEBUG: LV: Found an early exit loop with symbolic max backedge taken count: 63 -; DEBUG-NEXT: LV: We can vectorize this loop! 
-; CHECK-LABEL: define i64 @loop_contains_safe_div() { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 1 -; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1 -; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) -; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) -; CHECK-NEXT: br label [[LOOP1:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[INDEX2:%.*]] = phi i64 [ [[INDEX_NEXT1:%.*]], [[LOOP_INC1:%.*]] ], [ 3, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[P1]], i64 [[INDEX2]] -; CHECK-NEXT: [[LD1:%.*]] = load i32, ptr [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[DIV:%.*]] = udiv i32 [[LD1]], 20000 -; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[DIV]], 1 -; CHECK-NEXT: br i1 [[CMP]], label [[LOOP_INC1]], label [[LOOP_END:%.*]] -; CHECK: loop.inc: -; CHECK-NEXT: [[INDEX_NEXT1]] = add i64 [[INDEX2]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT1]], 67 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP1]], label [[LOOP_END]] -; CHECK: loop.end: -; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX2]], [[LOOP1]] ], [ 67, [[LOOP_INC1]] ] -; CHECK-NEXT: ret i64 [[RETVAL]] -; -entry: - %p1 = alloca [1024 x i8] - %p2 = alloca [1024 x i8] - call void @init_mem(ptr %p1, i64 1024) - call void @init_mem(ptr %p2, i64 1024) - br label %loop - -loop: - %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] - %arrayidx = getelementptr inbounds i32, ptr %p1, i64 %index - %ld1 = load i32, ptr %arrayidx, align 1 - %div = udiv i32 %ld1, 20000 - %cmp = icmp eq i32 %div, 1 - br i1 %cmp, label %loop.inc, label %loop.end - -loop.inc: - %index.next = add i64 %index, 1 - %exitcond = icmp ne i64 %index.next, 67 - br i1 %exitcond, label %loop, label %loop.end - -loop.end: - %retval = phi i64 [ %index, %loop ], [ 67, %loop.inc ] - ret i64 %retval -} - - -define i64 @loop_contains_unsafe_div() { -; DEBUG-LABEL: LV: Checking a loop in 'loop_contains_unsafe_div' -; DEBUG: LV: Not vectorizing: Early exit loop contains operations that cannot be speculatively executed. 
-; CHECK-LABEL: define i64 @loop_contains_unsafe_div() { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 1 -; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1 -; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) -; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) -; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] -; CHECK-NEXT: [[LD1:%.*]] = load i32, ptr [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[DIV:%.*]] = udiv i32 20000, [[LD1]] -; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[DIV]], 1 -; CHECK-NEXT: br i1 [[CMP]], label [[LOOP_INC]], label [[LOOP_END:%.*]] -; CHECK: loop.inc: -; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] -; CHECK: loop.end: -; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ 67, [[LOOP_INC]] ] -; CHECK-NEXT: ret i64 [[RETVAL]] -; -entry: - %p1 = alloca [1024 x i8] - %p2 = alloca [1024 x i8] - call void @init_mem(ptr %p1, i64 1024) - call void @init_mem(ptr %p2, i64 1024) - br label %loop - -loop: - %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] - %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index - %ld1 = load i32, ptr %arrayidx, align 1 - %div = udiv i32 20000, %ld1 - %cmp = icmp eq i32 %div, 1 - br i1 %cmp, label %loop.inc, label %loop.end - -loop.inc: - %index.next = add i64 %index, 1 - %exitcond = icmp ne i64 %index.next, 67 - br i1 %exitcond, label %loop, label %loop.end - -loop.end: - %retval = phi i64 [ %index, %loop ], [ 67, %loop.inc ] - ret i64 %retval -} - - -define i64 @loop_contains_store(ptr %dest) { -; DEBUG-LABEL: LV: Checking a loop in 'loop_contains_store' -; DEBUG: LV: Not vectorizing: Writes to memory unsupported in early exit loops -; CHECK-LABEL: define i64 @loop_contains_store( -; CHECK-SAME: ptr [[DEST:%.*]]) { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 1 -; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) -; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[P1]], i64 [[INDEX]] -; CHECK-NEXT: [[LD1:%.*]] = load i32, ptr [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[DEST]], i64 [[INDEX]] -; CHECK-NEXT: store i32 [[LD1]], ptr [[ARRAYIDX2]], align 4 -; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[LD1]], 1 -; CHECK-NEXT: br i1 [[CMP]], label [[LOOP_INC]], label [[LOOP_END:%.*]] -; CHECK: loop.inc: -; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] -; CHECK: loop.end: -; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ 67, [[LOOP_INC]] ] -; CHECK-NEXT: ret i64 [[RETVAL]] -; -entry: - %p1 = alloca [1024 x i8] - call void @init_mem(ptr %p1, i64 1024) - br label %loop - -loop: - %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] - %arrayidx = getelementptr inbounds i32, ptr %p1, i64 %index - %ld1 = load i32, ptr %arrayidx, align 1 - %arrayidx2 = getelementptr inbounds i32, ptr %dest, i64 %index - store i32 %ld1, ptr %arrayidx2, align 4 - %cmp = icmp eq i32 
%ld1, 1 - br i1 %cmp, label %loop.inc, label %loop.end - -loop.inc: - %index.next = add i64 %index, 1 - %exitcond = icmp ne i64 %index.next, 67 - br i1 %exitcond, label %loop, label %loop.end - -loop.end: - %retval = phi i64 [ %index, %loop ], [ 67, %loop.inc ] - ret i64 %retval -} - - -define i64 @loop_contains_load_after_early_exit(ptr dereferenceable(1024) align(8) %p2) { -; DEBUG-LABEL: LV: Checking a loop in 'loop_contains_load_after_early_exit' -; DEBUG: LV: Found an early exit loop with symbolic max backedge taken count: 63 -; DEBUG-NEXT: LV: We can vectorize this loop! -; DEBUG-NEXT: LV: Not vectorizing: Auto-vectorization of loops with uncountable early exit is not yet supported. -; CHECK-LABEL: define i64 @loop_contains_load_after_early_exit( -; CHECK-SAME: ptr align 8 dereferenceable(1024) [[P2:%.*]]) { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 1 -; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) -; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[P1]], i64 [[INDEX]] -; CHECK-NEXT: [[LD1:%.*]] = load i32, ptr [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[LD1]], 1 -; CHECK-NEXT: br i1 [[CMP]], label [[LOOP_INC]], label [[LOOP_END:%.*]] -; CHECK: loop.inc: -; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i64, ptr [[P2]], i64 [[INDEX]] -; CHECK-NEXT: [[LD2:%.*]] = load i64, ptr [[ARRAYIDX2]], align 8 -; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] -; CHECK: loop.end: -; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ [[LD2]], [[LOOP_INC]] ] -; CHECK-NEXT: ret i64 [[RETVAL]] -; -entry: - %p1 = alloca [1024 x i8] - call void @init_mem(ptr %p1, i64 1024) - br label %loop - -loop: - %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] - %arrayidx = getelementptr inbounds i32, ptr %p1, i64 %index - %ld1 = load i32, ptr %arrayidx, align 1 - %cmp = icmp eq i32 %ld1, 1 - br i1 %cmp, label %loop.inc, label %loop.end - -loop.inc: - %arrayidx2 = getelementptr inbounds i64, ptr %p2, i64 %index - %ld2 = load i64, ptr %arrayidx2, align 8 - %index.next = add i64 %index, 1 - %exitcond = icmp ne i64 %index.next, 67 - br i1 %exitcond, label %loop, label %loop.end - -loop.end: - %retval = phi i64 [ %index, %loop ], [ %ld2, %loop.inc ] - ret i64 %retval -} - - -define i64 @early_exit_in_conditional_block(ptr %mask) { -; DEBUG-LABEL: LV: Checking a loop in 'early_exit_in_conditional_block' -; DEBUG: LV: Not vectorizing: Early exit is not the latch predecessor. 
-; CHECK-LABEL: define i64 @early_exit_in_conditional_block( -; CHECK-SAME: ptr [[MASK:%.*]]) { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 1 -; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1 -; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) -; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) -; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[MASK]], i64 [[INDEX]] -; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 -; CHECK-NEXT: [[CMP1:%.*]] = icmp ne i8 [[LD1]], 0 -; CHECK-NEXT: br i1 [[CMP1]], label [[LOOP_SEARCH:%.*]], label [[LOOP_INC]] -; CHECK: loop.search: -; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] -; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1 -; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] -; CHECK-NEXT: [[LD3:%.*]] = load i8, ptr [[ARRAYIDX3]], align 1 -; CHECK-NEXT: [[CMP2:%.*]] = icmp eq i8 [[LD2]], [[LD3]] -; CHECK-NEXT: br i1 [[CMP2]], label [[LOOP_INC]], label [[LOOP_END:%.*]] -; CHECK: loop.inc: -; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] -; CHECK: loop.end: -; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP_SEARCH]] ], [ 67, [[LOOP_INC]] ] -; CHECK-NEXT: ret i64 [[RETVAL]] -; -entry: - %p1 = alloca [1024 x i8] - %p2 = alloca [1024 x i8] - call void @init_mem(ptr %p1, i64 1024) - call void @init_mem(ptr %p2, i64 1024) - br label %loop - -loop: - %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] - %arrayidx1 = getelementptr inbounds i8, ptr %mask, i64 %index - %ld1 = load i8, ptr %arrayidx1, align 1 - %cmp1 = icmp ne i8 %ld1, 0 - br i1 %cmp1, label %loop.search, label %loop.inc - -loop.search: - %arrayidx2 = getelementptr inbounds i8, ptr %p1, i64 %index - %ld2 = load i8, ptr %arrayidx2, align 1 - %arrayidx3 = getelementptr inbounds i8, ptr %p2, i64 %index - %ld3 = load i8, ptr %arrayidx3, align 1 - %cmp2 = icmp eq i8 %ld2, %ld3 - br i1 %cmp2, label %loop.inc, label %loop.end - -loop.inc: - %index.next = add i64 %index, 1 - %exitcond = icmp ne i64 %index.next, 67 - br i1 %exitcond, label %loop, label %loop.end - -loop.end: - %retval = phi i64 [ %index, %loop.search ], [ 67, %loop.inc ] - ret i64 %retval -} - - -define i64 @same_exit_block_pre_inc_use1_reverse() { -; CHECK-LABEL: define i64 @same_exit_block_pre_inc_use1_reverse() { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 1 -; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1 -; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) -; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) -; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 1023, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] -; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] -; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 -; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]] -; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END:%.*]] +; CHECK-NEXT: [[LD2:%.*]] 
= load i8, ptr [[ARRAYIDX1]], align 1 +; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]] +; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END:%.*]] ; CHECK: loop.inc: ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], -1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 @@ -1510,68 +947,6 @@ loop.end: } -define i64 @same_exit_block_pre_inc_use1_with_reduction() { -; DEBUG-LABEL: LV: Checking a loop in 'same_exit_block_pre_inc_use1_with_reduction' -; DEBUG: LV: Not vectorizing: Found reductions or recurrences in early-exit loop. -; CHECK-LABEL: define i64 @same_exit_block_pre_inc_use1_with_reduction() { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 1 -; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1 -; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) -; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) -; CHECK-NEXT: br label [[LAND_RHS:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[FOR_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[RED:%.*]] = phi i64 [ [[RED_NEXT:%.*]], [[FOR_INC]] ], [ 0, [[ENTRY]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] -; CHECK-NEXT: [[TMP38:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] -; CHECK-NEXT: [[TMP39:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 -; CHECK-NEXT: [[LD2_ZEXT:%.*]] = zext i8 [[TMP39]] to i64 -; CHECK-NEXT: [[RED_NEXT]] = add i64 [[RED]], [[LD2_ZEXT]] -; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[TMP38]], [[TMP39]] -; CHECK-NEXT: br i1 [[CMP3]], label [[FOR_INC]], label [[FOR_END_LOOPEXIT:%.*]] -; CHECK: loop.inc: -; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[LAND_RHS]], label [[FOR_END_LOOPEXIT]] -; CHECK: loop.end: -; CHECK-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi i64 [ [[RED_NEXT]], [[FOR_INC]] ], [ [[RED_NEXT]], [[LAND_RHS]] ] -; CHECK-NEXT: [[FINAL_IND:%.*]] = phi i64 [ [[INDEX]], [[LAND_RHS]] ], [ 67, [[FOR_INC]] ] -; CHECK-NEXT: [[START_0_LCSSA:%.*]] = add i64 [[RED_NEXT_LCSSA]], [[FINAL_IND]] -; CHECK-NEXT: ret i64 [[START_0_LCSSA]] -; -entry: - %p1 = alloca [1024 x i8] - %p2 = alloca [1024 x i8] - call void @init_mem(ptr %p1, i64 1024) - call void @init_mem(ptr %p2, i64 1024) - br label %loop - -loop: - %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] - %red = phi i64 [ %red.next, %loop.inc ], [ 0, %entry ] - %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index - %ld1 = load i8, ptr %arrayidx, align 1 - %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index - %ld2 = load i8, ptr %arrayidx1, align 1 - %ld2.zext = zext i8 %ld2 to i64 - %red.next = add i64 %red, %ld2.zext - %cmp3 = icmp eq i8 %ld1, %ld2 - br i1 %cmp3, label %loop.inc, label %loop.end - -loop.inc: - %index.next = add i64 %index, 1 - %exitcond = icmp ne i64 %index.next, 67 - br i1 %exitcond, label %loop, label %loop.end - -loop.end: - %final.ind = phi i64 [ %index, %loop ], [ 67, %loop.inc ] - %retval = add i64 %red.next, %final.ind - ret i64 %retval -} - - define i64 @same_exit_block_pre_inc_use1_deref_ptrs(ptr dereferenceable(1024) %p1, ptr dereferenceable(1024) %p2) { ; CHECK-LABEL: define i64 @same_exit_block_pre_inc_use1_deref_ptrs( ; CHECK-SAME: ptr dereferenceable(1024) [[P1:%.*]], ptr dereferenceable(1024) [[P2:%.*]]) { @@ -1616,320 +991,6 @@ loop.end: } -; The form of the induction variables 
requires SCEV predicates. -define i32 @diff_exit_block_needs_scev_check(i32 %end) { -; DEBUG-LABEL: LV: Checking a loop in 'diff_exit_block_needs_scev_check' -; DEBUG: Found an early exit loop with symbolic max backedge taken count: (-1 + (1 umax (zext i10 (trunc i32 %end to i10) to i32))) -; DEBUG-NEXT: LV: We can vectorize this loop! -; DEBUG-NEXT: LV: Not vectorizing: Auto-vectorization of loops with uncountable early exit is not yet supported. -; CHECK-LABEL: define i32 @diff_exit_block_needs_scev_check( -; CHECK-SAME: i32 [[END:%.*]]) { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i32], align 4 -; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i32], align 4 -; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) -; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) -; CHECK-NEXT: [[END_CLAMPED:%.*]] = and i32 [[END]], 1023 -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[IND:%.*]] = phi i8 [ [[IND_NEXT:%.*]], [[FOR_INC:%.*]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[GEP_IND:%.*]] = phi i64 [ [[GEP_IND_NEXT:%.*]], [[FOR_INC]] ], [ 0, [[ENTRY]] ] -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[P1]], i64 [[GEP_IND]] -; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX1]], align 4 -; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[P2]], i64 [[GEP_IND]] -; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 -; CHECK-NEXT: [[CMP_EARLY:%.*]] = icmp eq i32 [[TMP0]], [[TMP1]] -; CHECK-NEXT: br i1 [[CMP_EARLY]], label [[FOUND:%.*]], label [[FOR_INC]] -; CHECK: for.inc: -; CHECK-NEXT: [[IND_NEXT]] = add i8 [[IND]], 1 -; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[IND_NEXT]] to i32 -; CHECK-NEXT: [[GEP_IND_NEXT]] = add i64 [[GEP_IND]], 1 -; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[CONV]], [[END_CLAMPED]] -; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[EXIT:%.*]] -; CHECK: found: -; CHECK-NEXT: ret i32 1 -; CHECK: exit: -; CHECK-NEXT: ret i32 0 -; -entry: - %p1 = alloca [1024 x i32] - %p2 = alloca [1024 x i32] - call void @init_mem(ptr %p1, i64 1024) - call void @init_mem(ptr %p2, i64 1024) - %end.clamped = and i32 %end, 1023 - br label %for.body - -for.body: - %ind = phi i8 [ %ind.next, %for.inc ], [ 0, %entry ] - %gep.ind = phi i64 [ %gep.ind.next, %for.inc ], [ 0, %entry ] - %arrayidx1 = getelementptr inbounds i32, ptr %p1, i64 %gep.ind - %0 = load i32, ptr %arrayidx1, align 4 - %arrayidx2 = getelementptr inbounds i32, ptr %p2, i64 %gep.ind - %1 = load i32, ptr %arrayidx2, align 4 - %cmp.early = icmp eq i32 %0, %1 - br i1 %cmp.early, label %found, label %for.inc - -for.inc: - %ind.next = add i8 %ind, 1 - %conv = zext i8 %ind.next to i32 - %gep.ind.next = add i64 %gep.ind, 1 - %cmp = icmp ult i32 %conv, %end.clamped - br i1 %cmp, label %for.body, label %exit - -found: - ret i32 1 - -exit: - ret i32 0 -} - - -declare void @abort() - -; This is a variant of an early exit loop where the condition for leaving -; early is loop invariant. -define i32 @diff_blocks_invariant_early_exit_cond(ptr %s) { -; DEBUG-LABEL: LV: Checking a loop in 'diff_blocks_invariant_early_exit_cond' -; DEBUG: LV: Found an early exit loop with symbolic max backedge taken count: 275 -; DEBUG: LV: Not vectorizing: Auto-vectorization of loops with uncountable early exit is not yet supported. 
-; CHECK-LABEL: define i32 @diff_blocks_invariant_early_exit_cond( -; CHECK-SAME: ptr [[S:%.*]]) { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[SVAL:%.*]] = load i32, ptr [[S]], align 4 -; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[SVAL]], 0 -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[IND:%.*]] = phi i32 [ -10, [[ENTRY:%.*]] ], [ [[IND_NEXT:%.*]], [[FOR_INC:%.*]] ] -; CHECK-NEXT: br i1 [[COND]], label [[FOR_INC]], label [[EARLY_EXIT:%.*]] -; CHECK: for.inc: -; CHECK-NEXT: [[IND_NEXT]] = add nsw i32 [[IND]], 1 -; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[IND_NEXT]], 266 -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]] -; CHECK: early.exit: -; CHECK-NEXT: tail call void @abort() -; CHECK-NEXT: unreachable -; CHECK: for.end: -; CHECK-NEXT: ret i32 0 -; -entry: - %sval = load i32, ptr %s, align 4 - %cond = icmp eq i32 %sval, 0 - br label %for.body - -for.body: - %ind = phi i32 [ -10, %entry ], [ %ind.next, %for.inc ] - br i1 %cond, label %for.inc, label %early.exit - -for.inc: - %ind.next = add nsw i32 %ind, 1 - %exitcond.not = icmp eq i32 %ind.next, 266 - br i1 %exitcond.not, label %for.end, label %for.body - -early.exit: - tail call void @abort() - unreachable - -for.end: - ret i32 0 -} - - -define i64 @early_exit_has_multiple_outside_successors() { -; DEBUG-LABEL: LV: Checking a loop in 'early_exit_has_multiple_outside_successors' -; DEBUG: LV: Not vectorizing: Loop contains an unsupported switch -; CHECK-LABEL: define i64 @early_exit_has_multiple_outside_successors() { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 1 -; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) -; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] -; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 -; CHECK-NEXT: switch i8 [[LD1]], label [[LOOP_INC]] [ -; CHECK-NEXT: i8 2, label [[LOOP_END:%.*]] -; CHECK-NEXT: i8 3, label [[LOOP_SURPRISE:%.*]] -; CHECK-NEXT: ] -; CHECK: loop.inc: -; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] -; CHECK: loop.surprise: -; CHECK-NEXT: ret i64 3 -; CHECK: loop.end: -; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ 67, [[LOOP_INC]] ] -; CHECK-NEXT: ret i64 [[RETVAL]] -; -entry: - %p1 = alloca [1024 x i8] - call void @init_mem(ptr %p1, i64 1024) - br label %loop - -loop: - %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] - %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index - %ld1 = load i8, ptr %arrayidx, align 1 - switch i8 %ld1, label %loop.inc [ - i8 2, label %loop.end - i8 3, label %loop.surprise - ] - -loop.inc: - %index.next = add i64 %index, 1 - %exitcond = icmp ne i64 %index.next, 67 - br i1 %exitcond, label %loop, label %loop.end - -loop.surprise: - ret i64 3 - -loop.end: - %retval = phi i64 [ %index, %loop ], [ 67, %loop.inc ] - ret i64 %retval -} - - -define i64 @same_exit_block_pre_inc_use1_too_small_allocas() { -; DEBUG-LABEL: LV: Checking a loop in 'same_exit_block_pre_inc_use1_too_small_allocas' -; DEBUG: LV: Not vectorizing: Loop may fault. 
-; CHECK-LABEL: define i64 @same_exit_block_pre_inc_use1_too_small_allocas() { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[P1:%.*]] = alloca [42 x i8], align 1 -; CHECK-NEXT: [[P2:%.*]] = alloca [42 x i8], align 1 -; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) -; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) -; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] -; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] -; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 -; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]] -; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END:%.*]] -; CHECK: loop.inc: -; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] -; CHECK: loop.end: -; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ 67, [[LOOP_INC]] ] -; CHECK-NEXT: ret i64 [[RETVAL]] -; -entry: - %p1 = alloca [42 x i8] - %p2 = alloca [42 x i8] - call void @init_mem(ptr %p1, i64 1024) - call void @init_mem(ptr %p2, i64 1024) - br label %loop - -loop: - %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] - %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index - %ld1 = load i8, ptr %arrayidx, align 1 - %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index - %ld2 = load i8, ptr %arrayidx1, align 1 - %cmp3 = icmp eq i8 %ld1, %ld2 - br i1 %cmp3, label %loop.inc, label %loop.end - -loop.inc: - %index.next = add i64 %index, 1 - %exitcond = icmp ne i64 %index.next, 67 - br i1 %exitcond, label %loop, label %loop.end - -loop.end: - %retval = phi i64 [ %index, %loop ], [ 67, %loop.inc ] - ret i64 %retval -} - - -define i64 @same_exit_block_pre_inc_use1_too_small_deref_ptrs(ptr dereferenceable(42) %p1, ptr dereferenceable(42) %p2) { -; CHECK-LABEL: define i64 @same_exit_block_pre_inc_use1_too_small_deref_ptrs( -; CHECK-SAME: ptr dereferenceable(42) [[P1:%.*]], ptr dereferenceable(42) [[P2:%.*]]) { -; CHECK-NEXT: entry: -; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] -; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] -; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 -; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]] -; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END:%.*]] -; CHECK: loop.inc: -; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] -; CHECK: loop.end: -; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ 67, [[LOOP_INC]] ] -; CHECK-NEXT: ret i64 [[RETVAL]] -; -entry: - br label %loop - -loop: - %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] - %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index - %ld1 = load i8, ptr %arrayidx, align 1 - %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index - %ld2 = load i8, ptr 
%arrayidx1, align 1 - %cmp3 = icmp eq i8 %ld1, %ld2 - br i1 %cmp3, label %loop.inc, label %loop.end - -loop.inc: - %index.next = add i64 %index, 1 - %exitcond = icmp ne i64 %index.next, 67 - br i1 %exitcond, label %loop, label %loop.end - -loop.end: - %retval = phi i64 [ %index, %loop ], [ 67, %loop.inc ] - ret i64 %retval -} - - -define i64 @same_exit_block_pre_inc_use1_unknown_ptrs(ptr %p1, ptr %p2) { -; CHECK-LABEL: define i64 @same_exit_block_pre_inc_use1_unknown_ptrs( -; CHECK-SAME: ptr [[P1:%.*]], ptr [[P2:%.*]]) { -; CHECK-NEXT: entry: -; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] -; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] -; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 -; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]] -; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END:%.*]] -; CHECK: loop.inc: -; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] -; CHECK: loop.end: -; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ 67, [[LOOP_INC]] ] -; CHECK-NEXT: ret i64 [[RETVAL]] -; -entry: - br label %loop - -loop: - %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] - %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index - %ld1 = load i8, ptr %arrayidx, align 1 - %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index - %ld2 = load i8, ptr %arrayidx1, align 1 - %cmp3 = icmp eq i8 %ld1, %ld2 - br i1 %cmp3, label %loop.inc, label %loop.end - -loop.inc: - %index.next = add i64 %index, 1 - %exitcond = icmp ne i64 %index.next, 67 - br i1 %exitcond, label %loop, label %loop.end - -loop.end: - %retval = phi i64 [ %index, %loop ], [ 67, %loop.inc ] - ret i64 %retval -} - - - declare i32 @foo(i32) readonly declare <vscale x 4 x i32> @foo_vec(<vscale x 4 x i32>) diff --git a/llvm/test/Transforms/LoopVectorize/single_early_exit_unsafe_ptrs.ll b/llvm/test/Transforms/LoopVectorize/single_early_exit_unsafe_ptrs.ll new file mode 100644 index 0000000000000000000000000000000000000000..c68eeac19c9ecfd1afd7c2a1d22b35bdd1064fc2 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/single_early_exit_unsafe_ptrs.ll @@ -0,0 +1,143 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt -S < %s -p loop-vectorize | FileCheck %s + +declare void @init_mem(ptr, i64); + + +define i64 @same_exit_block_pre_inc_use1_too_small_allocas() { +; CHECK-LABEL: define i64 @same_exit_block_pre_inc_use1_too_small_allocas() { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[P1:%.*]] = alloca [42 x i8], align 1 +; CHECK-NEXT: [[P2:%.*]] = alloca [42 x i8], align 1 +; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) +; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] +; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] +; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], 
align 1 +; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]] +; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END:%.*]] +; CHECK: loop.inc: +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] +; CHECK: loop.end: +; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ 67, [[LOOP_INC]] ] +; CHECK-NEXT: ret i64 [[RETVAL]] +; +entry: + %p1 = alloca [42 x i8] + %p2 = alloca [42 x i8] + call void @init_mem(ptr %p1, i64 1024) + call void @init_mem(ptr %p2, i64 1024) + br label %loop + +loop: + %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] + %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index + %ld1 = load i8, ptr %arrayidx, align 1 + %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index + %ld2 = load i8, ptr %arrayidx1, align 1 + %cmp3 = icmp eq i8 %ld1, %ld2 + br i1 %cmp3, label %loop.inc, label %loop.end + +loop.inc: + %index.next = add i64 %index, 1 + %exitcond = icmp ne i64 %index.next, 67 + br i1 %exitcond, label %loop, label %loop.end + +loop.end: + %retval = phi i64 [ %index, %loop ], [ 67, %loop.inc ] + ret i64 %retval +} + + +define i64 @same_exit_block_pre_inc_use1_too_small_deref_ptrs(ptr dereferenceable(42) %p1, ptr dereferenceable(42) %p2) { +; CHECK-LABEL: define i64 @same_exit_block_pre_inc_use1_too_small_deref_ptrs( +; CHECK-SAME: ptr dereferenceable(42) [[P1:%.*]], ptr dereferenceable(42) [[P2:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] +; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] +; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 +; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]] +; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END:%.*]] +; CHECK: loop.inc: +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] +; CHECK: loop.end: +; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ 67, [[LOOP_INC]] ] +; CHECK-NEXT: ret i64 [[RETVAL]] +; +entry: + br label %loop + +loop: + %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] + %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index + %ld1 = load i8, ptr %arrayidx, align 1 + %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index + %ld2 = load i8, ptr %arrayidx1, align 1 + %cmp3 = icmp eq i8 %ld1, %ld2 + br i1 %cmp3, label %loop.inc, label %loop.end + +loop.inc: + %index.next = add i64 %index, 1 + %exitcond = icmp ne i64 %index.next, 67 + br i1 %exitcond, label %loop, label %loop.end + +loop.end: + %retval = phi i64 [ %index, %loop ], [ 67, %loop.inc ] + ret i64 %retval +} + + +define i64 @same_exit_block_pre_inc_use1_unknown_ptrs(ptr %p1, ptr %p2) { +; CHECK-LABEL: define i64 @same_exit_block_pre_inc_use1_unknown_ptrs( +; CHECK-SAME: ptr [[P1:%.*]], ptr [[P2:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 
[[INDEX]] +; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] +; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 +; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]] +; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END:%.*]] +; CHECK: loop.inc: +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] +; CHECK: loop.end: +; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ 67, [[LOOP_INC]] ] +; CHECK-NEXT: ret i64 [[RETVAL]] +; +entry: + br label %loop + +loop: + %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] + %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index + %ld1 = load i8, ptr %arrayidx, align 1 + %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index + %ld2 = load i8, ptr %arrayidx1, align 1 + %cmp3 = icmp eq i8 %ld1, %ld2 + br i1 %cmp3, label %loop.inc, label %loop.end + +loop.inc: + %index.next = add i64 %index, 1 + %exitcond = icmp ne i64 %index.next, 67 + br i1 %exitcond, label %loop, label %loop.end + +loop.end: + %retval = phi i64 [ %index, %loop ], [ 67, %loop.inc ] + ret i64 %retval +} diff --git a/llvm/test/Transforms/LoopVectorize/trip-count-expansion-may-introduce-ub.ll b/llvm/test/Transforms/LoopVectorize/trip-count-expansion-may-introduce-ub.ll index c63dc9979bce3c10eda87d93c09182422072093b..1b646200e1c7c5d297aa69691e1d65f59b53a9e9 100644 --- a/llvm/test/Transforms/LoopVectorize/trip-count-expansion-may-introduce-ub.ll +++ b/llvm/test/Transforms/LoopVectorize/trip-count-expansion-may-introduce-ub.ll @@ -463,7 +463,8 @@ define i64 @multi_exit_4_exit_count_with_udiv_by_value_in_latch(ptr %dst, i64 %N ; CHECK-LABEL: define i64 @multi_exit_4_exit_count_with_udiv_by_value_in_latch( ; CHECK-SAME: ptr [[DST:%.*]], i64 [[N:%.*]]) { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = udiv i64 42, [[N]] +; CHECK-NEXT: [[TMP10:%.*]] = call i64 @llvm.umax.i64(i64 [[N]], i64 1) +; CHECK-NEXT: [[TMP0:%.*]] = udiv i64 42, [[TMP10]] ; CHECK-NEXT: [[TMP8:%.*]] = freeze i64 [[TMP0]] ; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 0) ; CHECK-NEXT: [[UMIN:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP8]], i64 [[SMAX]]) @@ -526,11 +527,160 @@ exit: ret i64 %p } +declare void @foo() + +define i64 @multi_exit_4_exit_count_with_udiv_by_value_in_latch_call_before_loop(ptr %dst, i64 %N) { +; CHECK-LABEL: define i64 @multi_exit_4_exit_count_with_udiv_by_value_in_latch_call_before_loop( +; CHECK-SAME: ptr [[DST:%.*]], i64 [[N:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @foo() +; CHECK-NEXT: [[TMP0:%.*]] = freeze i64 [[N]] +; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP0]], i64 1) +; CHECK-NEXT: [[TMP2:%.*]] = udiv i64 42, [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = freeze i64 [[TMP2]] +; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 0) +; CHECK-NEXT: [[UMIN:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP3]], i64 [[SMAX]]) +; CHECK-NEXT: [[TMP4:%.*]] = add nuw nsw i64 [[UMIN]], 1 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[TMP4]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP4]], 4 +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], i64 4, i64 [[N_MOD_VF]] +; 
CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP4]], [[TMP6]] +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 0 +; CHECK-NEXT: store <4 x i32> <i32 1, i32 1, i32 1, i32 1>, ptr [[TMP9]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: br label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[LOOP_HEADER:%.*]] +; CHECK: loop.header: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ] +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[IV]] +; CHECK-NEXT: store i32 1, ptr [[GEP]], align 4 +; CHECK-NEXT: [[C_0:%.*]] = icmp slt i64 [[IV]], [[N]] +; CHECK-NEXT: br i1 [[C_0]], label [[LOOP_LATCH]], label [[EXIT:%.*]] +; CHECK: loop.latch: +; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-NEXT: [[D:%.*]] = udiv i64 42, [[N]] +; CHECK-NEXT: [[C_1:%.*]] = icmp slt i64 [[IV]], [[D]] +; CHECK-NEXT: br i1 [[C_1]], label [[LOOP_HEADER]], label [[EXIT]], !llvm.loop [[LOOP13:![0-9]+]] +; CHECK: exit: +; CHECK-NEXT: [[P:%.*]] = phi i64 [ 1, [[LOOP_HEADER]] ], [ 0, [[LOOP_LATCH]] ] +; CHECK-NEXT: ret i64 [[P]] +; +entry: + call void @foo() + br label %loop.header + +loop.header: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ] + %gep = getelementptr inbounds i32, ptr %dst, i64 %iv + store i32 1, ptr %gep + %c.0 = icmp slt i64 %iv, %N + br i1 %c.0, label %loop.latch, label %exit + +loop.latch: + %iv.next = add i64 %iv, 1 + %d = udiv i64 42, %N + %c.1 = icmp slt i64 %iv, %d + br i1 %c.1, label %loop.header, label %exit + +exit: + %p = phi i64 [ 1, %loop.header ], [ 0, %loop.latch] + ret i64 %p +} + +define i64 @multi_exit_4_exit_count_with_udiv_by_value_in_latch_loop_may_not_execute(ptr %dst, i64 %N, i1 %c) { +; CHECK-LABEL: define i64 @multi_exit_4_exit_count_with_udiv_by_value_in_latch_loop_may_not_execute( +; CHECK-SAME: ptr [[DST:%.*]], i64 [[N:%.*]], i1 [[C:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER_PREHEADER:%.*]], label [[EXIT:%.*]] +; CHECK: loop.header.preheader: +; CHECK-NEXT: [[TMP0:%.*]] = freeze i64 [[N]] +; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP0]], i64 1) +; CHECK-NEXT: [[TMP2:%.*]] = udiv i64 42, [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = freeze i64 [[TMP2]] +; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 0) +; CHECK-NEXT: [[UMIN:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP3]], i64 [[SMAX]]) +; CHECK-NEXT: [[TMP4:%.*]] = add nuw nsw i64 [[UMIN]], 1 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[TMP4]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP4]], 4 +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], i64 4, i64 [[N_MOD_VF]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP4]], [[TMP6]] +; CHECK-NEXT: br label 
[[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 0 +; CHECK-NEXT: store <4 x i32> <i32 1, i32 1, i32 1, i32 1>, ptr [[TMP9]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: br label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[LOOP_HEADER_PREHEADER]] ] +; CHECK-NEXT: br label [[LOOP_HEADER:%.*]] +; CHECK: loop.header: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[IV]] +; CHECK-NEXT: store i32 1, ptr [[GEP]], align 4 +; CHECK-NEXT: [[C_0:%.*]] = icmp slt i64 [[IV]], [[N]] +; CHECK-NEXT: br i1 [[C_0]], label [[LOOP_LATCH]], label [[EXIT_LOOPEXIT:%.*]] +; CHECK: loop.latch: +; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-NEXT: [[D:%.*]] = udiv i64 42, [[N]] +; CHECK-NEXT: [[C_1:%.*]] = icmp slt i64 [[IV]], [[D]] +; CHECK-NEXT: br i1 [[C_1]], label [[LOOP_HEADER]], label [[EXIT_LOOPEXIT]], !llvm.loop [[LOOP15:![0-9]+]] +; CHECK: exit.loopexit: +; CHECK-NEXT: [[P_PH:%.*]] = phi i64 [ 0, [[LOOP_LATCH]] ], [ 1, [[LOOP_HEADER]] ] +; CHECK-NEXT: br label [[EXIT]] +; CHECK: exit: +; CHECK-NEXT: [[P:%.*]] = phi i64 [ 2, [[ENTRY:%.*]] ], [ [[P_PH]], [[EXIT_LOOPEXIT]] ] +; CHECK-NEXT: ret i64 [[P]] +; +entry: + br i1 %c, label %loop.header, label %exit + +loop.header: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ] + %gep = getelementptr inbounds i32, ptr %dst, i64 %iv + store i32 1, ptr %gep + %c.0 = icmp slt i64 %iv, %N + br i1 %c.0, label %loop.latch, label %exit + +loop.latch: + %iv.next = add i64 %iv, 1 + %d = udiv i64 42, %N + %c.1 = icmp slt i64 %iv, %d + br i1 %c.1, label %loop.header, label %exit + +exit: + %p = phi i64 [ 1, %loop.header ], [ 0, %loop.latch], [ 2, %entry ] + ret i64 %p +} + define i64 @multi_exit_4_exit_count_with_udiv_by_value_in_latch_different_bounds(ptr %dst, i64 %N, i64 %M) { ; CHECK-LABEL: define i64 @multi_exit_4_exit_count_with_udiv_by_value_in_latch_different_bounds( ; CHECK-SAME: ptr [[DST:%.*]], i64 [[N:%.*]], i64 [[M:%.*]]) { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP2:%.*]] = udiv i64 42, [[M]] +; CHECK-NEXT: [[TMP0:%.*]] = freeze i64 [[M]] +; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP0]], i64 1) +; CHECK-NEXT: [[TMP2:%.*]] = udiv i64 42, [[TMP1]] ; CHECK-NEXT: [[TMP3:%.*]] = freeze i64 [[TMP2]] ; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 0) ; CHECK-NEXT: [[UMIN:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP3]], i64 [[SMAX]]) @@ -551,7 +701,7 @@ define i64 @multi_exit_4_exit_count_with_udiv_by_value_in_latch_different_bounds ; CHECK-NEXT: store <4 x i32> <i32 1, i32 1, i32 1, i32 1>, ptr [[TMP9]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label 
[[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -567,7 +717,7 @@ define i64 @multi_exit_4_exit_count_with_udiv_by_value_in_latch_different_bounds ; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 ; CHECK-NEXT: [[D:%.*]] = udiv i64 42, [[M]] ; CHECK-NEXT: [[C_1:%.*]] = icmp slt i64 [[IV]], [[D]] -; CHECK-NEXT: br i1 [[C_1]], label [[LOOP_HEADER]], label [[EXIT]], !llvm.loop [[LOOP13:![0-9]+]] +; CHECK-NEXT: br i1 [[C_1]], label [[LOOP_HEADER]], label [[EXIT]], !llvm.loop [[LOOP17:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: [[P:%.*]] = phi i64 [ 1, [[LOOP_HEADER]] ], [ 0, [[LOOP_LATCH]] ] ; CHECK-NEXT: ret i64 [[P]] @@ -593,16 +743,16 @@ exit: ret i64 %p } - define i64 @multi_exit_4_exit_count_with_udiv_by_frozen_value_in_latch(ptr %dst, i64 %N) { ; CHECK-LABEL: define i64 @multi_exit_4_exit_count_with_udiv_by_frozen_value_in_latch( ; CHECK-SAME: ptr [[DST:%.*]], i64 [[N:%.*]]) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[FR_N:%.*]] = freeze i64 [[N]] -; CHECK-NEXT: [[TMP0:%.*]] = udiv i64 42, [[FR_N]] -; CHECK-NEXT: [[TMP1:%.*]] = freeze i64 [[TMP0]] +; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.umax.i64(i64 [[FR_N]], i64 1) +; CHECK-NEXT: [[TMP2:%.*]] = udiv i64 42, [[TMP1]] +; CHECK-NEXT: [[TMP10:%.*]] = freeze i64 [[TMP2]] ; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 0) -; CHECK-NEXT: [[UMIN:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP1]], i64 [[SMAX]]) +; CHECK-NEXT: [[UMIN:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP10]], i64 [[SMAX]]) ; CHECK-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[UMIN]], 1 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[TMP3]], 4 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] @@ -620,7 +770,7 @@ define i64 @multi_exit_4_exit_count_with_udiv_by_frozen_value_in_latch(ptr %dst, ; CHECK-NEXT: store <4 x i32> <i32 1, i32 1, i32 1, i32 1>, ptr [[TMP8]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -636,7 +786,7 @@ define i64 @multi_exit_4_exit_count_with_udiv_by_frozen_value_in_latch(ptr %dst, ; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 ; CHECK-NEXT: [[D:%.*]] = udiv i64 42, [[FR_N]] ; CHECK-NEXT: [[C_1:%.*]] = icmp slt i64 [[IV]], [[D]] -; CHECK-NEXT: br i1 [[C_1]], label [[LOOP_HEADER]], label [[EXIT]], !llvm.loop [[LOOP15:![0-9]+]] +; CHECK-NEXT: br i1 [[C_1]], label [[LOOP_HEADER]], label [[EXIT]], !llvm.loop [[LOOP19:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: [[P:%.*]] = phi i64 [ 1, [[LOOP_HEADER]] ], [ 0, [[LOOP_LATCH]] ] ; CHECK-NEXT: ret i64 [[P]] @@ -688,7 +838,7 @@ define i64 @multi_exit_4_exit_count_with_udiv_by_constant_in_latch(ptr %dst, i64 ; CHECK-NEXT: store <4 x i32> <i32 1, i32 1, i32 1, i32 1>, ptr [[TMP6]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP_HEADER]], !llvm.loop [[LOOP16:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP_HEADER]], !llvm.loop [[LOOP20:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -704,7 +854,7 @@ define i64 
@multi_exit_4_exit_count_with_udiv_by_constant_in_latch(ptr %dst, i64 ; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV1]], 1 ; CHECK-NEXT: [[D:%.*]] = udiv i64 [[N]], 42 ; CHECK-NEXT: [[C_1:%.*]] = icmp slt i64 [[IV1]], [[D]] -; CHECK-NEXT: br i1 [[C_1]], label [[LOOP_HEADER1]], label [[EXIT]], !llvm.loop [[LOOP17:![0-9]+]] +; CHECK-NEXT: br i1 [[C_1]], label [[LOOP_HEADER1]], label [[EXIT]], !llvm.loop [[LOOP21:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: [[P:%.*]] = phi i64 [ 1, [[LOOP_HEADER1]] ], [ 0, [[LOOP_LATCH]] ] ; CHECK-NEXT: ret i64 [[P]] @@ -750,7 +900,7 @@ define void @single_exit_tc_with_udiv(ptr %dst, i64 %N) { ; CHECK-NEXT: store <4 x i32> <i32 1, i32 1, i32 1, i32 1>, ptr [[TMP4]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] @@ -764,7 +914,7 @@ define void @single_exit_tc_with_udiv(ptr %dst, i64 %N) { ; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 ; CHECK-NEXT: [[D:%.*]] = udiv i64 42, [[N]] ; CHECK-NEXT: [[C_1:%.*]] = icmp slt i64 [[IV]], [[D]] -; CHECK-NEXT: br i1 [[C_1]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP19:![0-9]+]] +; CHECK-NEXT: br i1 [[C_1]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP23:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; @@ -788,7 +938,8 @@ define i64 @multi_exit_4_exit_count_with_urem_by_value_in_latch(ptr %dst, i64 %N ; CHECK-LABEL: define i64 @multi_exit_4_exit_count_with_urem_by_value_in_latch( ; CHECK-SAME: ptr [[DST:%.*]], i64 [[N:%.*]]) { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = udiv i64 42, [[N]] +; CHECK-NEXT: [[TMP12:%.*]] = call i64 @llvm.umax.i64(i64 [[N]], i64 1) +; CHECK-NEXT: [[TMP0:%.*]] = udiv i64 42, [[TMP12]] ; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i64 [[N]], [[TMP0]] ; CHECK-NEXT: [[TMP2:%.*]] = sub i64 42, [[TMP1]] ; CHECK-NEXT: [[SMAX1:%.*]] = call i64 @llvm.smax.i64(i64 [[TMP2]], i64 0) @@ -812,7 +963,7 @@ define i64 @multi_exit_4_exit_count_with_urem_by_value_in_latch(ptr %dst, i64 %N ; CHECK-NEXT: store <4 x i32> <i32 1, i32 1, i32 1, i32 1>, ptr [[TMP8]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -828,7 +979,7 @@ define i64 @multi_exit_4_exit_count_with_urem_by_value_in_latch(ptr %dst, i64 %N ; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 ; CHECK-NEXT: [[D:%.*]] = urem i64 42, [[N]] ; CHECK-NEXT: [[C_1:%.*]] = icmp slt i64 [[IV]], [[D]] -; CHECK-NEXT: br i1 [[C_1]], label [[LOOP_HEADER]], label [[EXIT]], !llvm.loop [[LOOP21:![0-9]+]] +; CHECK-NEXT: br i1 [[C_1]], label [[LOOP_HEADER]], label [[EXIT]], !llvm.loop [[LOOP25:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: [[P:%.*]] = phi i64 [ 1, [[LOOP_HEADER]] ], [ 0, [[LOOP_LATCH]] ] ; CHECK-NEXT: ret i64 [[P]] @@ -879,7 +1030,7 @@ define i64 @multi_exit_4_exit_count_with_urem_by_constant_in_latch(ptr %dst, i64 ; CHECK-NEXT: store <4 x i32> <i32 1, i32 1, i32 1, i32 1>, ptr [[TMP6]], align 4 ; CHECK-NEXT: 
[[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP_HEADER]], !llvm.loop [[LOOP22:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP_HEADER]], !llvm.loop [[LOOP26:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -895,7 +1046,7 @@ define i64 @multi_exit_4_exit_count_with_urem_by_constant_in_latch(ptr %dst, i64 ; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV1]], 1 ; CHECK-NEXT: [[D:%.*]] = urem i64 [[N]], 42 ; CHECK-NEXT: [[C_1:%.*]] = icmp slt i64 [[IV1]], [[D]] -; CHECK-NEXT: br i1 [[C_1]], label [[LOOP_HEADER1]], label [[EXIT]], !llvm.loop [[LOOP23:![0-9]+]] +; CHECK-NEXT: br i1 [[C_1]], label [[LOOP_HEADER1]], label [[EXIT]], !llvm.loop [[LOOP27:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: [[P:%.*]] = phi i64 [ 1, [[LOOP_HEADER1]] ], [ 0, [[LOOP_LATCH]] ] ; CHECK-NEXT: ret i64 [[P]] @@ -1007,10 +1158,11 @@ define i64 @multi_exit_4_exit_count_with_udiv_by_value_in_latch1(ptr %dst, i64 % ; CHECK-LABEL: define i64 @multi_exit_4_exit_count_with_udiv_by_value_in_latch1( ; CHECK-SAME: ptr [[DST:%.*]], i64 [[N:%.*]]) { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = udiv i64 42, [[N]] -; CHECK-NEXT: [[TMP8:%.*]] = freeze i64 [[TMP0]] +; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.umax.i64(i64 [[N]], i64 1) +; CHECK-NEXT: [[TMP9:%.*]] = udiv i64 42, [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = freeze i64 [[TMP9]] ; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 0) -; CHECK-NEXT: [[UMIN:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP8]], i64 [[SMAX]]) +; CHECK-NEXT: [[UMIN:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP10]], i64 [[SMAX]]) ; CHECK-NEXT: [[TMP1:%.*]] = add nuw nsw i64 [[UMIN]], 1 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[TMP1]], 4 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] @@ -1028,7 +1180,7 @@ define i64 @multi_exit_4_exit_count_with_udiv_by_value_in_latch1(ptr %dst, i64 % ; CHECK-NEXT: store <4 x i32> <i32 1, i32 1, i32 1, i32 1>, ptr [[TMP6]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -1045,7 +1197,7 @@ define i64 @multi_exit_4_exit_count_with_udiv_by_value_in_latch1(ptr %dst, i64 % ; CHECK-NEXT: [[D:%.*]] = udiv i64 42, [[N]] ; CHECK-NEXT: [[X:%.*]] = sub i64 100, [[D]] ; CHECK-NEXT: [[C_1:%.*]] = icmp slt i64 [[IV]], [[D]] -; CHECK-NEXT: br i1 [[C_1]], label [[LOOP_HEADER]], label [[EXIT]], !llvm.loop [[LOOP25:![0-9]+]] +; CHECK-NEXT: br i1 [[C_1]], label [[LOOP_HEADER]], label [[EXIT]], !llvm.loop [[LOOP29:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: [[P:%.*]] = phi i64 [ 1, [[LOOP_HEADER]] ], [ 0, [[LOOP_LATCH]] ] ; CHECK-NEXT: ret i64 [[P]] @@ -1072,6 +1224,120 @@ exit: ret i64 %p } +define i64 @multi_exit_exit_count_with_udiv_by_0_in_latch(ptr %dst, i64 %N) { +; CHECK-LABEL: define i64 @multi_exit_exit_count_with_udiv_by_0_in_latch( +; CHECK-SAME: ptr [[DST:%.*]], i64 [[N:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[LOOP_HEADER:%.*]] +; CHECK: loop.header: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ] +; 
CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[IV]] +; CHECK-NEXT: store i32 1, ptr [[GEP]], align 4 +; CHECK-NEXT: [[C_0:%.*]] = icmp slt i64 [[IV]], [[N]] +; CHECK-NEXT: br i1 [[C_0]], label [[LOOP_LATCH]], label [[EXIT:%.*]] +; CHECK: loop.latch: +; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-NEXT: [[D:%.*]] = udiv i64 42, 0 +; CHECK-NEXT: [[C_1:%.*]] = icmp slt i64 [[IV]], [[D]] +; CHECK-NEXT: br i1 [[C_1]], label [[LOOP_HEADER]], label [[EXIT]] +; CHECK: exit: +; CHECK-NEXT: [[P:%.*]] = phi i64 [ 1, [[LOOP_HEADER]] ], [ 0, [[LOOP_LATCH]] ] +; CHECK-NEXT: ret i64 [[P]] +; +entry: + br label %loop.header + +loop.header: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ] + %gep = getelementptr inbounds i32, ptr %dst, i64 %iv + store i32 1, ptr %gep + %c.0 = icmp slt i64 %iv, %N + br i1 %c.0, label %loop.latch, label %exit + +loop.latch: + %iv.next = add i64 %iv, 1 + %d = udiv i64 42, 0 + %c.1 = icmp slt i64 %iv, %d + br i1 %c.1, label %loop.header, label %exit + +exit: + %p = phi i64 [ 1, %loop.header ], [ 0, %loop.latch] + ret i64 %p +} + +define i64 @multi_exit_count_with_udiv_by_value_in_latch_different_bounds_divisor_non_zero_may_be_poison(ptr %dst, i64 %N, i64 %M) { +; CHECK-LABEL: define i64 @multi_exit_count_with_udiv_by_value_in_latch_different_bounds_divisor_non_zero_may_be_poison( +; CHECK-SAME: ptr [[DST:%.*]], i64 [[N:%.*]], i64 [[M:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[M_1:%.*]] = call i64 @llvm.umax.i64(i64 [[M]], i64 1) +; CHECK-NEXT: [[TMP9:%.*]] = freeze i64 [[M_1]] +; CHECK-NEXT: [[TMP10:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP9]], i64 1) +; CHECK-NEXT: [[TMP0:%.*]] = udiv i64 42, [[TMP10]] +; CHECK-NEXT: [[TMP1:%.*]] = freeze i64 [[TMP0]] +; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 0) +; CHECK-NEXT: [[UMIN:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP1]], i64 [[SMAX]]) +; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[UMIN]], 1 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[TMP2]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 4 +; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = select i1 [[TMP3]], i64 4, i64 [[N_MOD_VF]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP2]], [[TMP4]] +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 0 +; CHECK-NEXT: store <4 x i32> <i32 1, i32 1, i32 1, i32 1>, ptr [[TMP7]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: br label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[LOOP_HEADER:%.*]] +; CHECK: loop.header: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ] +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[IV]] +; CHECK-NEXT: store i32 1, ptr [[GEP]], align 4 +; 
CHECK-NEXT: [[C_0:%.*]] = icmp slt i64 [[IV]], [[N]] +; CHECK-NEXT: br i1 [[C_0]], label [[LOOP_LATCH]], label [[EXIT:%.*]] +; CHECK: loop.latch: +; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-NEXT: [[D:%.*]] = udiv i64 42, [[M_1]] +; CHECK-NEXT: [[C_1:%.*]] = icmp slt i64 [[IV]], [[D]] +; CHECK-NEXT: br i1 [[C_1]], label [[LOOP_HEADER]], label [[EXIT]], !llvm.loop [[LOOP31:![0-9]+]] +; CHECK: exit: +; CHECK-NEXT: [[P:%.*]] = phi i64 [ 1, [[LOOP_HEADER]] ], [ 0, [[LOOP_LATCH]] ] +; CHECK-NEXT: ret i64 [[P]] +; +entry: + %M.1 = call i64 @llvm.umax.i64(i64 %M, i64 1) + br label %loop.header + +loop.header: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ] + %gep = getelementptr inbounds i32, ptr %dst, i64 %iv + store i32 1, ptr %gep + %c.0 = icmp slt i64 %iv, %N + br i1 %c.0, label %loop.latch, label %exit + +loop.latch: + %iv.next = add i64 %iv, 1 + %d = udiv i64 42, %M.1 + %c.1 = icmp slt i64 %iv, %d + br i1 %c.1, label %loop.header, label %exit + +exit: + %p = phi i64 [ 1, %loop.header ], [ 0, %loop.latch] + ret i64 %p +} + +declare i64 @llvm.umax.i64(i64, i64) + ;. ; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} @@ -1100,4 +1366,10 @@ exit: ; CHECK: [[LOOP23]] = distinct !{[[LOOP23]], [[META2]], [[META1]]} ; CHECK: [[LOOP24]] = distinct !{[[LOOP24]], [[META1]], [[META2]]} ; CHECK: [[LOOP25]] = distinct !{[[LOOP25]], [[META2]], [[META1]]} +; CHECK: [[LOOP26]] = distinct !{[[LOOP26]], [[META1]], [[META2]]} +; CHECK: [[LOOP27]] = distinct !{[[LOOP27]], [[META2]], [[META1]]} +; CHECK: [[LOOP28]] = distinct !{[[LOOP28]], [[META1]], [[META2]]} +; CHECK: [[LOOP29]] = distinct !{[[LOOP29]], [[META2]], [[META1]]} +; CHECK: [[LOOP30]] = distinct !{[[LOOP30]], [[META1]], [[META2]]} +; CHECK: [[LOOP31]] = distinct !{[[LOOP31]], [[META2]], [[META1]]} ;. diff --git a/llvm/test/Transforms/LoopVectorize/unsupported_early_exit.ll b/llvm/test/Transforms/LoopVectorize/unsupported_early_exit.ll new file mode 100644 index 0000000000000000000000000000000000000000..cd91d07120f9ee5c39c380fef5375e6a51fcfe0e --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/unsupported_early_exit.ll @@ -0,0 +1,494 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt -S < %s -p loop-vectorize | FileCheck %s + +declare void @init_mem(ptr, i64); + + +; The early exit (i.e. unknown exit-not-taken count) is the latch - we don't +; support this yet. 
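+; In @early_exit_on_last_block below, the uncountable compare sits in the +; latch block 'search', so the early exit leaves the loop from the latch +; itself rather than from an earlier block.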
+define i64 @early_exit_on_last_block() { +; CHECK-LABEL: define i64 @early_exit_on_last_block() { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 1 +; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1 +; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) +; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) +; CHECK-NEXT: br label [[LAND_RHS:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[SEARCH:%.*]] ], [ 3, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[CMP1:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 +; CHECK-NEXT: br i1 [[CMP1]], label [[SEARCH]], label [[FOR_END_LOOPEXIT:%.*]] +; CHECK: search: +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP41:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP42:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 +; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[TMP41]], [[TMP42]] +; CHECK-NEXT: br i1 [[CMP3]], label [[FOR_END_LOOPEXIT]], label [[LAND_RHS]] +; CHECK: loop.end: +; CHECK-NEXT: [[START_0_LCSSA:%.*]] = phi i64 [ 64, [[LAND_RHS]] ], [ [[INDEX]], [[SEARCH]] ] +; CHECK-NEXT: ret i64 [[START_0_LCSSA]] +; +entry: + %p1 = alloca [1024 x i8] + %p2 = alloca [1024 x i8] + call void @init_mem(ptr %p1, i64 1024) + call void @init_mem(ptr %p2, i64 1024) + br label %loop + +loop: + %index = phi i64 [ %index.next, %search ], [ 3, %entry ] + %index.next = add i64 %index, 1 + %exitcond = icmp ne i64 %index.next, 67 + br i1 %exitcond, label %search, label %loop.end + +search: + %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index + %ld1 = load i8, ptr %arrayidx, align 1 + %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index + %ld2 = load i8, ptr %arrayidx1, align 1 + %cmp3 = icmp eq i8 %ld1, %ld2 + br i1 %cmp3, label %loop.end, label %loop + +loop.end: + %retval = phi i64 [ 64, %loop ], [ %index, %search ] + ret i64 %retval +} + + +; We don't currently support multiple early exits. 
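+; In @multiple_uncountable_exits below, 'search1' and 'search2' can each +; branch straight to 'loop.end', giving two uncountable early exits on top +; of the countable exit in 'loop.inc'.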
+define i64 @multiple_uncountable_exits() { +; CHECK-LABEL: define i64 @multiple_uncountable_exits() { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 1 +; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1 +; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) +; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) +; CHECK-NEXT: br label [[SEARCH1:%.*]] +; CHECK: search1: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] +; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] +; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 +; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i8 [[LD1]], [[LD2]] +; CHECK-NEXT: br i1 [[CMP1]], label [[LOOP_END:%.*]], label [[SEARCH2:%.*]] +; CHECK: search2: +; CHECK-NEXT: [[CMP2:%.*]] = icmp ult i8 [[LD1]], 34 +; CHECK-NEXT: br i1 [[CMP2]], label [[LOOP_END]], label [[LOOP_INC]] +; CHECK: loop.inc: +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[SEARCH1]], label [[LOOP_END]] +; CHECK: loop.end: +; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[SEARCH1]] ], [ 100, [[SEARCH2]] ], [ 43, [[LOOP_INC]] ] +; CHECK-NEXT: ret i64 [[RETVAL]] +; +entry: + %p1 = alloca [1024 x i8] + %p2 = alloca [1024 x i8] + call void @init_mem(ptr %p1, i64 1024) + call void @init_mem(ptr %p2, i64 1024) + br label %search1 + +search1: + %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] + %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index + %ld1 = load i8, ptr %arrayidx, align 1 + %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index + %ld2 = load i8, ptr %arrayidx1, align 1 + %cmp1 = icmp eq i8 %ld1, %ld2 + br i1 %cmp1, label %loop.end, label %search2 + +search2: + %cmp2 = icmp ult i8 %ld1, 34 + br i1 %cmp2, label %loop.end, label %loop.inc + +loop.inc: + %index.next = add i64 %index, 1 + %exitcond = icmp ne i64 %index.next, 67 + br i1 %exitcond, label %search1, label %loop.end + +loop.end: + %retval = phi i64 [ %index, %search1 ], [ 100, %search2 ], [ 43, %loop.inc ] + ret i64 %retval +} + + +define i64 @uncountable_exit_infinite_loop() { +; CHECK-LABEL: define i64 @uncountable_exit_infinite_loop() { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 1 +; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1 +; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) +; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] +; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] +; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 +; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]] +; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END:%.*]] +; CHECK: loop.inc: +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 +; CHECK-NEXT: br label [[LOOP]] +; CHECK: loop.end: +; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ] +; 
CHECK-NEXT: ret i64 [[RETVAL]] +; +entry: + %p1 = alloca [1024 x i8] + %p2 = alloca [1024 x i8] + call void @init_mem(ptr %p1, i64 1024) + call void @init_mem(ptr %p2, i64 1024) + br label %loop + +loop: + %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] + %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index + %ld1 = load i8, ptr %arrayidx, align 1 + %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index + %ld2 = load i8, ptr %arrayidx1, align 1 + %cmp3 = icmp eq i8 %ld1, %ld2 + br i1 %cmp3, label %loop.inc, label %loop.end + +loop.inc: + %index.next = add i64 %index, 1 + %exitcond = icmp ne i64 %index.next, 67 + br label %loop + +loop.end: + %retval = phi i64 [ %index, %loop ] + ret i64 %retval +} + + +define i64 @loop_contains_unsafe_call() { +; CHECK-LABEL: define i64 @loop_contains_unsafe_call() { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 1 +; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1 +; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) +; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[P1]], i64 [[INDEX]] +; CHECK-NEXT: [[LD1:%.*]] = load i32, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[BAD_CALL:%.*]] = call i32 @foo(i32 [[LD1]]) #[[ATTR1:[0-9]+]] +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[BAD_CALL]], 34 +; CHECK-NEXT: br i1 [[CMP]], label [[LOOP_INC]], label [[LOOP_END:%.*]] +; CHECK: loop.inc: +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] +; CHECK: loop.end: +; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ 67, [[LOOP_INC]] ] +; CHECK-NEXT: ret i64 [[RETVAL]] +; +entry: + %p1 = alloca [1024 x i8] + %p2 = alloca [1024 x i8] + call void @init_mem(ptr %p1, i64 1024) + call void @init_mem(ptr %p2, i64 1024) + br label %loop + +loop: + %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] + %arrayidx = getelementptr inbounds i32, ptr %p1, i64 %index + %ld1 = load i32, ptr %arrayidx, align 1 + %bad_call = call i32 @foo(i32 %ld1) #0 + %cmp = icmp eq i32 %bad_call, 34 + br i1 %cmp, label %loop.inc, label %loop.end + +loop.inc: + %index.next = add i64 %index, 1 + %exitcond = icmp ne i64 %index.next, 67 + br i1 %exitcond, label %loop, label %loop.end + +loop.end: + %retval = phi i64 [ %index, %loop ], [ 67, %loop.inc ] + ret i64 %retval +} + + +define i64 @loop_contains_unsafe_div() { +; CHECK-LABEL: define i64 @loop_contains_unsafe_div() { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 1 +; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1 +; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) +; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] +; CHECK-NEXT: [[LD1:%.*]] = load i32, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[DIV:%.*]] = udiv i32 20000, [[LD1]] +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[DIV]], 1 +; CHECK-NEXT: br i1 [[CMP]], label [[LOOP_INC]], label [[LOOP_END:%.*]] +; CHECK: loop.inc: +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 +; 
CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] +; CHECK: loop.end: +; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ 67, [[LOOP_INC]] ] +; CHECK-NEXT: ret i64 [[RETVAL]] +; +entry: + %p1 = alloca [1024 x i8] + %p2 = alloca [1024 x i8] + call void @init_mem(ptr %p1, i64 1024) + call void @init_mem(ptr %p2, i64 1024) + br label %loop + +loop: + %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] + %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index + %ld1 = load i32, ptr %arrayidx, align 1 + %div = udiv i32 20000, %ld1 + %cmp = icmp eq i32 %div, 1 + br i1 %cmp, label %loop.inc, label %loop.end + +loop.inc: + %index.next = add i64 %index, 1 + %exitcond = icmp ne i64 %index.next, 67 + br i1 %exitcond, label %loop, label %loop.end + +loop.end: + %retval = phi i64 [ %index, %loop ], [ 67, %loop.inc ] + ret i64 %retval +} + + +define i64 @loop_contains_store(ptr %dest) { +; CHECK-LABEL: define i64 @loop_contains_store( +; CHECK-SAME: ptr [[DEST:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 1 +; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[P1]], i64 [[INDEX]] +; CHECK-NEXT: [[LD1:%.*]] = load i32, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[DEST]], i64 [[INDEX]] +; CHECK-NEXT: store i32 [[LD1]], ptr [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[LD1]], 1 +; CHECK-NEXT: br i1 [[CMP]], label [[LOOP_INC]], label [[LOOP_END:%.*]] +; CHECK: loop.inc: +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] +; CHECK: loop.end: +; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ 67, [[LOOP_INC]] ] +; CHECK-NEXT: ret i64 [[RETVAL]] +; +entry: + %p1 = alloca [1024 x i8] + call void @init_mem(ptr %p1, i64 1024) + br label %loop + +loop: + %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] + %arrayidx = getelementptr inbounds i32, ptr %p1, i64 %index + %ld1 = load i32, ptr %arrayidx, align 1 + %arrayidx2 = getelementptr inbounds i32, ptr %dest, i64 %index + store i32 %ld1, ptr %arrayidx2, align 4 + %cmp = icmp eq i32 %ld1, 1 + br i1 %cmp, label %loop.inc, label %loop.end + +loop.inc: + %index.next = add i64 %index, 1 + %exitcond = icmp ne i64 %index.next, 67 + br i1 %exitcond, label %loop, label %loop.end + +loop.end: + %retval = phi i64 [ %index, %loop ], [ 67, %loop.inc ] + ret i64 %retval +} + + +define i64 @uncountable_exit_in_conditional_block(ptr %mask) { +; CHECK-LABEL: define i64 @uncountable_exit_in_conditional_block( +; CHECK-SAME: ptr [[MASK:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 1 +; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1 +; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) +; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[MASK]], i64 [[INDEX]] +; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 
+; CHECK-NEXT: [[CMP1:%.*]] = icmp ne i8 [[LD1]], 0 +; CHECK-NEXT: br i1 [[CMP1]], label [[LOOP_SEARCH:%.*]], label [[LOOP_INC]] +; CHECK: loop.search: +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] +; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1 +; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] +; CHECK-NEXT: [[LD3:%.*]] = load i8, ptr [[ARRAYIDX3]], align 1 +; CHECK-NEXT: [[CMP2:%.*]] = icmp eq i8 [[LD2]], [[LD3]] +; CHECK-NEXT: br i1 [[CMP2]], label [[LOOP_INC]], label [[LOOP_END:%.*]] +; CHECK: loop.inc: +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] +; CHECK: loop.end: +; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP_SEARCH]] ], [ 67, [[LOOP_INC]] ] +; CHECK-NEXT: ret i64 [[RETVAL]] +; +entry: + %p1 = alloca [1024 x i8] + %p2 = alloca [1024 x i8] + call void @init_mem(ptr %p1, i64 1024) + call void @init_mem(ptr %p2, i64 1024) + br label %loop + +loop: + %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] + %arrayidx1 = getelementptr inbounds i8, ptr %mask, i64 %index + %ld1 = load i8, ptr %arrayidx1, align 1 + %cmp1 = icmp ne i8 %ld1, 0 + br i1 %cmp1, label %loop.search, label %loop.inc + +loop.search: + %arrayidx2 = getelementptr inbounds i8, ptr %p1, i64 %index + %ld2 = load i8, ptr %arrayidx2, align 1 + %arrayidx3 = getelementptr inbounds i8, ptr %p2, i64 %index + %ld3 = load i8, ptr %arrayidx3, align 1 + %cmp2 = icmp eq i8 %ld2, %ld3 + br i1 %cmp2, label %loop.inc, label %loop.end + +loop.inc: + %index.next = add i64 %index, 1 + %exitcond = icmp ne i64 %index.next, 67 + br i1 %exitcond, label %loop, label %loop.end + +loop.end: + %retval = phi i64 [ %index, %loop.search ], [ 67, %loop.inc ] + ret i64 %retval +} + + +define i64 @same_exit_block_pre_inc_use1_with_reduction() { +; CHECK-LABEL: define i64 @same_exit_block_pre_inc_use1_with_reduction() { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 1 +; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1 +; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) +; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) +; CHECK-NEXT: br label [[LAND_RHS:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[FOR_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[RED:%.*]] = phi i64 [ [[RED_NEXT:%.*]], [[FOR_INC]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP38:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP39:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 +; CHECK-NEXT: [[LD2_ZEXT:%.*]] = zext i8 [[TMP39]] to i64 +; CHECK-NEXT: [[RED_NEXT]] = add i64 [[RED]], [[LD2_ZEXT]] +; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[TMP38]], [[TMP39]] +; CHECK-NEXT: br i1 [[CMP3]], label [[FOR_INC]], label [[FOR_END_LOOPEXIT:%.*]] +; CHECK: loop.inc: +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LAND_RHS]], label [[FOR_END_LOOPEXIT]] +; CHECK: loop.end: +; CHECK-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi i64 [ [[RED_NEXT]], [[FOR_INC]] ], [ [[RED_NEXT]], [[LAND_RHS]] ] +; CHECK-NEXT: [[FINAL_IND:%.*]] = phi i64 [ [[INDEX]], [[LAND_RHS]] ], [ 67, [[FOR_INC]] ] +; 
CHECK-NEXT: [[START_0_LCSSA:%.*]] = add i64 [[RED_NEXT_LCSSA]], [[FINAL_IND]] +; CHECK-NEXT: ret i64 [[START_0_LCSSA]] +; +entry: + %p1 = alloca [1024 x i8] + %p2 = alloca [1024 x i8] + call void @init_mem(ptr %p1, i64 1024) + call void @init_mem(ptr %p2, i64 1024) + br label %loop + +loop: + %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] + %red = phi i64 [ %red.next, %loop.inc ], [ 0, %entry ] + %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index + %ld1 = load i8, ptr %arrayidx, align 1 + %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index + %ld2 = load i8, ptr %arrayidx1, align 1 + %ld2.zext = zext i8 %ld2 to i64 + %red.next = add i64 %red, %ld2.zext + %cmp3 = icmp eq i8 %ld1, %ld2 + br i1 %cmp3, label %loop.inc, label %loop.end + +loop.inc: + %index.next = add i64 %index, 1 + %exitcond = icmp ne i64 %index.next, 67 + br i1 %exitcond, label %loop, label %loop.end + +loop.end: + %final.ind = phi i64 [ %index, %loop ], [ 67, %loop.inc ] + %retval = add i64 %red.next, %final.ind + ret i64 %retval +} + + +define i64 @uncountable_exit_has_multiple_outside_successors() { +; CHECK-LABEL: define i64 @uncountable_exit_has_multiple_outside_successors() { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 1 +; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] +; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: switch i8 [[LD1]], label [[LOOP_INC]] [ +; CHECK-NEXT: i8 2, label [[LOOP_END:%.*]] +; CHECK-NEXT: i8 3, label [[LOOP_SURPRISE:%.*]] +; CHECK-NEXT: ] +; CHECK: loop.inc: +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] +; CHECK: loop.surprise: +; CHECK-NEXT: ret i64 3 +; CHECK: loop.end: +; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ 67, [[LOOP_INC]] ] +; CHECK-NEXT: ret i64 [[RETVAL]] +; +entry: + %p1 = alloca [1024 x i8] + call void @init_mem(ptr %p1, i64 1024) + br label %loop + +loop: + %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] + %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index + %ld1 = load i8, ptr %arrayidx, align 1 + switch i8 %ld1, label %loop.inc [ + i8 2, label %loop.end + i8 3, label %loop.surprise + ] + +loop.inc: + %index.next = add i64 %index, 1 + %exitcond = icmp ne i64 %index.next, 67 + br i1 %exitcond, label %loop, label %loop.end + +loop.surprise: + ret i64 3 + +loop.end: + %retval = phi i64 [ %index, %loop ], [ 67, %loop.inc ] + ret i64 %retval +} + + +declare i32 @foo(i32) readonly +declare <vscale x 4 x i32> @foo_vec(<vscale x 4 x i32>) + +attributes #0 = { "vector-function-abi-variant"="_ZGVsNxv_foo(foo_vec)" } diff --git a/llvm/test/Transforms/LoopVectorize/use-iv-start-value.ll b/llvm/test/Transforms/LoopVectorize/use-iv-start-value.ll index 3d2c2e5e9b5761b77a4478a3143ea13b409db913..7a5a8bfb1a99567e88b4b9a2912e6a1a457ddf4b 100644 --- a/llvm/test/Transforms/LoopVectorize/use-iv-start-value.ll +++ b/llvm/test/Transforms/LoopVectorize/use-iv-start-value.ll @@ -19,12 +19,6 @@ define i64 @foo(ptr %p1, ptr %p2, i64 %start, i64 %end) { ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -;
CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 [[START2]], [[INDEX]] -; CHECK-NEXT: [[IND:%.*]] = add i64 [[OFFSET_IDX]], 0 -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[P1]], i64 [[IND]] -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX1]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[P2]], i64 [[IND]] -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/vplan-unused-interleave-group.ll b/llvm/test/Transforms/LoopVectorize/vplan-unused-interleave-group.ll index 5ea27994b356dae2ef48d07215e49f388479c125..27d81de260d3b941ee4fd12352baf4b664ead961 100644 --- a/llvm/test/Transforms/LoopVectorize/vplan-unused-interleave-group.ll +++ b/llvm/test/Transforms/LoopVectorize/vplan-unused-interleave-group.ll @@ -18,9 +18,9 @@ define void @test_unused_interleave(ptr %src, i32 %length) { ; CHECK-EMPTY: ; CHECK-NEXT: vector loop: { ; CHECK-NEXT: vector.body: -; CHECK-NEXT: EMIT vp<%2> = CANONICAL-INDUCTION ir<0>, vp<%3> -; CHECK-NEXT: EMIT vp<%3> = add nuw vp<%2>, vp<%0> -; CHECK-NEXT: EMIT branch-on-count vp<%3>, vp<%1> +; CHECK-NEXT: EMIT vp<%2> = CANONICAL-INDUCTION ir<0>, vp<%index.next> +; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<%2>, vp<%0> +; CHECK-NEXT: EMIT branch-on-count vp<%index.next>, vp<%1> ; CHECK-NEXT: No successors ; CHECK-NEXT: } entry: diff --git a/llvm/test/Transforms/LoopVersioning/wrapping-pointer-non-integral-addrspace.ll b/llvm/test/Transforms/LoopVersioning/wrapping-pointer-non-integral-addrspace.ll index 430baa1cb4f8c1d78528b0c6611f694acbdcdf69..5abdde9e0564e5c604db34076c9476e18efe836b 100644 --- a/llvm/test/Transforms/LoopVersioning/wrapping-pointer-non-integral-addrspace.ll +++ b/llvm/test/Transforms/LoopVersioning/wrapping-pointer-non-integral-addrspace.ll @@ -1,4 +1,5 @@ -; RUN: opt -passes=loop-versioning -S < %s | FileCheck %s -check-prefix=LV +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -passes=loop-versioning -S < %s | FileCheck %s ; NB: addrspaces 10-13 are non-integral target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128-ni:10:11:12:13" @@ -12,40 +13,113 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128-ni:10:11:12:13" declare i64 @julia_steprange_last_4949() -define void @"japi1_align!_9477"(ptr %arg) { -; LV-LAVEL: L26.lver.check -; LV: [[OFMul:%[^ ]*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 4, i64 [[Step:%[^ ]*]]) -; LV-NEXT: [[OFMulResult:%[^ ]*]] = extractvalue { i64, i1 } [[OFMul]], 0 -; LV-NEXT: [[OFMulOverflow:%[^ ]*]] = extractvalue { i64, i1 } [[OFMul]], 1 -; LV: [[OFNegMulResult:%[^ ]*]] = sub i64 0, [[OFMulResult]] -; LV-NEXT: [[NegGEP:%[^ ]*]] = getelementptr i8, ptr addrspace(13) [[Base:%[^ ]*]], i64 [[OFNegMulResult]] -; LV-NEXT: icmp ugt ptr addrspace(13) [[NegGEP]], [[Base]] -; LV-NOT: inttoptr -; LV-NOT: ptrtoint +define void @wrapping_ptr_nonint_addrspace(ptr %arg) { +; CHECK-LABEL: define void @wrapping_ptr_nonint_addrspace( +; CHECK-SAME: ptr [[ARG:%.*]]) { +; CHECK-NEXT: [[LOOP_LVER_CHECK:.*:]] +; CHECK-NEXT: [[LOAD0:%.*]] = load ptr addrspace(10), ptr [[ARG]], align 8 +; CHECK-NEXT: [[LOAD1:%.*]] = load i32, ptr inttoptr (i64 12 to ptr), align 4 +; CHECK-NEXT: [[SUB:%.*]] = sub i32 
0, [[LOAD1]] +; CHECK-NEXT: [[CALL:%.*]] = call i64 @julia_steprange_last_4949() +; CHECK-NEXT: [[CAST0:%.*]] = addrspacecast ptr addrspace(10) [[LOAD0]] to ptr addrspace(11) +; CHECK-NEXT: [[LOAD2:%.*]] = load ptr addrspace(10), ptr addrspace(11) [[CAST0]], align 8 +; CHECK-NEXT: [[CAST1:%.*]] = addrspacecast ptr addrspace(10) [[LOAD2]] to ptr addrspace(11) +; CHECK-NEXT: [[LOAD3:%.*]] = load ptr addrspace(13), ptr addrspace(11) [[CAST1]], align 8 +; CHECK-NEXT: [[SEXT:%.*]] = sext i32 [[SUB]] to i64 +; CHECK-NEXT: [[TMP0:%.*]] = shl i64 [[CALL]], 2 +; CHECK-NEXT: [[TMP1:%.*]] = shl nsw i64 [[SEXT]], 2 +; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[TMP0]], [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[TMP2]], -4 +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr addrspace(13) [[LOAD3]], i64 [[TMP3]] +; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr addrspace(13) [[LOAD3]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[TMP0]], -4 +; CHECK-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr addrspace(13) [[LOAD3]], i64 [[TMP4]] +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr addrspace(13) [[SCEVGEP]], [[LOAD3]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr addrspace(13) [[SCEVGEP2]], [[SCEVGEP1]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: [[TMP5:%.*]] = sub i64 0, [[CALL]] +; CHECK-NEXT: [[TMP6:%.*]] = shl nsw i64 [[SEXT]], 2 +; CHECK-NEXT: [[TMP7:%.*]] = add nsw i64 [[TMP6]], -4 +; CHECK-NEXT: [[SCEVGEP3:%.*]] = getelementptr i8, ptr addrspace(13) [[LOAD3]], i64 [[TMP7]] +; CHECK-NEXT: [[MUL:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 4, i64 [[TMP5]]) +; CHECK-NEXT: [[MUL_RESULT:%.*]] = extractvalue { i64, i1 } [[MUL]], 0 +; CHECK-NEXT: [[MUL_OVERFLOW:%.*]] = extractvalue { i64, i1 } [[MUL]], 1 +; CHECK-NEXT: [[TMP8:%.*]] = sub i64 0, [[MUL_RESULT]] +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr addrspace(13) [[SCEVGEP3]], i64 [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = icmp ugt ptr addrspace(13) [[TMP9]], [[SCEVGEP3]] +; CHECK-NEXT: [[TMP11:%.*]] = or i1 [[TMP10]], [[MUL_OVERFLOW]] +; CHECK-NEXT: [[SCEVGEP4:%.*]] = getelementptr i8, ptr addrspace(13) [[LOAD3]], i64 -4 +; CHECK-NEXT: [[MUL5:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 4, i64 [[TMP5]]) +; CHECK-NEXT: [[MUL_RESULT6:%.*]] = extractvalue { i64, i1 } [[MUL5]], 0 +; CHECK-NEXT: [[MUL_OVERFLOW7:%.*]] = extractvalue { i64, i1 } [[MUL5]], 1 +; CHECK-NEXT: [[TMP12:%.*]] = sub i64 0, [[MUL_RESULT6]] +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr addrspace(13) [[SCEVGEP4]], i64 [[TMP12]] +; CHECK-NEXT: [[TMP14:%.*]] = icmp ugt ptr addrspace(13) [[TMP13]], [[SCEVGEP4]] +; CHECK-NEXT: [[TMP15:%.*]] = or i1 [[TMP14]], [[MUL_OVERFLOW7]] +; CHECK-NEXT: [[TMP16:%.*]] = or i1 [[TMP11]], [[TMP15]] +; CHECK-NEXT: [[LVER_SAFE:%.*]] = or i1 [[FOUND_CONFLICT]], [[TMP16]] +; CHECK-NEXT: br i1 [[LVER_SAFE]], label %[[LOOP_PH_LVER_ORIG:.*]], label %[[LOOP_PH:.*]] +; CHECK: [[LOOP_PH_LVER_ORIG]]: +; CHECK-NEXT: br label %[[LOOP_LVER_ORIG:.*]] +; CHECK: [[LOOP_LVER_ORIG]]: +; CHECK-NEXT: [[VALUE_PHI3_LVER_ORIG:%.*]] = phi i64 [ 0, %[[LOOP_PH_LVER_ORIG]] ], [ [[ADD0_LVER_ORIG:%.*]], %[[LOOP_LVER_ORIG]] ] +; CHECK-NEXT: [[ADD0_LVER_ORIG]] = add i64 [[VALUE_PHI3_LVER_ORIG]], -1 +; CHECK-NEXT: [[GEP0_LVER_ORIG:%.*]] = getelementptr inbounds i32, ptr addrspace(13) [[LOAD3]], i64 [[ADD0_LVER_ORIG]] +; CHECK-NEXT: [[LOAD4_LVER_ORIG:%.*]] = load i32, ptr addrspace(13) [[GEP0_LVER_ORIG]], align 4 +; CHECK-NEXT: [[ADD1_LVER_ORIG:%.*]] = add i64 [[ADD0_LVER_ORIG]], [[SEXT]] +; 
CHECK-NEXT: [[GEP1_LVER_ORIG:%.*]] = getelementptr inbounds i32, ptr addrspace(13) [[LOAD3]], i64 [[ADD1_LVER_ORIG]] +; CHECK-NEXT: store i32 [[LOAD4_LVER_ORIG]], ptr addrspace(13) [[GEP1_LVER_ORIG]], align 4 +; CHECK-NEXT: [[CMP_LVER_ORIG:%.*]] = icmp eq i64 [[VALUE_PHI3_LVER_ORIG]], [[CALL]] +; CHECK-NEXT: br i1 [[CMP_LVER_ORIG]], label %[[EXIT_LOOPEXIT:.*]], label %[[LOOP_LVER_ORIG]] +; CHECK: [[LOOP_PH]]: +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[VALUE_PHI3:%.*]] = phi i64 [ 0, %[[LOOP_PH]] ], [ [[ADD0:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[ADD0]] = add i64 [[VALUE_PHI3]], -1 +; CHECK-NEXT: [[GEP0:%.*]] = getelementptr inbounds i32, ptr addrspace(13) [[LOAD3]], i64 [[ADD0]] +; CHECK-NEXT: [[LOAD4:%.*]] = load i32, ptr addrspace(13) [[GEP0]], align 4, !alias.scope [[META0:![0-9]+]] +; CHECK-NEXT: [[ADD1:%.*]] = add i64 [[ADD0]], [[SEXT]] +; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds i32, ptr addrspace(13) [[LOAD3]], i64 [[ADD1]] +; CHECK-NEXT: store i32 [[LOAD4]], ptr addrspace(13) [[GEP1]], align 4, !alias.scope [[META3:![0-9]+]], !noalias [[META0]] +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[VALUE_PHI3]], [[CALL]] +; CHECK-NEXT: br i1 [[CMP]], label %[[EXIT_LOOPEXIT8:.*]], label %[[LOOP]] +; CHECK: [[EXIT_LOOPEXIT]]: +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[EXIT_LOOPEXIT8]]: +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; top: - %tmp = load ptr addrspace(10), ptr %arg, align 8 - %tmp1 = load i32, ptr inttoptr (i64 12 to ptr), align 4 - %tmp2 = sub i32 0, %tmp1 - %tmp3 = call i64 @julia_steprange_last_4949() - %tmp4 = addrspacecast ptr addrspace(10) %tmp to ptr addrspace(11) - %tmp6 = load ptr addrspace(10), ptr addrspace(11) %tmp4, align 8 - %tmp7 = addrspacecast ptr addrspace(10) %tmp6 to ptr addrspace(11) - %tmp9 = load ptr addrspace(13), ptr addrspace(11) %tmp7, align 8 - %tmp10 = sext i32 %tmp2 to i64 - br label %L26 + %load0 = load ptr addrspace(10), ptr %arg, align 8 + %load1 = load i32, ptr inttoptr (i64 12 to ptr), align 4 + %sub = sub i32 0, %load1 + %call = call i64 @julia_steprange_last_4949() + %cast0 = addrspacecast ptr addrspace(10) %load0 to ptr addrspace(11) + %load2 = load ptr addrspace(10), ptr addrspace(11) %cast0, align 8 + %cast1 = addrspacecast ptr addrspace(10) %load2 to ptr addrspace(11) + %load3 = load ptr addrspace(13), ptr addrspace(11) %cast1, align 8 + %sext = sext i32 %sub to i64 + br label %loop -L26: - %value_phi3 = phi i64 [ 0, %top ], [ %tmp11, %L26 ] - %tmp11 = add i64 %value_phi3, -1 - %tmp12 = getelementptr inbounds i32, ptr addrspace(13) %tmp9, i64 %tmp11 - %tmp13 = load i32, ptr addrspace(13) %tmp12, align 4 - %tmp14 = add i64 %tmp11, %tmp10 - %tmp15 = getelementptr inbounds i32, ptr addrspace(13) %tmp9, i64 %tmp14 - store i32 %tmp13, ptr addrspace(13) %tmp15, align 4 - %tmp16 = icmp eq i64 %value_phi3, %tmp3 - br i1 %tmp16, label %L45, label %L26 +loop: + %value_phi3 = phi i64 [ 0, %top ], [ %add0, %loop ] + %add0 = add i64 %value_phi3, -1 + %gep0 = getelementptr inbounds i32, ptr addrspace(13) %load3, i64 %add0 + %load4 = load i32, ptr addrspace(13) %gep0, align 4 + %add1 = add i64 %add0, %sext + %gep1 = getelementptr inbounds i32, ptr addrspace(13) %load3, i64 %add1 + store i32 %load4, ptr addrspace(13) %gep1, align 4 + %cmp = icmp eq i64 %value_phi3, %call + br i1 %cmp, label %exit, label %loop -L45: +exit: ret void } - +;. 
+; CHECK: [[META0]] = !{[[META1:![0-9]+]]} +; CHECK: [[META1]] = distinct !{[[META1]], [[META2:![0-9]+]]} +; CHECK: [[META2]] = distinct !{[[META2]], !"LVerDomain"} +; CHECK: [[META3]] = !{[[META4:![0-9]+]]} +; CHECK: [[META4]] = distinct !{[[META4]], [[META2]]} +;. diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/matrix-extract-insert.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/matrix-extract-insert.ll index db0656da579f4b590f2b6a1ccc6096a365bc4df3..ec46ef959ce69e37b09069c35325ef4d59739c53 100644 --- a/llvm/test/Transforms/PhaseOrdering/AArch64/matrix-extract-insert.ll +++ b/llvm/test/Transforms/PhaseOrdering/AArch64/matrix-extract-insert.ll @@ -10,13 +10,13 @@ define void @matrix_extract_insert_scalar(i32 %i, i32 %k, i32 %j, ptr nonnull al ; CHECK-NEXT: [[CONV1:%.*]] = zext i32 [[J:%.*]] to i64 ; CHECK-NEXT: [[TMP0:%.*]] = mul nuw nsw i64 [[CONV1]], 15 ; CHECK-NEXT: [[TMP1:%.*]] = add nuw nsw i64 [[TMP0]], [[CONV]] -; CHECK-NEXT: [[TMP2:%.*]] = icmp ult i64 [[TMP1]], 225 +; CHECK-NEXT: [[TMP2:%.*]] = icmp samesign ult i64 [[TMP1]], 225 ; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP2]]) ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds <225 x double>, ptr [[A:%.*]], i64 0, i64 [[TMP1]] ; CHECK-NEXT: [[MATRIXEXT:%.*]] = load double, ptr [[TMP3]], align 8 ; CHECK-NEXT: [[CONV2:%.*]] = zext i32 [[I:%.*]] to i64 ; CHECK-NEXT: [[TMP4:%.*]] = add nuw nsw i64 [[TMP0]], [[CONV2]] -; CHECK-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 225 +; CHECK-NEXT: [[TMP5:%.*]] = icmp samesign ult i64 [[TMP4]], 225 ; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP5]]) ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds <225 x double>, ptr [[B:%.*]], i64 0, i64 [[TMP4]] ; CHECK-NEXT: [[MATRIXEXT4:%.*]] = load double, ptr [[TMP6]], align 8 @@ -94,7 +94,7 @@ define void @matrix_extract_insert_loop(i32 %i, ptr nonnull align 8 dereferencea ; CHECK-NEXT: [[TMP2:%.*]] = icmp ult i32 [[I]], 225 ; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP2]]) ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds <225 x double>, ptr [[B]], i64 0, i64 [[CONV6]] -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[I]], 4 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp samesign ult i32 [[I]], 4 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[FOR_BODY4_US_PREHEADER:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; CHECK: vector.memcheck: ; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[B]], [[SCEVGEP20]] @@ -151,7 +151,7 @@ define void @matrix_extract_insert_loop(i32 %i, ptr nonnull align 8 dereferencea ; CHECK-NEXT: br label [[FOR_BODY4_US:%.*]] ; CHECK: for.body4.us: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY4_US]] ], [ [[INDVARS_IV_PH]], [[FOR_BODY4_US_PREHEADER]] ] -; CHECK-NEXT: [[TMP27:%.*]] = icmp ult i64 [[INDVARS_IV]], 225 +; CHECK-NEXT: [[TMP27:%.*]] = icmp samesign ult i64 [[INDVARS_IV]], 225 ; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP27]]) ; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds <225 x double>, ptr [[A]], i64 0, i64 [[INDVARS_IV]] ; CHECK-NEXT: [[MATRIXEXT_US:%.*]] = load double, ptr [[TMP28]], align 8 @@ -166,10 +166,10 @@ define void @matrix_extract_insert_loop(i32 %i, ptr nonnull align 8 dereferencea ; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US]], label [[FOR_BODY4_US]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK: for.cond1.for.cond.cleanup3_crit_edge.us: ; CHECK-NEXT: [[TMP30:%.*]] = add nuw nsw i64 [[CONV6]], 15 -; CHECK-NEXT: [[TMP31:%.*]] = icmp ult i32 [[I]], 210 +; CHECK-NEXT: [[TMP31:%.*]] = icmp samesign ult i32 
[[I]], 210 ; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP31]]) ; CHECK-NEXT: [[TMP32:%.*]] = getelementptr inbounds <225 x double>, ptr [[B]], i64 0, i64 [[TMP30]] -; CHECK-NEXT: [[MIN_ITERS_CHECK_1:%.*]] = icmp ult i32 [[I]], 4 +; CHECK-NEXT: [[MIN_ITERS_CHECK_1:%.*]] = icmp samesign ult i32 [[I]], 4 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK_1]], label [[FOR_BODY4_US_PREHEADER_1:%.*]], label [[VECTOR_MEMCHECK_1:%.*]] ; CHECK: vector.memcheck.1: ; CHECK-NEXT: [[BOUND0_1:%.*]] = icmp ult ptr [[B]], [[SCEVGEP20]] @@ -228,7 +228,7 @@ define void @matrix_extract_insert_loop(i32 %i, ptr nonnull align 8 dereferencea ; CHECK: for.body4.us.1: ; CHECK-NEXT: [[INDVARS_IV_1:%.*]] = phi i64 [ [[INDVARS_IV_NEXT_1:%.*]], [[FOR_BODY4_US_1]] ], [ [[INDVARS_IV_PH_1]], [[FOR_BODY4_US_PREHEADER_1]] ] ; CHECK-NEXT: [[TMP57:%.*]] = add nuw nsw i64 [[INDVARS_IV_1]], 15 -; CHECK-NEXT: [[TMP58:%.*]] = icmp ult i64 [[INDVARS_IV_1]], 210 +; CHECK-NEXT: [[TMP58:%.*]] = icmp samesign ult i64 [[INDVARS_IV_1]], 210 ; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP58]]) ; CHECK-NEXT: [[TMP59:%.*]] = getelementptr inbounds <225 x double>, ptr [[A]], i64 0, i64 [[TMP57]] ; CHECK-NEXT: [[MATRIXEXT_US_1:%.*]] = load double, ptr [[TMP59]], align 8 @@ -243,10 +243,10 @@ define void @matrix_extract_insert_loop(i32 %i, ptr nonnull align 8 dereferencea ; CHECK-NEXT: br i1 [[EXITCOND_NOT_1]], label [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US_1]], label [[FOR_BODY4_US_1]], !llvm.loop [[LOOP10]] ; CHECK: for.cond1.for.cond.cleanup3_crit_edge.us.1: ; CHECK-NEXT: [[TMP61:%.*]] = add nuw nsw i64 [[CONV6]], 30 -; CHECK-NEXT: [[TMP62:%.*]] = icmp ult i32 [[I]], 195 +; CHECK-NEXT: [[TMP62:%.*]] = icmp samesign ult i32 [[I]], 195 ; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP62]]) ; CHECK-NEXT: [[TMP63:%.*]] = getelementptr inbounds <225 x double>, ptr [[B]], i64 0, i64 [[TMP61]] -; CHECK-NEXT: [[MIN_ITERS_CHECK_2:%.*]] = icmp ult i32 [[I]], 4 +; CHECK-NEXT: [[MIN_ITERS_CHECK_2:%.*]] = icmp samesign ult i32 [[I]], 4 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK_2]], label [[FOR_BODY4_US_PREHEADER_2:%.*]], label [[VECTOR_MEMCHECK_2:%.*]] ; CHECK: vector.memcheck.2: ; CHECK-NEXT: [[BOUND0_2:%.*]] = icmp ult ptr [[B]], [[SCEVGEP20]] @@ -305,7 +305,7 @@ define void @matrix_extract_insert_loop(i32 %i, ptr nonnull align 8 dereferencea ; CHECK: for.body4.us.2: ; CHECK-NEXT: [[INDVARS_IV_2:%.*]] = phi i64 [ [[INDVARS_IV_NEXT_2:%.*]], [[FOR_BODY4_US_2]] ], [ [[INDVARS_IV_PH_2]], [[FOR_BODY4_US_PREHEADER_2]] ] ; CHECK-NEXT: [[TMP88:%.*]] = add nuw nsw i64 [[INDVARS_IV_2]], 30 -; CHECK-NEXT: [[TMP89:%.*]] = icmp ult i64 [[INDVARS_IV_2]], 195 +; CHECK-NEXT: [[TMP89:%.*]] = icmp samesign ult i64 [[INDVARS_IV_2]], 195 ; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP89]]) ; CHECK-NEXT: [[TMP90:%.*]] = getelementptr inbounds <225 x double>, ptr [[A]], i64 0, i64 [[TMP88]] ; CHECK-NEXT: [[MATRIXEXT_US_2:%.*]] = load double, ptr [[TMP90]], align 8 @@ -320,10 +320,10 @@ define void @matrix_extract_insert_loop(i32 %i, ptr nonnull align 8 dereferencea ; CHECK-NEXT: br i1 [[EXITCOND_NOT_2]], label [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US_2]], label [[FOR_BODY4_US_2]], !llvm.loop [[LOOP10]] ; CHECK: for.cond1.for.cond.cleanup3_crit_edge.us.2: ; CHECK-NEXT: [[TMP92:%.*]] = add nuw nsw i64 [[CONV6]], 45 -; CHECK-NEXT: [[TMP93:%.*]] = icmp ult i32 [[I]], 180 +; CHECK-NEXT: [[TMP93:%.*]] = icmp samesign ult i32 [[I]], 180 ; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP93]]) ; CHECK-NEXT: [[TMP94:%.*]] = getelementptr inbounds <225 x double>, ptr [[B]], i64 0, 
i64 [[TMP92]] -; CHECK-NEXT: [[MIN_ITERS_CHECK_3:%.*]] = icmp ult i32 [[I]], 4 +; CHECK-NEXT: [[MIN_ITERS_CHECK_3:%.*]] = icmp samesign ult i32 [[I]], 4 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK_3]], label [[FOR_BODY4_US_PREHEADER_3:%.*]], label [[VECTOR_MEMCHECK_3:%.*]] ; CHECK: vector.memcheck.3: ; CHECK-NEXT: [[BOUND0_3:%.*]] = icmp ult ptr [[B]], [[SCEVGEP20]] @@ -382,7 +382,7 @@ define void @matrix_extract_insert_loop(i32 %i, ptr nonnull align 8 dereferencea ; CHECK: for.body4.us.3: ; CHECK-NEXT: [[INDVARS_IV_3:%.*]] = phi i64 [ [[INDVARS_IV_NEXT_3:%.*]], [[FOR_BODY4_US_3]] ], [ [[INDVARS_IV_PH_3]], [[FOR_BODY4_US_PREHEADER_3]] ] ; CHECK-NEXT: [[TMP119:%.*]] = add nuw nsw i64 [[INDVARS_IV_3]], 45 -; CHECK-NEXT: [[TMP120:%.*]] = icmp ult i64 [[INDVARS_IV_3]], 180 +; CHECK-NEXT: [[TMP120:%.*]] = icmp samesign ult i64 [[INDVARS_IV_3]], 180 ; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP120]]) ; CHECK-NEXT: [[TMP121:%.*]] = getelementptr inbounds <225 x double>, ptr [[A]], i64 0, i64 [[TMP119]] ; CHECK-NEXT: [[MATRIXEXT_US_3:%.*]] = load double, ptr [[TMP121]], align 8 diff --git a/llvm/test/Transforms/PhaseOrdering/X86/ctlz-loop.ll b/llvm/test/Transforms/PhaseOrdering/X86/ctlz-loop.ll index 397c2571b3b66c2044b573387d41d4ae1b0db17d..eb5e279947ecb555f25f415ece5a4cdb1408d624 100644 --- a/llvm/test/Transforms/PhaseOrdering/X86/ctlz-loop.ll +++ b/llvm/test/Transforms/PhaseOrdering/X86/ctlz-loop.ll @@ -32,7 +32,7 @@ define i32 @ctlz_loop_with_abs(i32 %n) { ; CHECK-NEXT: [[I_02:%.*]] = phi i32 [ [[INC:%.*]], [[WHILE_BODY]] ], [ 0, [[WHILE_BODY_PREHEADER]] ] ; CHECK-NEXT: [[TMP1]] = lshr i32 [[N_ADDR_03]], 1 ; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_02]], 1 -; CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp ult i32 [[N_ADDR_03]], 2 +; CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp samesign ult i32 [[N_ADDR_03]], 2 ; CHECK-NEXT: br i1 [[TOBOOL_NOT]], label [[WHILE_END]], label [[WHILE_BODY]] ; CHECK: while.end: ; CHECK-NEXT: [[I_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC]], [[WHILE_BODY]] ] diff --git a/llvm/test/Transforms/PhaseOrdering/gvn-replacement-vs-hoist.ll b/llvm/test/Transforms/PhaseOrdering/gvn-replacement-vs-hoist.ll index 522ebf9dcc04bced1ee135ff12537dceabe98b1f..862f40a9ae2e9828d0f7236d6159d3c983441fe6 100644 --- a/llvm/test/Transforms/PhaseOrdering/gvn-replacement-vs-hoist.ll +++ b/llvm/test/Transforms/PhaseOrdering/gvn-replacement-vs-hoist.ll @@ -26,7 +26,7 @@ define void @test(ptr noundef %a, i32 noundef %beam) { ; CHECK-NEXT: br label [[FOR_INC]] ; CHECK: for.inc: ; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_06]], 1 -; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[I_06]], 9999 +; CHECK-NEXT: [[CMP:%.*]] = icmp samesign ult i32 [[I_06]], 9999 ; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]] ; entry: diff --git a/llvm/test/Transforms/PhaseOrdering/pr95152.ll b/llvm/test/Transforms/PhaseOrdering/pr95152.ll index 16610c439f4c04fc35509ef7e0940cfbb03c5e9b..fff94673a1a5190ed91b495521a2fa0ea9805523 100644 --- a/llvm/test/Transforms/PhaseOrdering/pr95152.ll +++ b/llvm/test/Transforms/PhaseOrdering/pr95152.ll @@ -47,7 +47,7 @@ define void @f(ptr dead_on_unwind noalias %p) { ; CHECK-LABEL: define void @f( ; CHECK-SAME: ptr dead_on_unwind noalias [[P:%.*]]) local_unnamed_addr { ; CHECK-NEXT: store i64 3, ptr [[P]], align 4 -; CHECK-NEXT: tail call void @j(ptr nonnull [[P]]) +; CHECK-NEXT: tail call void @j(ptr nonnull align 8 dereferenceable(8) [[P]]) ; CHECK-NEXT: store i64 43, ptr [[P]], align 4 ; CHECK-NEXT: ret void ; diff --git 
a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll index c9ff2d6426d2b6375628a57b458fa6c4d1b29a47..72e29839230e81dc2e3884d4aaef2b15f54effe3 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll @@ -318,14 +318,22 @@ entry: define float @f(ptr nocapture readonly %x) { ; CHECK-LABEL: @f( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = load <48 x float>, ptr [[X:%.*]], align 4 -; CHECK-NEXT: [[OP_RDX:%.*]] = call fast float @llvm.vector.reduce.fadd.v48f32(float 0.000000e+00, <48 x float> [[TMP0]]) +; CHECK-NEXT: [[TMP0:%.*]] = load <32 x float>, ptr [[X:%.*]], align 4 +; CHECK-NEXT: [[ARRAYIDX_32:%.*]] = getelementptr inbounds float, ptr [[X]], i64 32 +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x float>, ptr [[ARRAYIDX_32]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = call fast float @llvm.vector.reduce.fadd.v32f32(float 0.000000e+00, <32 x float> [[TMP0]]) +; CHECK-NEXT: [[TMP3:%.*]] = call fast float @llvm.vector.reduce.fadd.v16f32(float 0.000000e+00, <16 x float> [[TMP1]]) +; CHECK-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP2]], [[TMP3]] ; CHECK-NEXT: ret float [[OP_RDX]] ; ; THRESHOLD-LABEL: @f( ; THRESHOLD-NEXT: entry: -; THRESHOLD-NEXT: [[TMP0:%.*]] = load <48 x float>, ptr [[X:%.*]], align 4 -; THRESHOLD-NEXT: [[OP_RDX:%.*]] = call fast float @llvm.vector.reduce.fadd.v48f32(float 0.000000e+00, <48 x float> [[TMP0]]) +; THRESHOLD-NEXT: [[TMP0:%.*]] = load <32 x float>, ptr [[X:%.*]], align 4 +; THRESHOLD-NEXT: [[ARRAYIDX_32:%.*]] = getelementptr inbounds float, ptr [[X]], i64 32 +; THRESHOLD-NEXT: [[TMP1:%.*]] = load <16 x float>, ptr [[ARRAYIDX_32]], align 4 +; THRESHOLD-NEXT: [[TMP2:%.*]] = call fast float @llvm.vector.reduce.fadd.v32f32(float 0.000000e+00, <32 x float> [[TMP0]]) +; THRESHOLD-NEXT: [[TMP3:%.*]] = call fast float @llvm.vector.reduce.fadd.v16f32(float 0.000000e+00, <16 x float> [[TMP1]]) +; THRESHOLD-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP2]], [[TMP3]] ; THRESHOLD-NEXT: ret float [[OP_RDX]] ; entry: @@ -598,14 +606,18 @@ define float @loadadd31(ptr nocapture readonly %x) { ; CHECK-LABEL: @loadadd31( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[X:%.*]], i64 1 -; CHECK-NEXT: [[TMP0:%.*]] = load <24 x float>, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = load <16 x float>, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[ARRAYIDX_16:%.*]] = getelementptr inbounds float, ptr [[X]], i64 17 +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x float>, ptr [[ARRAYIDX_16]], align 4 ; CHECK-NEXT: [[ARRAYIDX_24:%.*]] = getelementptr inbounds float, ptr [[X]], i64 25 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[ARRAYIDX_24]], align 4 ; CHECK-NEXT: [[ARRAYIDX_28:%.*]] = getelementptr inbounds float, ptr [[X]], i64 29 ; CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[ARRAYIDX_28]], align 4 ; CHECK-NEXT: [[ARRAYIDX_29:%.*]] = getelementptr inbounds float, ptr [[X]], i64 30 ; CHECK-NEXT: [[TMP4:%.*]] = load float, ptr [[ARRAYIDX_29]], align 4 -; CHECK-NEXT: [[OP_RDX:%.*]] = call fast float @llvm.vector.reduce.fadd.v24f32(float 0.000000e+00, <24 x float> [[TMP0]]) +; CHECK-NEXT: [[TMP5:%.*]] = call fast float @llvm.vector.reduce.fadd.v16f32(float 0.000000e+00, <16 x float> [[TMP0]]) +; CHECK-NEXT: [[TMP6:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> [[TMP1]]) +; CHECK-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP5]], [[TMP6]] ; CHECK-NEXT: [[TMP7:%.*]] = call 
fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP2]]) ; CHECK-NEXT: [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], [[TMP7]] ; CHECK-NEXT: [[OP_RDX2:%.*]] = fadd fast float [[OP_RDX1]], [[TMP3]] @@ -615,14 +627,18 @@ define float @loadadd31(ptr nocapture readonly %x) { ; THRESHOLD-LABEL: @loadadd31( ; THRESHOLD-NEXT: entry: ; THRESHOLD-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[X:%.*]], i64 1 -; THRESHOLD-NEXT: [[TMP0:%.*]] = load <24 x float>, ptr [[ARRAYIDX]], align 4 +; THRESHOLD-NEXT: [[TMP0:%.*]] = load <16 x float>, ptr [[ARRAYIDX]], align 4 +; THRESHOLD-NEXT: [[ARRAYIDX_16:%.*]] = getelementptr inbounds float, ptr [[X]], i64 17 +; THRESHOLD-NEXT: [[TMP1:%.*]] = load <8 x float>, ptr [[ARRAYIDX_16]], align 4 ; THRESHOLD-NEXT: [[ARRAYIDX_24:%.*]] = getelementptr inbounds float, ptr [[X]], i64 25 ; THRESHOLD-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[ARRAYIDX_24]], align 4 ; THRESHOLD-NEXT: [[ARRAYIDX_28:%.*]] = getelementptr inbounds float, ptr [[X]], i64 29 ; THRESHOLD-NEXT: [[TMP3:%.*]] = load float, ptr [[ARRAYIDX_28]], align 4 ; THRESHOLD-NEXT: [[ARRAYIDX_29:%.*]] = getelementptr inbounds float, ptr [[X]], i64 30 ; THRESHOLD-NEXT: [[TMP4:%.*]] = load float, ptr [[ARRAYIDX_29]], align 4 -; THRESHOLD-NEXT: [[OP_RDX:%.*]] = call fast float @llvm.vector.reduce.fadd.v24f32(float 0.000000e+00, <24 x float> [[TMP0]]) +; THRESHOLD-NEXT: [[TMP5:%.*]] = call fast float @llvm.vector.reduce.fadd.v16f32(float 0.000000e+00, <16 x float> [[TMP0]]) +; THRESHOLD-NEXT: [[TMP6:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> [[TMP1]]) +; THRESHOLD-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP5]], [[TMP6]] ; THRESHOLD-NEXT: [[TMP7:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP2]]) ; THRESHOLD-NEXT: [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], [[TMP7]] ; THRESHOLD-NEXT: [[OP_RDX2:%.*]] = fadd fast float [[OP_RDX1]], [[TMP3]] diff --git a/llvm/test/Transforms/SandboxVectorizer/default_pass_pipeline.ll b/llvm/test/Transforms/SandboxVectorizer/default_pass_pipeline.ll index 86bfbee63647881f12d453804492bdd016af2d69..1d7be43336c87958fab335a5eb0187f95cac14c5 100644 --- a/llvm/test/Transforms/SandboxVectorizer/default_pass_pipeline.ll +++ b/llvm/test/Transforms/SandboxVectorizer/default_pass_pipeline.ll @@ -4,6 +4,7 @@ ; This checks the default pass pipeline for the sandbox vectorizer. 
define void @pipeline() { +; CHECK: bottom-up-vec ; CHECK: rpm ; CHECK-EMPTY: ret void diff --git a/llvm/test/Transforms/SandboxVectorizer/regions-from-metadata.ll b/llvm/test/Transforms/SandboxVectorizer/regions-from-metadata.ll new file mode 100644 index 0000000000000000000000000000000000000000..3e57bde76e72daec220066ac67a1f11c683fdece --- /dev/null +++ b/llvm/test/Transforms/SandboxVectorizer/regions-from-metadata.ll @@ -0,0 +1,15 @@ +; RUN: opt -disable-output --passes=sandbox-vectorizer \ +; RUN: -sbvec-passes='regions-from-metadata<print-instruction-count>' %s | FileCheck %s + +define i8 @foo(i8 %v0, i8 %v1) { + %t0 = add i8 %v0, 1, !sandboxvec !0 + %t1 = add i8 %t0, %v1, !sandboxvec !1 + %t2 = add i8 %t1, %v1, !sandboxvec !1 + ret i8 %t2 +} + +!0 = distinct !{!"sandboxregion"} +!1 = distinct !{!"sandboxregion"} + +; CHECK: InstructionCount: 1 +; CHECK: InstructionCount: 2 diff --git a/llvm/test/Transforms/SandboxVectorizer/user_pass_pipeline.ll b/llvm/test/Transforms/SandboxVectorizer/user_pass_pipeline.ll index 2e6dab0aa29c74642397f44e33af49d66ffd24a3..b11b55ed960193567877c4f0154e36dc2e55704b 100644 --- a/llvm/test/Transforms/SandboxVectorizer/user_pass_pipeline.ll +++ b/llvm/test/Transforms/SandboxVectorizer/user_pass_pipeline.ll @@ -1,12 +1,28 @@ -; RUN: opt -passes=sandbox-vectorizer -sbvec-print-pass-pipeline -sbvec-passes=null,null %s -disable-output | FileCheck %s +; RUN: opt -passes=sandbox-vectorizer -sbvec-print-pass-pipeline \ +; RUN: -disable-output -sbvec-passes="bottom-up-vec<null,null>" %s \ +; RUN: | FileCheck %s +; +; RUN: opt -passes=sandbox-vectorizer -sbvec-print-pass-pipeline \ +; RUN: -disable-output -sbvec-passes="bottom-up-vec<>,regions-from-metadata<>" %s \ +; RUN: | FileCheck --check-prefix CHECK-MULTIPLE-FUNCTION-PASSES %s ; !!!WARNING!!! This won't get updated by update_test_checks.py ! ; This checks the user defined pass pipeline. define void @pipeline() { + ret void +} + +; CHECK: fpm +; CHECK: bottom-up-vec ; CHECK: rpm ; CHECK: null ; CHECK: null ; CHECK-EMPTY: - ret void -} + +; CHECK-MULTIPLE-FUNCTION-PASSES: fpm +; CHECK-MULTIPLE-FUNCTION-PASSES: bottom-up-vec +; CHECK-MULTIPLE-FUNCTION-PASSES: rpm +; CHECK-MULTIPLE-FUNCTION-PASSES: regions-from-metadata +; CHECK-MULTIPLE-FUNCTION-PASSES: rpm +; CHECK-MULTIPLE-FUNCTION-PASSES-EMPTY: diff --git a/llvm/test/Verifier/range-attr.ll b/llvm/test/Verifier/range-attr.ll index f985ab696eacba5ffc68d06444ed5e858ab79e36..91412369b0b20dd272b1a524f9e92b1224e70fe7 100644 --- a/llvm/test/Verifier/range-attr.ll +++ b/llvm/test/Verifier/range-attr.ll @@ -1,12 +1,12 @@ ; RUN: not llvm-as %s -o /dev/null 2>&1 | FileCheck %s -; CHECK: Range bit width must match type bit width! +; CHECK: Attribute 'range(i8 1, 0)' applied to incompatible type! ; CHECK-NEXT: ptr @bit_widths_do_not_match define void @bit_widths_do_not_match(i32 range(i8 1, 0) %a) { ret void } -; CHECK: Range bit width must match type bit width! +; CHECK: Attribute 'range(i8 1, 0)' applied to incompatible type!
; CHECK-NEXT: ptr @bit_widths_do_not_match_vector define void @bit_widths_do_not_match_vector(<4 x i32> range(i8 1, 0) %a) { ret void diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/arm_generated_funcs.ll b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/arm_generated_funcs.ll index bae66d456f89a53fcec5adc54553708ab6232547..174cca4fab0982454099eb0b075b6be98658e08a 100644 --- a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/arm_generated_funcs.ll +++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/arm_generated_funcs.ll @@ -60,4 +60,4 @@ define dso_local i32 @main() #0 { ret i32 0 } -attributes #0 = { noredzone nounwind ssp uwtable "frame-pointer"="all" } +attributes #0 = { noredzone nounwind ssp uwtable "frame-pointer"="none" } diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/arm_generated_funcs.ll.generated.expected b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/arm_generated_funcs.ll.generated.expected index de5571f64361543d39b4677dcad04e9c05e35adf..2dfb725f556655cf3b32ee3e1665a3e1b11043ae 100644 --- a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/arm_generated_funcs.ll.generated.expected +++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/arm_generated_funcs.ll.generated.expected @@ -61,7 +61,7 @@ define dso_local i32 @main() #0 { ret i32 0 } -attributes #0 = { noredzone nounwind ssp uwtable "frame-pointer"="all" } +attributes #0 = { noredzone nounwind ssp uwtable "frame-pointer"="none" } ; CHECK-LABEL: check_boundaries: ; CHECK: @ %bb.0: ; CHECK-NEXT: sub sp, sp, #20 diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/arm_generated_funcs.ll.nogenerated.expected b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/arm_generated_funcs.ll.nogenerated.expected index 4f623384ade60234330585a343895d0f017b2a26..85d3389cdaaf92bf7f5443dfea137f5eb96f4d1f 100644 --- a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/arm_generated_funcs.ll.nogenerated.expected +++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/arm_generated_funcs.ll.nogenerated.expected @@ -121,4 +121,4 @@ define dso_local i32 @main() #0 { ret i32 0 } -attributes #0 = { noredzone nounwind ssp uwtable "frame-pointer"="all" } +attributes #0 = { noredzone nounwind ssp uwtable "frame-pointer"="none" } diff --git a/llvm/test/tools/llvm-ar/Inputs/.gitattributes b/llvm/test/tools/llvm-ar/Inputs/.gitattributes new file mode 100644 index 0000000000000000000000000000000000000000..6c8a26285daf7f6c16e4b1ba0e027ebe4d5f7190 --- /dev/null +++ b/llvm/test/tools/llvm-ar/Inputs/.gitattributes @@ -0,0 +1 @@ +mri-crlf.mri text eol=crlf diff --git a/llvm/test/tools/llvm-ar/Inputs/mri-crlf.mri b/llvm/test/tools/llvm-ar/Inputs/mri-crlf.mri index 72d23d041ae807cef3311a759a82f7b5ec89fc49..857c4ff87b6cf856f56c75cda2156dd7318ee0bf 100644 --- a/llvm/test/tools/llvm-ar/Inputs/mri-crlf.mri +++ b/llvm/test/tools/llvm-ar/Inputs/mri-crlf.mri @@ -1,4 +1,4 @@ -; this file intentionally has crlf line endings -create crlf.a -addmod foo.txt -end +; this file intentionally has crlf line endings +create crlf.a +addmod foo.txt +end diff --git a/llvm/test/tools/llvm-cov/Inputs/binary-formats.v6.wasm32 b/llvm/test/tools/llvm-cov/Inputs/binary-formats.v6.wasm32 deleted file mode 100755 index 5a606d5a2f69fd150690bd17ebb9d922cc17c383..0000000000000000000000000000000000000000 Binary files a/llvm/test/tools/llvm-cov/Inputs/binary-formats.v6.wasm32 and 
/dev/null differ diff --git a/llvm/test/tools/llvm-cov/Inputs/binary-formats.wasm.proftext b/llvm/test/tools/llvm-cov/Inputs/binary-formats.wasm.proftext deleted file mode 100644 index 20fc3816c2255a503ed59897ba1f54a7a5e3c5a0..0000000000000000000000000000000000000000 --- a/llvm/test/tools/llvm-cov/Inputs/binary-formats.wasm.proftext +++ /dev/null @@ -1,4 +0,0 @@ -__main_argc_argv -0x0 -1 -100 diff --git a/llvm/test/tools/llvm-cov/binary-formats.c b/llvm/test/tools/llvm-cov/binary-formats.c index bb61b288cfc62e620768ff6df350100ad5622e5a..a5bfc012860ec3ae4f14756f0605f914c5b92ba6 100644 --- a/llvm/test/tools/llvm-cov/binary-formats.c +++ b/llvm/test/tools/llvm-cov/binary-formats.c @@ -10,11 +10,4 @@ int main(int argc, const char *argv[]) {} // RUN: llvm-cov show %S/Inputs/binary-formats.v3.macho64l -instr-profile %t.profdata -path-equivalence=/tmp,%S %s | FileCheck %s // RUN: llvm-cov show %S/Inputs/binary-formats.v6.linux64l -instr-profile %t.profdata -path-equivalence=/tmp,%S %s | FileCheck %s -// RUN: llvm-profdata merge %S/Inputs/binary-formats.wasm.proftext -o %t.wasm.profdata -// NOTE: The wasm binary is built with the following command: -// clang -target wasm32-unknown-wasi %s -o %S/Inputs/binary-formats.v6.wasm32 \ -// -mllvm -enable-name-compression=false \ -// -fprofile-instr-generate -fcoverage-mapping -lwasi-emulated-getpid -lwasi-emulated-mman -// RUN: llvm-cov show %S/Inputs/binary-formats.v6.wasm32 -instr-profile %t.wasm.profdata -path-equivalence=/tmp,%S %s | FileCheck %s - // RUN: llvm-cov export %S/Inputs/binary-formats.macho64l -instr-profile %t.profdata | FileCheck %S/Inputs/binary-formats.canonical.json diff --git a/llvm/test/tools/llvm-cvtres/Inputs/languages.rc b/llvm/test/tools/llvm-cvtres/Inputs/languages.rc index 081b3a77bebc10742778e5157a551e4817d3e3c0..82031d0e20839527d4a5350e758a7c03a2a3c040 100644 --- a/llvm/test/tools/llvm-cvtres/Inputs/languages.rc +++ b/llvm/test/tools/llvm-cvtres/Inputs/languages.rc @@ -1,36 +1,36 @@ -#include "windows.h" - -LANGUAGE LANG_ENGLISH, SUBLANG_ENGLISH_US -randomdat RCDATA -{ - "this is a random bit of data that means nothing\0", - 0x23a9, - 0x140e, - 194292, -} - -LANGUAGE LANG_CHINESE, SUBLANG_CHINESE_SIMPLIFIED -randomdat RCDATA -{ - "zhe4 shi4 yi1ge4 sui2ji1 de shu4ju4, zhe4 yi4wei4zhe shen2me\0", - 0x23a9, - 0x140e, - 194292, -} - -LANGUAGE LANG_GERMAN, SUBLANG_GERMAN_LUXEMBOURG -randomdat RCDATA -{ - "Dies ist ein zufälliges Bit von Daten, die nichts bedeutet\0", - 0x23a9, - 0x140e, - 194292, -} - -LANGUAGE LANG_CHINESE, SUBLANG_CHINESE_SIMPLIFIED -myaccelerators ACCELERATORS -{ - "^C", 999, VIRTKEY, ALT - "D", 1100, VIRTKEY, CONTROL, SHIFT - "^R", 444, ASCII, NOINVERT -} +#include "windows.h" + +LANGUAGE LANG_ENGLISH, SUBLANG_ENGLISH_US +randomdat RCDATA +{ + "this is a random bit of data that means nothing\0", + 0x23a9, + 0x140e, + 194292, +} + +LANGUAGE LANG_CHINESE, SUBLANG_CHINESE_SIMPLIFIED +randomdat RCDATA +{ + "zhe4 shi4 yi1ge4 sui2ji1 de shu4ju4, zhe4 yi4wei4zhe shen2me\0", + 0x23a9, + 0x140e, + 194292, +} + +LANGUAGE LANG_GERMAN, SUBLANG_GERMAN_LUXEMBOURG +randomdat RCDATA +{ + "Dies ist ein zufälliges Bit von Daten, die nichts bedeutet\0", + 0x23a9, + 0x140e, + 194292, +} + +LANGUAGE LANG_CHINESE, SUBLANG_CHINESE_SIMPLIFIED +myaccelerators ACCELERATORS +{ + "^C", 999, VIRTKEY, ALT + "D", 1100, VIRTKEY, CONTROL, SHIFT + "^R", 444, ASCII, NOINVERT +} diff --git a/llvm/test/tools/llvm-cvtres/Inputs/test_resource.rc b/llvm/test/tools/llvm-cvtres/Inputs/test_resource.rc index 
5ca097baa0f736eb5f13cca8604a3286f1d0b263..494849f57a0a9e4ecb2572aaeadb8f30aaff3b40 100644 --- a/llvm/test/tools/llvm-cvtres/Inputs/test_resource.rc +++ b/llvm/test/tools/llvm-cvtres/Inputs/test_resource.rc @@ -1,50 +1,50 @@ -#include "windows.h" - -LANGUAGE LANG_ENGLISH, SUBLANG_ENGLISH_US - -myaccelerators ACCELERATORS -{ - "^C", 999, VIRTKEY, ALT - "D", 1100, VIRTKEY, CONTROL, SHIFT - "^R", 444, ASCII, NOINVERT -} - -cursor BITMAP "cursor_small.bmp" -okay BITMAP "okay_small.bmp" - -14432 MENU -LANGUAGE LANG_CHINESE, SUBLANG_CHINESE_SIMPLIFIED -{ - MENUITEM "yu", 100 - MENUITEM "shala", 101 - MENUITEM "kaoya", 102 -} - -testdialog DIALOG 10, 10, 200, 300 -STYLE WS_POPUP | WS_BORDER -CAPTION "Test" -{ - CTEXT "Continue:", 1, 10, 10, 230, 14 - PUSHBUTTON "&OK", 2, 66, 134, 161, 13 -} - -12 ACCELERATORS -{ - "X", 164, VIRTKEY, ALT - "H", 5678, VIRTKEY, CONTROL, SHIFT - "^R", 444, ASCII, NOINVERT -} - -"eat" MENU -LANGUAGE LANG_ENGLISH, SUBLANG_ENGLISH_AUS -{ - MENUITEM "fish", 100 - MENUITEM "salad", 101 - MENUITEM "duck", 102 -} - - -myresource stringarray { - "this is a user defined resource\0", - "it contains many strings\0", +#include "windows.h" + +LANGUAGE LANG_ENGLISH, SUBLANG_ENGLISH_US + +myaccelerators ACCELERATORS +{ + "^C", 999, VIRTKEY, ALT + "D", 1100, VIRTKEY, CONTROL, SHIFT + "^R", 444, ASCII, NOINVERT +} + +cursor BITMAP "cursor_small.bmp" +okay BITMAP "okay_small.bmp" + +14432 MENU +LANGUAGE LANG_CHINESE, SUBLANG_CHINESE_SIMPLIFIED +{ + MENUITEM "yu", 100 + MENUITEM "shala", 101 + MENUITEM "kaoya", 102 +} + +testdialog DIALOG 10, 10, 200, 300 +STYLE WS_POPUP | WS_BORDER +CAPTION "Test" +{ + CTEXT "Continue:", 1, 10, 10, 230, 14 + PUSHBUTTON "&OK", 2, 66, 134, 161, 13 +} + +12 ACCELERATORS +{ + "X", 164, VIRTKEY, ALT + "H", 5678, VIRTKEY, CONTROL, SHIFT + "^R", 444, ASCII, NOINVERT +} + +"eat" MENU +LANGUAGE LANG_ENGLISH, SUBLANG_ENGLISH_AUS +{ + MENUITEM "fish", 100 + MENUITEM "salad", 101 + MENUITEM "duck", 102 +} + + +myresource stringarray { + "this is a user defined resource\0", + "it contains many strings\0", } \ No newline at end of file diff --git a/llvm/test/tools/llvm-rc/Inputs/dialog-with-menu.rc b/llvm/test/tools/llvm-rc/Inputs/dialog-with-menu.rc index bb79dca399c219ac4d1adb87ed7593318d60438e..c700b587af6483655c844ed387c08df0e92a3416 100644 --- a/llvm/test/tools/llvm-rc/Inputs/dialog-with-menu.rc +++ b/llvm/test/tools/llvm-rc/Inputs/dialog-with-menu.rc @@ -1,16 +1,16 @@ -101 DIALOG 0, 0, 362, 246 -STYLE 0x40l | 0x0004l | 0x0008l | 0x0800l | 0x00020000l | - 0x00010000l | 0x80000000l | 0x10000000l | 0x02000000l | 0x00C00000l | - 0x00080000l | 0x00040000l -CAPTION "MakeNSISW" -MENU 104 -FONT 8, "MS Shell Dlg" -BEGIN - CONTROL "",202,"RichEdit20A",0x0004l | 0x0040l | - 0x0100l | 0x0800l | 0x00008000 | - 0x00010000l | 0x00800000l | 0x00200000l,7,22,348,190 - CONTROL "",-1,"Static",0x00000010l,7,220,346,1 - LTEXT "",200,7,230,200,12,0x08000000l - DEFPUSHBUTTON "Test &Installer",203,230,226,60,15,0x08000000l | 0x00010000l - PUSHBUTTON "&Close",2,296,226,49,15,0x00010000l -END +101 DIALOG 0, 0, 362, 246 +STYLE 0x40l | 0x0004l | 0x0008l | 0x0800l | 0x00020000l | + 0x00010000l | 0x80000000l | 0x10000000l | 0x02000000l | 0x00C00000l | + 0x00080000l | 0x00040000l +CAPTION "MakeNSISW" +MENU 104 +FONT 8, "MS Shell Dlg" +BEGIN + CONTROL "",202,"RichEdit20A",0x0004l | 0x0040l | + 0x0100l | 0x0800l | 0x00008000 | + 0x00010000l | 0x00800000l | 0x00200000l,7,22,348,190 + CONTROL "",-1,"Static",0x00000010l,7,220,346,1 + LTEXT "",200,7,230,200,12,0x08000000l + DEFPUSHBUTTON "Test 
&Installer",203,230,226,60,15,0x08000000l | 0x00010000l + PUSHBUTTON "&Close",2,296,226,49,15,0x00010000l +END diff --git a/llvm/test/tools/llvm-readobj/COFF/Inputs/resources/test_resource.rc b/llvm/test/tools/llvm-readobj/COFF/Inputs/resources/test_resource.rc index fd616520dbe1b3199df977b4e4d48217e3de2638..6ad56bc02d73ca5a4548535a6eea30c3cf99e5d9 100644 --- a/llvm/test/tools/llvm-readobj/COFF/Inputs/resources/test_resource.rc +++ b/llvm/test/tools/llvm-readobj/COFF/Inputs/resources/test_resource.rc @@ -1,44 +1,44 @@ -#include "windows.h" - -LANGUAGE LANG_ENGLISH, SUBLANG_ENGLISH_US - -myaccelerators ACCELERATORS -{ - "^C", 999, VIRTKEY, ALT - "D", 1100, VIRTKEY, CONTROL, SHIFT - "^R", 444, ASCII, NOINVERT -} - -cursor BITMAP "cursor_small.bmp" -okay BITMAP "okay_small.bmp" - -14432 MENU -LANGUAGE LANG_CHINESE, SUBLANG_CHINESE_SIMPLIFIED -{ - MENUITEM "yu", 100 - MENUITEM "shala", 101 - MENUITEM "kaoya", 102 -} - -testdialog DIALOG 10, 10, 200, 300 -STYLE WS_POPUP | WS_BORDER -CAPTION "Test" -{ - CTEXT "Continue:", 1, 10, 10, 230, 14 - PUSHBUTTON "&OK", 2, 66, 134, 161, 13 -} - -12 ACCELERATORS -{ - "X", 164, VIRTKEY, ALT - "H", 5678, VIRTKEY, CONTROL, SHIFT - "^R", 444, ASCII, NOINVERT -} - -"eat" MENU -LANGUAGE LANG_ENGLISH, SUBLANG_ENGLISH_AUS -{ - MENUITEM "fish", 100 - MENUITEM "salad", 101 - MENUITEM "duck", 102 -} +#include "windows.h" + +LANGUAGE LANG_ENGLISH, SUBLANG_ENGLISH_US + +myaccelerators ACCELERATORS +{ + "^C", 999, VIRTKEY, ALT + "D", 1100, VIRTKEY, CONTROL, SHIFT + "^R", 444, ASCII, NOINVERT +} + +cursor BITMAP "cursor_small.bmp" +okay BITMAP "okay_small.bmp" + +14432 MENU +LANGUAGE LANG_CHINESE, SUBLANG_CHINESE_SIMPLIFIED +{ + MENUITEM "yu", 100 + MENUITEM "shala", 101 + MENUITEM "kaoya", 102 +} + +testdialog DIALOG 10, 10, 200, 300 +STYLE WS_POPUP | WS_BORDER +CAPTION "Test" +{ + CTEXT "Continue:", 1, 10, 10, 230, 14 + PUSHBUTTON "&OK", 2, 66, 134, 161, 13 +} + +12 ACCELERATORS +{ + "X", 164, VIRTKEY, ALT + "H", 5678, VIRTKEY, CONTROL, SHIFT + "^R", 444, ASCII, NOINVERT +} + +"eat" MENU +LANGUAGE LANG_ENGLISH, SUBLANG_ENGLISH_AUS +{ + MENUITEM "fish", 100 + MENUITEM "salad", 101 + MENUITEM "duck", 102 +} diff --git a/llvm/tools/llvm-jitlink/llvm-jitlink.cpp b/llvm/tools/llvm-jitlink/llvm-jitlink.cpp index 108cadd2e0169cdf935e07e1677cdf3af8483bcb..d90176818c8e6146dfaff2ccf4fedf7b640f1c92 100644 --- a/llvm/tools/llvm-jitlink/llvm-jitlink.cpp +++ b/llvm/tools/llvm-jitlink/llvm-jitlink.cpp @@ -30,6 +30,7 @@ #include "llvm/ExecutionEngine/Orc/IndirectionUtils.h" #include "llvm/ExecutionEngine/Orc/JITTargetMachineBuilder.h" #include "llvm/ExecutionEngine/Orc/LoadLinkableFile.h" +#include "llvm/ExecutionEngine/Orc/MachO.h" #include "llvm/ExecutionEngine/Orc/MachOPlatform.h" #include "llvm/ExecutionEngine/Orc/MapperJITLinkMemoryManager.h" #include "llvm/ExecutionEngine/Orc/ObjectFileInterface.h" @@ -265,6 +266,17 @@ static cl::opt OverrideTriple("triple", cl::desc("Override target triple detection"), cl::init(""), cl::cat(JITLinkCategory)); +static cl::opt AllLoad("all_load", + cl::desc("Load all members of static archives"), + cl::init(false), cl::cat(JITLinkCategory)); + +static cl::opt ForceLoadObjC( + "ObjC", + cl::desc("Load all members of static archives that implement " + "Objective-C classes or categories, or Swift structs, " + "classes or extensions"), + cl::init(false), cl::cat(JITLinkCategory)); + static ExitOnError ExitOnErr; static LLVM_ATTRIBUTE_USED void linkComponents() { @@ -1057,9 +1069,9 @@ Session::Session(std::unique_ptr EPC, Error &Err) return 
loadAndLinkDynamicLibrary(JD, DLLName); }; - if (auto P = COFFPlatform::Create(ES, ObjLayer, *PlatformJD, - OrcRuntime.c_str(), - std::move(LoadDynLibrary))) + if (auto P = + COFFPlatform::Create(ObjLayer, *PlatformJD, OrcRuntime.c_str(), + std::move(LoadDynLibrary))) ES.setPlatform(std::move(*P)); else { Err = P.takeError(); @@ -1957,10 +1969,9 @@ static Error addLibraries(Session &S, }); // 3. Process library loads. - auto AddArchive = [&](const char *Path, const LibraryLoad &LL) + auto AddArchive = [&](JITDylib &JD, const char *Path, const LibraryLoad &LL) -> Expected<std::unique_ptr<StaticLibraryDefinitionGenerator>> { - unique_function<Expected<MaterializationUnit::Interface>( - ExecutionSession & ES, MemoryBufferRef ObjBuffer)> + StaticLibraryDefinitionGenerator::GetObjectFileInterface GetObjFileInterface; switch (LL.Modifier) { case LibraryLoad::Standard: @@ -1970,8 +1981,17 @@ static Error addLibraries(Session &S, GetObjFileInterface = getObjectFileInterfaceHidden; break; } + + StaticLibraryDefinitionGenerator::VisitMembersFunction VisitMembers; + if (AllLoad) + VisitMembers = StaticLibraryDefinitionGenerator::loadAllObjectFileMembers( + S.ObjLayer, JD); + else if (S.ES.getTargetTriple().isOSBinFormatMachO() && ForceLoadObjC) + VisitMembers = ForceLoadMachOArchiveMembers(S.ObjLayer, JD, true); + auto G = StaticLibraryDefinitionGenerator::Load( - S.ObjLayer, Path, std::move(GetObjFileInterface)); + S.ObjLayer, Path, std::move(VisitMembers), + std::move(GetObjFileInterface)); if (!G) return G.takeError(); @@ -2009,7 +2029,7 @@ static Error addLibraries(Session &S, } if (LL.IsPath) { - auto G = AddArchive(LL.LibName.c_str(), LL); + auto G = AddArchive(JD, LL.LibName.c_str(), LL); if (!G) return createFileError(LL.LibName, G.takeError()); JD.addGenerator(std::move(*G)); @@ -2065,7 +2085,7 @@ static Error addLibraries(Session &S, } case file_magic::archive: case file_magic::macho_universal_binary: { - auto G = AddArchive(LibPath.data(), LL); + auto G = AddArchive(JD, LibPath.data(), LL); if (!G) return G.takeError(); JD.addGenerator(std::move(*G)); diff --git a/llvm/tools/llvm-lto/llvm-lto.cpp b/llvm/tools/llvm-lto/llvm-lto.cpp index f5076f0b975178a01561c2c0d837417cf0ec59ea..4090faf4e3fd95bce21ccafeaf04425d55e91c08 100644 --- a/llvm/tools/llvm-lto/llvm-lto.cpp +++ b/llvm/tools/llvm-lto/llvm-lto.cpp @@ -407,6 +407,64 @@ static void printIndexStats() { } } +/// Print the lto symbol attributes.
+static void printLTOSymbolAttributes(lto_symbol_attributes Attrs) { + outs() << "{ "; + unsigned Permission = Attrs & LTO_SYMBOL_PERMISSIONS_MASK; + switch (Permission) { + case LTO_SYMBOL_PERMISSIONS_CODE: + outs() << "function "; + break; + case LTO_SYMBOL_PERMISSIONS_DATA: + outs() << "data "; + break; + case LTO_SYMBOL_PERMISSIONS_RODATA: + outs() << "constant "; + break; + } + unsigned Definition = Attrs & LTO_SYMBOL_DEFINITION_MASK; + switch (Definition) { + case LTO_SYMBOL_DEFINITION_REGULAR: + outs() << "defined "; + break; + case LTO_SYMBOL_DEFINITION_TENTATIVE: + outs() << "common "; + break; + case LTO_SYMBOL_DEFINITION_WEAK: + outs() << "weak "; + break; + case LTO_SYMBOL_DEFINITION_UNDEFINED: + outs() << "extern "; + break; + case LTO_SYMBOL_DEFINITION_WEAKUNDEF: + outs() << "extern-weak "; + break; + } + unsigned Scope = Attrs & LTO_SYMBOL_SCOPE_MASK; + switch (Scope) { + case LTO_SYMBOL_SCOPE_INTERNAL: + outs() << "internal "; + break; + case LTO_SYMBOL_SCOPE_HIDDEN: + outs() << "hidden "; + break; + case LTO_SYMBOL_SCOPE_PROTECTED: + outs() << "protected "; + break; + case LTO_SYMBOL_SCOPE_DEFAULT: + outs() << "default "; + break; + case LTO_SYMBOL_SCOPE_DEFAULT_CAN_BE_HIDDEN: + outs() << "omitted "; + break; + } + if (Attrs & LTO_SYMBOL_COMDAT) + outs() << "comdat "; + if (Attrs & LTO_SYMBOL_ALIAS) + outs() << "alias "; + outs() << "}"; +} + /// Load each IR file and dump certain information based on active flags. /// /// The main point here is to provide lit-testable coverage for the LTOModule @@ -421,8 +479,11 @@ static void testLTOModule(const TargetOptions &Options) { if (ListSymbolsOnly) { // List the symbols. outs() << Filename << ":\n"; - for (int I = 0, E = Module->getSymbolCount(); I != E; ++I) - outs() << Module->getSymbolName(I) << "\n"; + for (int I = 0, E = Module->getSymbolCount(); I != E; ++I) { + outs() << Module->getSymbolName(I) << " "; + printLTOSymbolAttributes(Module->getSymbolAttributes(I)); + outs() << "\n"; + } } if (QueryHasCtorDtor) outs() << Filename diff --git a/llvm/unittests/ADT/APFixedPointTest.cpp b/llvm/unittests/ADT/APFixedPointTest.cpp index e7aa58a8325773b35fa47e4644d5d144a0284b0c..b71c5e16a915ee1e62ddb8f4012ebec10a737345 100644 --- a/llvm/unittests/ADT/APFixedPointTest.cpp +++ b/llvm/unittests/ADT/APFixedPointTest.cpp @@ -240,19 +240,20 @@ void CheckIntPart(const FixedPointSemantics &Sema, int64_t IntPart) { APFixedPoint ValWithFract( APInt(Sema.getWidth(), relativeShr(IntPart, Sema.getLsbWeight()) + FullFactPart, - Sema.isSigned()), + Sema.isSigned(), /*implicitTrunc=*/true), Sema); ASSERT_EQ(ValWithFract.getIntPart(), IntPart); // Just fraction - APFixedPoint JustFract(APInt(Sema.getWidth(), FullFactPart, Sema.isSigned()), + APFixedPoint JustFract(APInt(Sema.getWidth(), FullFactPart, Sema.isSigned(), + /*implicitTrunc=*/true), Sema); ASSERT_EQ(JustFract.getIntPart(), 0); // Whole number APFixedPoint WholeNum(APInt(Sema.getWidth(), relativeShr(IntPart, Sema.getLsbWeight()), - Sema.isSigned()), + Sema.isSigned(), /*implicitTrunc=*/true), Sema); ASSERT_EQ(WholeNum.getIntPart(), IntPart); @@ -260,7 +261,7 @@ void CheckIntPart(const FixedPointSemantics &Sema, int64_t IntPart) { if (Sema.isSigned()) { APFixedPoint Negative(APInt(Sema.getWidth(), relativeShr(IntPart, Sema.getLsbWeight()), - Sema.isSigned()), + Sema.isSigned(), /*implicitTrunc=*/true), Sema); ASSERT_EQ(Negative.getIntPart(), IntPart); } diff --git a/llvm/unittests/Bitcode/DataLayoutUpgradeTest.cpp b/llvm/unittests/Bitcode/DataLayoutUpgradeTest.cpp index 
1cd4a47c75739b15fd19ee0a484a74df526968a1..e7fb4e0718648bd073335c037cd97dc837d7ec97 100644 --- a/llvm/unittests/Bitcode/DataLayoutUpgradeTest.cpp +++ b/llvm/unittests/Bitcode/DataLayoutUpgradeTest.cpp @@ -23,6 +23,8 @@ TEST(DataLayoutUpgradeTest, ValidDataLayoutUpgrade) { "e-m:o-i64:64-f80:128-n8:16:32:64-S128", "x86_64-apple-macosx"); std::string DL4 = UpgradeDataLayoutString("e-m:o-i64:64-i128:128-n32:64-S128", "aarch64--"); + std::string DL5 = UpgradeDataLayoutString( + "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128", "aarch64--"); EXPECT_EQ(DL1, "e-m:e-p:32:32-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128" "-f80:128-n8:16:32:64-S128"); @@ -31,7 +33,10 @@ TEST(DataLayoutUpgradeTest, ValidDataLayoutUpgrade) { "-f80:128-n8:16:32-S32"); EXPECT_EQ(DL3, "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:" "128-n8:16:32:64-S128"); - EXPECT_EQ(DL4, "e-m:o-i64:64-i128:128-n32:64-S128-Fn32"); + EXPECT_EQ(DL4, "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-n32:" + "64-S128-Fn32"); + EXPECT_EQ(DL5, "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:" + "64-i128:128-n32:64-S128-Fn32"); // Check that AMDGPU targets add -G1 if it's not present. EXPECT_EQ(UpgradeDataLayoutString("e-p:32:32", "r600"), "e-p:32:32-G1"); @@ -94,14 +99,16 @@ TEST(DataLayoutUpgradeTest, NoDataLayoutUpgrade) { std::string DL2 = UpgradeDataLayoutString("e-m:e-i64:64-n32:64", "powerpc64le-unknown-linux-gnu"); std::string DL3 = UpgradeDataLayoutString( - "e-m:o-i64:64-i128:128-n32:64-S128-Fn32", "aarch64--"); + "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-n32:64-S128-Fn32", + "aarch64--"); EXPECT_EQ( DL1, "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128" "-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64" "-f80:128:128-n8:16:32:64-S128"); EXPECT_EQ(DL2, "e-m:e-i64:64-n32:64"); - EXPECT_EQ(DL3, "e-m:o-i64:64-i128:128-n32:64-S128-Fn32"); + EXPECT_EQ(DL3, "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-n32:" + "64-S128-Fn32"); // Check that AMDGPU targets don't add -G1 if there is already a -G flag. 
EXPECT_EQ(UpgradeDataLayoutString("e-p:32:32-G2", "r600"), "e-p:32:32-G2"); diff --git a/llvm/unittests/CodeGen/MFCommon.inc b/llvm/unittests/CodeGen/MFCommon.inc index 5d5720c3162da96e6cb6a422dcd649efddb5a7e7..749c5780fbac3dd8fef38f012797c9c68670b235 100644 --- a/llvm/unittests/CodeGen/MFCommon.inc +++ b/llvm/unittests/CodeGen/MFCommon.inc @@ -14,7 +14,9 @@ public: MachineBasicBlock &MBB) const override {} void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override {} - bool hasFP(const MachineFunction &MF) const override { return false; } + +protected: + bool hasFPImpl(const MachineFunction &MF) const override { return false; } }; static TargetRegisterClass *const BogusRegisterClasses[] = {nullptr}; diff --git a/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp b/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp index 7400b6c1984f720d821b7ae3c32400ec116df5db..dc40e5893b65e2329f86e1f8d0122a96c0f285cb 100644 --- a/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp +++ b/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp @@ -200,6 +200,8 @@ TEST_F(SelectionDAGPatternMatchTest, matchBinaryOp) { SDValue SMin = DAG->getNode(ISD::SMIN, DL, Int32VT, Op1, Op0); SDValue UMax = DAG->getNode(ISD::UMAX, DL, Int32VT, Op0, Op1); SDValue UMin = DAG->getNode(ISD::UMIN, DL, Int32VT, Op1, Op0); + SDValue Rotl = DAG->getNode(ISD::ROTL, DL, Int32VT, Op0, Op1); + SDValue Rotr = DAG->getNode(ISD::ROTR, DL, Int32VT, Op1, Op0); SDValue ICMP_GT = DAG->getSetCC(DL, MVT::i1, Op0, Op1, ISD::SETGT); SDValue ICMP_GE = DAG->getSetCC(DL, MVT::i1, Op0, Op1, ISD::SETGE); @@ -246,6 +248,11 @@ TEST_F(SelectionDAGPatternMatchTest, matchBinaryOp) { EXPECT_FALSE(sd_match(DisOr, m_Add(m_Value(), m_Value()))); EXPECT_TRUE(sd_match(DisOr, m_AddLike(m_Value(), m_Value()))); + EXPECT_TRUE(sd_match(Rotl, m_Rotl(m_Value(), m_Value()))); + EXPECT_TRUE(sd_match(Rotr, m_Rotr(m_Value(), m_Value()))); + EXPECT_FALSE(sd_match(Rotl, m_Rotr(m_Value(), m_Value()))); + EXPECT_FALSE(sd_match(Rotr, m_Rotl(m_Value(), m_Value()))); + EXPECT_TRUE(sd_match(SMax, m_c_BinOp(ISD::SMAX, m_Value(), m_Value()))); EXPECT_TRUE(sd_match(SMax, m_SMax(m_Value(), m_Value()))); EXPECT_TRUE(sd_match(SMax, m_SMaxLike(m_Value(), m_Value()))); @@ -302,7 +309,12 @@ TEST_F(SelectionDAGPatternMatchTest, matchUnaryOp) { SDValue FPToSI = DAG->getNode(ISD::FP_TO_SINT, DL, FloatVT, Op2); SDValue FPToUI = DAG->getNode(ISD::FP_TO_UINT, DL, FloatVT, Op2); + SDValue Brev = DAG->getNode(ISD::BITREVERSE, DL, Int32VT, Op0); + SDValue Bswap = DAG->getNode(ISD::BSWAP, DL, Int32VT, Op0); + + SDValue Ctpop = DAG->getNode(ISD::CTPOP, DL, Int32VT, Op0); SDValue Ctlz = DAG->getNode(ISD::CTLZ, DL, Int32VT, Op0); + SDValue Cttz = DAG->getNode(ISD::CTTZ, DL, Int32VT, Op0); using namespace SDPatternMatch; EXPECT_TRUE(sd_match(ZExt, m_UnaryOp(ISD::ZERO_EXTEND, m_Value()))); @@ -328,7 +340,17 @@ TEST_F(SelectionDAGPatternMatchTest, matchUnaryOp) { EXPECT_FALSE(sd_match(FPToUI, m_FPToSI(m_Value()))); EXPECT_FALSE(sd_match(FPToSI, m_FPToUI(m_Value()))); + EXPECT_TRUE(sd_match(Brev, m_BitReverse(m_Value()))); + EXPECT_TRUE(sd_match(Bswap, m_BSwap(m_Value()))); + EXPECT_FALSE(sd_match(Brev, m_BSwap(m_Value()))); + EXPECT_FALSE(sd_match(Bswap, m_BitReverse(m_Value()))); + + EXPECT_TRUE(sd_match(Ctpop, m_Ctpop(m_Value()))); EXPECT_TRUE(sd_match(Ctlz, m_Ctlz(m_Value()))); + EXPECT_TRUE(sd_match(Cttz, m_Cttz(m_Value()))); + EXPECT_FALSE(sd_match(Ctpop, m_Ctlz(m_Value()))); + EXPECT_FALSE(sd_match(Ctlz, m_Cttz(m_Value()))); + EXPECT_FALSE(sd_match(Cttz, 
m_Ctlz(m_Value()))); } TEST_F(SelectionDAGPatternMatchTest, matchConstants) { diff --git a/llvm/unittests/IR/PatternMatch.cpp b/llvm/unittests/IR/PatternMatch.cpp index 7dc4b9f448d386f6915bdb860df179399bed28f7..367ba6ab52a596569811fce020d3040a2a91d8f9 100644 --- a/llvm/unittests/IR/PatternMatch.cpp +++ b/llvm/unittests/IR/PatternMatch.cpp @@ -1040,6 +1040,140 @@ TEST_F(PatternMatchTest, FloatingPointUnorderedMax) { EXPECT_EQ(R, MatchR); } +TEST_F(PatternMatchTest, FloatingPointMin) { + Type *FltTy = IRB.getFloatTy(); + Value *L = ConstantFP::get(FltTy, 1.0); + Value *R = ConstantFP::get(FltTy, 2.0); + Value *MatchL, *MatchR; + + // Test OLT. + EXPECT_TRUE(m_OrdOrUnordFMin(m_Value(MatchL), m_Value(MatchR)) + .match(IRB.CreateSelect(IRB.CreateFCmpOLT(L, R), L, R))); + EXPECT_EQ(L, MatchL); + EXPECT_EQ(R, MatchR); + + // Test OLE. + EXPECT_TRUE(m_OrdOrUnordFMin(m_Value(MatchL), m_Value(MatchR)) + .match(IRB.CreateSelect(IRB.CreateFCmpOLE(L, R), L, R))); + EXPECT_EQ(L, MatchL); + EXPECT_EQ(R, MatchR); + + // Test ULT. + EXPECT_TRUE(m_OrdOrUnordFMin(m_Value(MatchL), m_Value(MatchR)) + .match(IRB.CreateSelect(IRB.CreateFCmpULT(L, R), L, R))); + EXPECT_EQ(L, MatchL); + EXPECT_EQ(R, MatchR); + + // Test ULE. + EXPECT_TRUE(m_OrdOrUnordFMin(m_Value(MatchL), m_Value(MatchR)) + .match(IRB.CreateSelect(IRB.CreateFCmpULE(L, R), L, R))); + EXPECT_EQ(L, MatchL); + EXPECT_EQ(R, MatchR); + + // Test no match on OGE. + EXPECT_FALSE(m_OrdOrUnordFMin(m_Value(MatchL), m_Value(MatchR)) + .match(IRB.CreateSelect(IRB.CreateFCmpOGE(L, R), L, R))); + + // Test no match on OGT. + EXPECT_FALSE(m_OrdOrUnordFMin(m_Value(MatchL), m_Value(MatchR)) + .match(IRB.CreateSelect(IRB.CreateFCmpOGT(L, R), L, R))); + + // Test no match on UGE. + EXPECT_FALSE(m_OrdOrUnordFMin(m_Value(MatchL), m_Value(MatchR)) + .match(IRB.CreateSelect(IRB.CreateFCmpUGE(L, R), L, R))); + + // Test no match on UGT. + EXPECT_FALSE(m_OrdOrUnordFMin(m_Value(MatchL), m_Value(MatchR)) + .match(IRB.CreateSelect(IRB.CreateFCmpUGT(L, R), L, R))); + + // Test inverted selects. Note that this "inverts" the ordering, e.g.: + // %cmp = fcmp oge L, R + // %min = select %cmp, R, L + + // [OU]GE with inverted select. + EXPECT_TRUE(m_OrdOrUnordFMin(m_Value(MatchL), m_Value(MatchR)) + .match(IRB.CreateSelect(IRB.CreateFCmpOGE(L, R), R, L))); + EXPECT_TRUE(m_OrdOrUnordFMin(m_Value(MatchL), m_Value(MatchR)) + .match(IRB.CreateSelect(IRB.CreateFCmpUGE(L, R), R, L))); + EXPECT_EQ(L, MatchL); + EXPECT_EQ(R, MatchR); + + // [OU]GT with inverted select. + EXPECT_TRUE(m_OrdOrUnordFMin(m_Value(MatchL), m_Value(MatchR)) + .match(IRB.CreateSelect(IRB.CreateFCmpOGT(L, R), R, L))); + EXPECT_TRUE(m_OrdOrUnordFMin(m_Value(MatchL), m_Value(MatchR)) + .match(IRB.CreateSelect(IRB.CreateFCmpUGT(L, R), R, L))); + EXPECT_EQ(L, MatchL); + EXPECT_EQ(R, MatchR); +} + +TEST_F(PatternMatchTest, FloatingPointMax) { + Type *FltTy = IRB.getFloatTy(); + Value *L = ConstantFP::get(FltTy, 1.0); + Value *R = ConstantFP::get(FltTy, 2.0); + Value *MatchL, *MatchR; + + // Test OGT. + EXPECT_TRUE(m_OrdOrUnordFMax(m_Value(MatchL), m_Value(MatchR)) + .match(IRB.CreateSelect(IRB.CreateFCmpOGT(L, R), L, R))); + EXPECT_EQ(L, MatchL); + EXPECT_EQ(R, MatchR); + + // Test OGE. + EXPECT_TRUE(m_OrdOrUnordFMax(m_Value(MatchL), m_Value(MatchR)) + .match(IRB.CreateSelect(IRB.CreateFCmpOGE(L, R), L, R))); + EXPECT_EQ(L, MatchL); + EXPECT_EQ(R, MatchR); + + // Test UGT.
+ EXPECT_TRUE(m_OrdOrUnordFMax(m_Value(MatchL), m_Value(MatchR)) + .match(IRB.CreateSelect(IRB.CreateFCmpUGT(L, R), L, R))); + EXPECT_EQ(L, MatchL); + EXPECT_EQ(R, MatchR); + + // Test UGE. + EXPECT_TRUE(m_OrdOrUnordFMax(m_Value(MatchL), m_Value(MatchR)) + .match(IRB.CreateSelect(IRB.CreateFCmpUGE(L, R), L, R))); + EXPECT_EQ(L, MatchL); + EXPECT_EQ(R, MatchR); + + // Test no match on OLE. + EXPECT_FALSE(m_OrdOrUnordFMax(m_Value(MatchL), m_Value(MatchR)) + .match(IRB.CreateSelect(IRB.CreateFCmpOLE(L, R), L, R))); + + // Test no match on OLT. + EXPECT_FALSE(m_OrdOrUnordFMax(m_Value(MatchL), m_Value(MatchR)) + .match(IRB.CreateSelect(IRB.CreateFCmpOLT(L, R), L, R))); + + // Test no match on ULE. + EXPECT_FALSE(m_OrdOrUnordFMax(m_Value(MatchL), m_Value(MatchR)) + .match(IRB.CreateSelect(IRB.CreateFCmpULE(L, R), L, R))); + + // Test no match on ULT. + EXPECT_FALSE(m_OrdOrUnordFMax(m_Value(MatchL), m_Value(MatchR)) + .match(IRB.CreateSelect(IRB.CreateFCmpULT(L, R), L, R))); + + // Test inverted selects. Note that this "inverts" the ordering, e.g.: + // %cmp = fcmp ole L, R + // %max = select %cmp, R, L + + // [OU]LE with inverted select. + EXPECT_TRUE(m_OrdOrUnordFMax(m_Value(MatchL), m_Value(MatchR)) + .match(IRB.CreateSelect(IRB.CreateFCmpOLE(L, R), R, L))); + EXPECT_TRUE(m_OrdOrUnordFMax(m_Value(MatchL), m_Value(MatchR)) + .match(IRB.CreateSelect(IRB.CreateFCmpULE(L, R), R, L))); + EXPECT_EQ(L, MatchL); + EXPECT_EQ(R, MatchR); + + // [OU]LT with inverted select. + EXPECT_TRUE(m_OrdOrUnordFMax(m_Value(MatchL), m_Value(MatchR)) + .match(IRB.CreateSelect(IRB.CreateFCmpOLT(L, R), R, L))); + EXPECT_TRUE(m_OrdOrUnordFMax(m_Value(MatchL), m_Value(MatchR)) + .match(IRB.CreateSelect(IRB.CreateFCmpULT(L, R), R, L))); + EXPECT_EQ(L, MatchL); + EXPECT_EQ(R, MatchR); +} + TEST_F(PatternMatchTest, OverflowingBinOps) { Value *L = IRB.getInt32(1); Value *R = IRB.getInt32(2); diff --git a/llvm/unittests/SandboxIR/PassTest.cpp b/llvm/unittests/SandboxIR/PassTest.cpp index ae7284ecf2deb9a857282b0cc3b67bd7c83eb74c..866bd8233d8035d1f4626432c904b197a46c0ac5 100644 --- a/llvm/unittests/SandboxIR/PassTest.cpp +++ b/llvm/unittests/SandboxIR/PassTest.cpp @@ -265,39 +265,45 @@ define void @f() { "f"); class FooPass final : public FunctionPass { std::string &Str; + std::string Args; public: - FooPass(std::string &Str) : FunctionPass("foo-pass"), Str(Str) {} + FooPass(std::string &Str, llvm::StringRef Args) + : FunctionPass("foo-pass"), Str(Str), Args(Args.str()) {} bool runOnFunction(Function &F) final { - Str += "foo"; + Str += "foo<" + Args + ">"; return false; } }; class BarPass final : public FunctionPass { std::string &Str; + std::string Args; public: - BarPass(std::string &Str) : FunctionPass("bar-pass"), Str(Str) {} + BarPass(std::string &Str, llvm::StringRef Args) + : FunctionPass("bar-pass"), Str(Str), Args(Args.str()) {} bool runOnFunction(Function &F) final { - Str += "bar"; + Str += "bar<" + Args + ">"; return false; } }; std::string Str; auto CreatePass = - [&Str](llvm::StringRef Name) -> std::unique_ptr<FunctionPass> { + [&Str](llvm::StringRef Name, + llvm::StringRef Args) -> std::unique_ptr<FunctionPass> { if (Name == "foo") - return std::make_unique<FooPass>(Str); + return std::make_unique<FooPass>(Str, Args); if (Name == "bar") - return std::make_unique<BarPass>(Str); + return std::make_unique<BarPass>(Str, Args); return nullptr; }; FunctionPassManager FPM("test-fpm"); - FPM.setPassPipeline("foo,bar,foo", CreatePass); + FPM.setPassPipeline("foo<abc>,bar<nested1<nested2<nested3>>>,foo", + CreatePass); FPM.runOnFunction(*F); - EXPECT_EQ(Str, "foobarfoo"); + EXPECT_EQ(Str, "foo<abc>bar<nested1<nested2<nested3>>>foo<>");
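+  // Informal sketch of the pipeline grammar the strings above exercise +  // (inferred from the cases in this test, not a normative spec): +  //   PIPELINE := PASS (',' PASS)* +  //   PASS := NAME ['<' ARGS '>'] +  // where ARGS is forwarded to the pass verbatim and may itself contain +  // balanced '<...>' pairs, as in the nested case above.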
// A second call to setPassPipeline will trigger an assertion in debug mode. #ifndef NDEBUG @@ -308,8 +314,32 @@ define void @f() { // Fresh PM for the death tests so they die from bad pipeline strings, rather // than from multiple setPassPipeline calls. FunctionPassManager FPM2("test-fpm"); + // Bad/empty pass names. EXPECT_DEATH(FPM2.setPassPipeline("bad-pass-name", CreatePass), ".*not registered.*"); - EXPECT_DEATH(FPM2.setPassPipeline("", CreatePass), ".*not registered.*"); - EXPECT_DEATH(FPM2.setPassPipeline(",", CreatePass), ".*not registered.*"); + EXPECT_DEATH(FPM2.setPassPipeline(",", CreatePass), ".*empty pass name.*"); + EXPECT_DEATH(FPM2.setPassPipeline("<>", CreatePass), ".*empty pass name.*"); + EXPECT_DEATH(FPM2.setPassPipeline("<>foo", CreatePass), + ".*empty pass name.*"); + EXPECT_DEATH(FPM2.setPassPipeline("foo,<>", CreatePass), + ".*empty pass name.*"); + + // Mismatched argument brackets. + EXPECT_DEATH(FPM2.setPassPipeline("foo<", CreatePass), ".*Missing '>'.*"); + EXPECT_DEATH(FPM2.setPassPipeline("foo<bar", CreatePass), ".*Missing '>'.*"); + EXPECT_DEATH(FPM2.setPassPipeline("foo<bar<>", CreatePass), + ".*Missing '>'.*"); + EXPECT_DEATH(FPM2.setPassPipeline("foo>", CreatePass), ".*Unexpected '>'.*"); + EXPECT_DEATH(FPM2.setPassPipeline(">foo", CreatePass), ".*Unexpected '>'.*"); + // Extra garbage between args and next delimiter/end-of-string. + EXPECT_DEATH(FPM2.setPassPipeline("foo<bar<>>", CreatePass), + ".*Expected delimiter.*"); + EXPECT_DEATH(FPM2.setPassPipeline("bar<>foo", CreatePass), + ".*Expected delimiter.*"); + EXPECT_DEATH(FPM2.setPassPipeline("bar<>foo,baz", CreatePass), + ".*Expected delimiter.*"); + EXPECT_DEATH(FPM2.setPassPipeline("foo<args><more>", CreatePass), + ".*Expected delimiter.*"); + EXPECT_DEATH(FPM2.setPassPipeline("foo<args>bar", CreatePass), + ".*Expected delimiter.*"); } diff --git a/llvm/unittests/Support/FormatVariadicTest.cpp b/llvm/unittests/Support/FormatVariadicTest.cpp index e745f99a5a6c4eec7f450a9ac245b9c21ecc8037..03102c96d46629caab51629496e7fb9977125d13 100644 --- a/llvm/unittests/Support/FormatVariadicTest.cpp +++ b/llvm/unittests/Support/FormatVariadicTest.cpp @@ -150,7 +150,7 @@ TEST(FormatVariadicTest, ValidReplacementSequence) { EXPECT_EQ(0u, Replacements[0].Index); EXPECT_EQ(3u, Replacements[0].Width); EXPECT_EQ(AlignStyle::Left, Replacements[0].Where); - EXPECT_EQ("foo", Replacements[0].Options); + EXPECT_EQ(" foo ", Replacements[0].Options); // 8. Everything after the first option specifier is part of the style, even // if it contains another option specifier. diff --git a/llvm/unittests/Support/ModRefTest.cpp b/llvm/unittests/Support/ModRefTest.cpp index 35107e50b32db78545dc78b8781b1884cb9114dd..f77e7e39e14eab3384aa4cab554463eee603b73e 100644 --- a/llvm/unittests/Support/ModRefTest.cpp +++ b/llvm/unittests/Support/ModRefTest.cpp @@ -1,27 +1,27 @@ -//===- llvm/unittest/Support/ModRefTest.cpp - ModRef tests ----------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "llvm/Support/ModRef.h" -#include "llvm/ADT/SmallString.h" -#include "llvm/Support/raw_ostream.h" -#include "gtest/gtest.h" -#include <string> - -using namespace llvm; - -namespace { - -// Verify that printing a MemoryEffects does not end with a ,.
-TEST(ModRefTest, PrintMemoryEffects) { - std::string S; - raw_string_ostream OS(S); - OS << MemoryEffects::none(); - EXPECT_EQ(S, "ArgMem: NoModRef, InaccessibleMem: NoModRef, Other: NoModRef"); -} - -} // namespace +//===- llvm/unittest/Support/ModRefTest.cpp - ModRef tests ----------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/Support/ModRef.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/Support/raw_ostream.h" +#include "gtest/gtest.h" +#include <string> + +using namespace llvm; + +namespace { + +// Verify that printing a MemoryEffects does not end with a ,. +TEST(ModRefTest, PrintMemoryEffects) { + std::string S; + raw_string_ostream OS(S); + OS << MemoryEffects::none(); + EXPECT_EQ(S, "ArgMem: NoModRef, InaccessibleMem: NoModRef, Other: NoModRef"); +} + +} // namespace diff --git a/llvm/unittests/TargetParser/TargetParserTest.cpp b/llvm/unittests/TargetParser/TargetParserTest.cpp index 87b78d502780d1fd420955f0f5376034d84b2d45..369e534634801424ebf20e366cde440bc38ca850 100644 --- a/llvm/unittests/TargetParser/TargetParserTest.cpp +++ b/llvm/unittests/TargetParser/TargetParserTest.cpp @@ -1326,8 +1326,12 @@ TEST(TargetParserTest, AArch64ExtensionFeatures) { AArch64::AEK_CPA, AArch64::AEK_PAUTHLR, AArch64::AEK_TLBIW, AArch64::AEK_JSCVT, AArch64::AEK_FCMA, AArch64::AEK_FP8, - AArch64::AEK_SVEB16B16, - }; + AArch64::AEK_SVEB16B16, AArch64::AEK_SVE2P2, + AArch64::AEK_SME2P2, AArch64::AEK_SVE_BFSCALE, + AArch64::AEK_SVE_F16F32MM, AArch64::AEK_SVE_AES2, + AArch64::AEK_SSVE_AES, AArch64::AEK_F8F32MM, + AArch64::AEK_F8F16MM, AArch64::AEK_LSFE, + AArch64::AEK_FPRCVT, AArch64::AEK_CMPBR}; std::vector<StringRef> Features; @@ -1359,12 +1363,17 @@ TEST(TargetParserTest, AArch64ExtensionFeatures) { EXPECT_TRUE(llvm::is_contained(Features, "+ras")); EXPECT_TRUE(llvm::is_contained(Features, "+sve")); EXPECT_TRUE(llvm::is_contained(Features, "+sve-b16b16")); + EXPECT_TRUE(llvm::is_contained(Features, "+sve-bfscale")); + EXPECT_TRUE(llvm::is_contained(Features, "+sve-f16f32mm")); EXPECT_TRUE(llvm::is_contained(Features, "+sve2")); EXPECT_TRUE(llvm::is_contained(Features, "+sve2-aes")); EXPECT_TRUE(llvm::is_contained(Features, "+sve2-sm4")); EXPECT_TRUE(llvm::is_contained(Features, "+sve2-sha3")); EXPECT_TRUE(llvm::is_contained(Features, "+sve2-bitperm")); + EXPECT_TRUE(llvm::is_contained(Features, "+sve-aes2")); + EXPECT_TRUE(llvm::is_contained(Features, "+ssve-aes")); EXPECT_TRUE(llvm::is_contained(Features, "+sve2p1")); + EXPECT_TRUE(llvm::is_contained(Features, "+sve2p2")); EXPECT_TRUE(llvm::is_contained(Features, "+rcpc")); EXPECT_TRUE(llvm::is_contained(Features, "+rand")); EXPECT_TRUE(llvm::is_contained(Features, "+mte")); @@ -1387,6 +1396,7 @@ TEST(TargetParserTest, AArch64ExtensionFeatures) { EXPECT_TRUE(llvm::is_contained(Features, "+sme2")); EXPECT_TRUE(llvm::is_contained(Features, "+sme-b16b16")); EXPECT_TRUE(llvm::is_contained(Features, "+sme2p1")); + EXPECT_TRUE(llvm::is_contained(Features, "+sme2p2")); EXPECT_TRUE(llvm::is_contained(Features, "+hbc")); EXPECT_TRUE(llvm::is_contained(Features, "+mops")); EXPECT_TRUE(llvm::is_contained(Features, "+perfmon")); @@ -1406,6 +1416,8 @@ TEST(TargetParserTest, AArch64ExtensionFeatures) { EXPECT_TRUE(llvm::is_contained(Features, "+ssve-fp8dot2")); EXPECT_TRUE(llvm::is_contained(Features,
"+fp8dot4")); EXPECT_TRUE(llvm::is_contained(Features, "+ssve-fp8dot4")); + EXPECT_TRUE(llvm::is_contained(Features, "+f8f32mm")); + EXPECT_TRUE(llvm::is_contained(Features, "+f8f16mm")); EXPECT_TRUE(llvm::is_contained(Features, "+lut")); EXPECT_TRUE(llvm::is_contained(Features, "+sme-lutv2")); EXPECT_TRUE(llvm::is_contained(Features, "+sme-f8f16")); @@ -1416,6 +1428,9 @@ TEST(TargetParserTest, AArch64ExtensionFeatures) { EXPECT_TRUE(llvm::is_contained(Features, "+tlbiw")); EXPECT_TRUE(llvm::is_contained(Features, "+jsconv")); EXPECT_TRUE(llvm::is_contained(Features, "+complxnum")); + EXPECT_TRUE(llvm::is_contained(Features, "+lsfe")); + EXPECT_TRUE(llvm::is_contained(Features, "+fprcvt")); + EXPECT_TRUE(llvm::is_contained(Features, "+cmpbr")); // Assuming we listed every extension above, this should produce the same // result. @@ -1513,12 +1528,17 @@ TEST(TargetParserTest, AArch64ArchExtFeature) { {"rdm", "nordm", "+rdm", "-rdm"}, {"sve", "nosve", "+sve", "-sve"}, {"sve-b16b16", "nosve-b16b16", "+sve-b16b16", "-sve-b16b16"}, + {"sve-bfscale", "nosve-bfscale", "+sve-bfscale", "-sve-bfscale"}, + {"sve-f16f32mm", "nosve-f16f32mm", "+sve-f16f32mm", "-sve-f16f32mm"}, {"sve2", "nosve2", "+sve2", "-sve2"}, {"sve2-aes", "nosve2-aes", "+sve2-aes", "-sve2-aes"}, {"sve2-sm4", "nosve2-sm4", "+sve2-sm4", "-sve2-sm4"}, {"sve2-sha3", "nosve2-sha3", "+sve2-sha3", "-sve2-sha3"}, {"sve2p1", "nosve2p1", "+sve2p1", "-sve2p1"}, + {"sve2p2", "nosve2p2", "+sve2p2", "-sve2p2"}, {"sve2-bitperm", "nosve2-bitperm", "+sve2-bitperm", "-sve2-bitperm"}, + {"sve-aes2", "nosve-aes2", "+sve-aes2", "-sve-aes2"}, + {"ssve-aes", "nossve-aes", "+ssve-aes", "-ssve-aes"}, {"dotprod", "nodotprod", "+dotprod", "-dotprod"}, {"rcpc", "norcpc", "+rcpc", "-rcpc"}, {"rng", "norng", "+rand", "-rand"}, @@ -1531,6 +1551,8 @@ TEST(TargetParserTest, AArch64ArchExtFeature) { {"i8mm", "noi8mm", "+i8mm", "-i8mm"}, {"f32mm", "nof32mm", "+f32mm", "-f32mm"}, {"f64mm", "nof64mm", "+f64mm", "-f64mm"}, + {"f8f32mm", "nof8f32mm", "+f8f32mm", "-f8f32mm"}, + {"f8f16mm", "nof8f16mm", "+f8f16mm", "-f8f16mm"}, {"sme", "nosme", "+sme", "-sme"}, {"sme-fa64", "nosme-fa64", "+sme-fa64", "-sme-fa64"}, {"sme-f64f64", "nosme-f64f64", "+sme-f64f64", "-sme-f64f64"}, @@ -1539,6 +1561,7 @@ TEST(TargetParserTest, AArch64ArchExtFeature) { {"sme2", "nosme2", "+sme2", "-sme2"}, {"sme-b16b16", "nosme-b16b16", "+sme-b16b16", "-sme-b16b16"}, {"sme2p1", "nosme2p1", "+sme2p1", "-sme2p1"}, + {"sme2p2", "nosme2p2", "+sme2p2", "-sme2p2"}, {"hbc", "nohbc", "+hbc", "-hbc"}, {"mops", "nomops", "+mops", "-mops"}, {"pmuv3", "nopmuv3", "+perfmon", "-perfmon"}, @@ -1557,7 +1580,9 @@ TEST(TargetParserTest, AArch64ArchExtFeature) { {"sme-lutv2", "nosme-lutv2", "+sme-lutv2", "-sme-lutv2"}, {"sme-f8f16", "nosme-f8f16", "+sme-f8f16", "-sme-f8f16"}, {"sme-f8f32", "nosme-f8f32", "+sme-f8f32", "-sme-f8f32"}, - }; + {"lsfe", "nolsfe", "+lsfe", "-lsfe"}, + {"fprcvt", "nofprcvt", "+fprcvt", "-fprcvt"}, + {"cmpbr", "nocmpbr", "+cmpbr", "-cmpbr"}}; for (unsigned i = 0; i < std::size(ArchExt); i++) { EXPECT_EQ(StringRef(ArchExt[i][2]), @@ -1783,7 +1808,7 @@ AArch64ExtensionDependenciesBaseArchTestParams {AArch64::ARMV8A, {"nosve", "f64mm"}, {"sve", "f64mm"}, {}}, {AArch64::ARMV8A, {"f64mm", "nosve"}, {}, {"sve", "f64mm"}}, - // sve2 -> {sve2p1, sve2-bitperm, sve2-aes, sve2-sha3, sve2-sm4} + // sve2 -> {sve2p1, sve2-bitperm, sve2-sha3, sve2-sm4} {AArch64::ARMV8A, {"nosve2", "sve2p1"}, {"sve2", "sve2p1"}, {}}, {AArch64::ARMV8A, {"sve2p1", "nosve2"}, {}, {"sve2", "sve2p1"}}, {AArch64::ARMV8A, @@ 
-1794,23 +1819,25 @@ AArch64ExtensionDependenciesBaseArchTestParams {"sve2-bitperm", "nosve2"}, {}, {"sve2", "sve2-bitperm"}}, - {AArch64::ARMV8A, {"nosve2", "sve2-aes"}, {"sve2", "sve2-aes"}, {}}, - {AArch64::ARMV8A, {"sve2-aes", "nosve2"}, {}, {"sve2", "sve2-aes"}}, {AArch64::ARMV8A, {"nosve2", "sve2-sha3"}, {"sve2", "sve2-sha3"}, {}}, {AArch64::ARMV8A, {"sve2-sha3", "nosve2"}, {}, {"sve2", "sve2-sha3"}}, {AArch64::ARMV8A, {"nosve2", "sve2-sm4"}, {"sve2", "sve2-sm4"}, {}}, {AArch64::ARMV8A, {"sve2-sm4", "nosve2"}, {}, {"sve2", "sve2-sm4"}}, // sve-b16b16 -> {sme-b16b16} - {AArch64::ARMV8A, + {AArch64::ARMV9_4A, {"nosve-b16b16", "sme-b16b16"}, {"sve-b16b16", "sme-b16b16"}, {}}, - {AArch64::ARMV8A, + {AArch64::ARMV9_4A, {"sme-b16b16", "nosve-b16b16"}, {}, {"sve-b16b16", "sme-b16b16"}}, + // sve2p1 -> {sve2p2} + {AArch64::ARMV9_6A, {"nosve2p1", "sve2p2"}, {"sve2p1", "sve2p2"}, {}}, + {AArch64::ARMV9_6A, {"sve2p2", "nosve2p1"}, {}, {"sve2p1", "sve2p2"}}, + // sme -> {sme2, sme-f16f16, sme-f64f64, sme-i16i64, sme-fa64} {AArch64::ARMV8A, {"nosme", "sme2"}, {"sme", "sme2"}, {}}, {AArch64::ARMV8A, {"sme2", "nosme"}, {}, {"sme", "sme2"}}, @@ -1858,6 +1885,10 @@ AArch64ExtensionDependenciesBaseArchTestParams {AArch64::ARMV8A, {"nosme2", "sme-b16b16"}, {"sme2", "sme-b16b16"}, {}}, {AArch64::ARMV8A, {"sme-b16b16", "nosme2"}, {}, {"sme2", "sme-b16b16"}}, + // sme2p1 -> {sme2p2} + {AArch64::ARMV9_6A, {"nosme2p1", "sme2p2"}, {"sme2p2", "sme2p1"}, {}}, + {AArch64::ARMV9_6A, {"sme2p2", "nosme2p1"}, {}, {"sme2p1", "sme2p2"}}, + // fp8 -> {sme-f8f16, sme-f8f32} {AArch64::ARMV8A, {"nofp8", "sme-f8f16"}, {"fp8", "sme-f8f16"}, {}}, {AArch64::ARMV8A, {"sme-f8f16", "nofp8"}, {}, {"fp8", "sme-f8f16"}}, diff --git a/llvm/unittests/Transforms/Instrumentation/PGOInstrumentationTest.cpp b/llvm/unittests/Transforms/Instrumentation/PGOInstrumentationTest.cpp index a4c076a8752fc35c8f83c2da7d8a386d55c23bf8..9ccb13934cbd3811ff72df46a391531442c5361a 100644 --- a/llvm/unittests/Transforms/Instrumentation/PGOInstrumentationTest.cpp +++ b/llvm/unittests/Transforms/Instrumentation/PGOInstrumentationTest.cpp @@ -103,13 +103,9 @@ public: ModuleAnalysisManager::Invalidator &)); }; -template <typename T> struct PGOTestName { - std::string operator()(const TestParamInfo<typename T::ParamType> &Info) const { - return std::get<1>(Info.param).str(); - } -}; - -struct PGOInstrumentationGenTest : public Test { +struct PGOInstrumentationGenTest + : public Test, + WithParamInterface<std::tuple<StringRef, StringRef>> { ModulePassManager MPM; PassBuilder PB; MockModuleAnalysisHandle MMAHandle; @@ -145,47 +141,12 @@ struct PGOInstrumentationGenTest : public Test { } }; -struct PGOInstrumentationGenInstrumentTest - : PGOInstrumentationGenTest, - WithParamInterface<std::tuple<StringRef, StringRef>> {}; - static constexpr StringRef CodeWithFuncDefs = R"( define i32 @f(i32 %n) { entry: ret i32 0 })"; -INSTANTIATE_TEST_SUITE_P( - PGOInstrumetationGenTestSuite, PGOInstrumentationGenInstrumentTest, - Values(std::make_tuple(CodeWithFuncDefs, "instrument_function_defs")), - PGOTestName<PGOInstrumentationGenInstrumentTest>()); - -TEST_P(PGOInstrumentationGenInstrumentTest, Instrumented) { - const StringRef Code = std::get<0>(GetParam()); - parseAssembly(Code); - - ASSERT_THAT(M, NotNull()); - - Sequence PassSequence; - EXPECT_CALL(MMAHandle, run(Ref(*M), _)) - .InSequence(PassSequence) - .WillOnce(DoDefault()); - EXPECT_CALL(MMAHandle, invalidate(Ref(*M), _, _)) - .InSequence(PassSequence) - .WillOnce(DoDefault()); - - MPM.run(*M, MAM); - - const auto *IRInstrVar = - M->getNamedGlobal(INSTR_PROF_QUOTE(INSTR_PROF_RAW_VERSION_VAR)); - ASSERT_THAT(IRInstrVar, NotNull()); -
EXPECT_FALSE(IRInstrVar->isDeclaration()); -} - -struct PGOInstrumentationGenIgnoreTest - : PGOInstrumentationGenTest, - WithParamInterface<std::tuple<StringRef, StringRef>> {}; - static constexpr StringRef CodeWithFuncDecls = R"( declare i32 @f(i32); )"; @@ -196,26 +157,33 @@ static constexpr StringRef CodeWithGlobals = R"( )"; INSTANTIATE_TEST_SUITE_P( - PGOInstrumetationGenIgnoreTestSuite, PGOInstrumentationGenIgnoreTest, - Values(std::make_tuple(CodeWithFuncDecls, "instrument_function_decls"), + PGOInstrumetationGenTestSuite, PGOInstrumentationGenTest, + Values(std::make_tuple(CodeWithFuncDefs, "instrument_function_defs"), + std::make_tuple(CodeWithFuncDecls, "instrument_function_decls"), std::make_tuple(CodeWithGlobals, "instrument_globals")), - PGOTestName<PGOInstrumentationGenIgnoreTest>()); + [](const TestParamInfo<PGOInstrumentationGenTest::ParamType> &Info) { + return std::get<1>(Info.param).str(); + }); -TEST_P(PGOInstrumentationGenIgnoreTest, NotInstrumented) { +TEST_P(PGOInstrumentationGenTest, Instrumented) { const StringRef Code = std::get<0>(GetParam()); - parseAssembly(Code); ASSERT_THAT(M, NotNull()); - EXPECT_CALL(MMAHandle, run(Ref(*M), _)).WillOnce(DoDefault()); - EXPECT_CALL(MMAHandle, invalidate(Ref(*M), _, _)).Times(0); + Sequence PassSequence; + EXPECT_CALL(MMAHandle, run(Ref(*M), _)) + .InSequence(PassSequence) + .WillOnce(DoDefault()); + EXPECT_CALL(MMAHandle, invalidate(Ref(*M), _, _)) + .InSequence(PassSequence) + .WillOnce(DoDefault()); MPM.run(*M, MAM); const auto *IRInstrVar = M->getNamedGlobal(INSTR_PROF_QUOTE(INSTR_PROF_RAW_VERSION_VAR)); - ASSERT_THAT(IRInstrVar, NotNull()); + EXPECT_THAT(IRInstrVar, NotNull()); EXPECT_FALSE(IRInstrVar->isDeclaration()); } diff --git a/llvm/unittests/Transforms/Vectorize/VPlanHCFGTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanHCFGTest.cpp index 4926afbfc6d8ce629592da0cb86ee92ec82d0c6f..00a3c737c0e47a493f473d56ef064e564ef5dd3f 100644 --- a/llvm/unittests/Transforms/Vectorize/VPlanHCFGTest.cpp +++ b/llvm/unittests/Transforms/Vectorize/VPlanHCFGTest.cpp @@ -133,8 +133,8 @@ compound=true N2 -> N4 [ label="" ltail=cluster_N3] N4 [label = "middle.block:\l" + - " EMIT vp\<%1\> = icmp eq ir\<%N\>, vp\<%0\>\l" + - " EMIT branch-on-cond vp\<%1\>\l" + + " EMIT vp\<%cmp.n\> = icmp eq ir\<%N\>, vp\<%0\>\l" + + " EMIT branch-on-cond vp\<%cmp.n\>\l" + "Successor(s): ir-bb\, scalar.ph\l" ] N4 -> N5 [ label="T"] diff --git a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp index 376b00224eb5740047a7136e797361af901f2994..0f170efac207b7732627408671a8971f34985ba9 100644 --- a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp +++ b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp @@ -1279,6 +1279,21 @@ TEST(VPRecipeTest, dumpRecipeInPlan) { }, testing::ExitedWithCode(0), "WIDEN ir<%a> = add ir<1>, ir<2>"); + VPDef *Def = WidenR; + EXPECT_EXIT( + { + Def->dump(); + exit(0); + }, + testing::ExitedWithCode(0), "WIDEN ir<%a> = add ir<1>, ir<2>"); + + EXPECT_EXIT( + { + WidenR->dump(); + exit(0); + }, + testing::ExitedWithCode(0), "WIDEN ir<%a> = add ir<1>, ir<2>"); + + // Test VPRecipeBase::dump().
VPRecipeBase *R = WidenR; EXPECT_EXIT( diff --git a/llvm/utils/LLVMVisualizers/llvm.natvis b/llvm/utils/LLVMVisualizers/llvm.natvis index d83ae8013c51e2791a09c3976f67d2be8d525f1e..03ca2d33a80ba686915456eb8dd1046bd4a38fe7 100644 --- a/llvm/utils/LLVMVisualizers/llvm.natvis +++ b/llvm/utils/LLVMVisualizers/llvm.natvis @@ -1,408 +1,408 @@ - - - - - empty - {(value_type*)BeginX,[Size]} - {Size} elements - Uninitialized - - Size - Capacity - - Size - (value_type*)BeginX - - - - - - {U.VAL} - Cannot visualize APInts longer than 64 bits - - - {Data,[Length]} - {Length} elements - Uninitialized - - Length - - Length - Data - - - - - {(const char*)BeginX,[Size]s8} - (const char*)BeginX,[Size] - - Size - Capacity - - Size - (char*)BeginX - - - - - - {First,[Last - First]s8} - - - - {Data,[Length]s8} - Data,[Length]s8 - - Length - - Length - Data - - - - - - {($T1)*(intptr_t *)Data} - - - - - - {($T1)(*(intptr_t *)Value.Data & $T6::PointerBitMask)} - {($T4)((*(intptr_t *)Value.Data >> $T6::IntShift) & $T6::IntMask)} - {$T6::IntMask}: {($T1)(*(intptr_t *)Value.Data & $T6::PointerBitMask)} [{($T4)((*(intptr_t *)Value.Data >> $T6::IntShift) & $T6::IntMask)}] - - ($T1)(*(intptr_t *)Value.Data & $T6::PointerBitMask) - ($T4)((*(intptr_t *)Value.Data >> $T6::IntShift) & $T6::IntMask) - - - - - {($T1)(*(intptr_t *)Value.Data & $T5::PointerBitMask)} - {((*(intptr_t *)Value.Data >> $T5::IntShift) & $T5::IntMask)} - {$T5::IntMask}: {($T1)(*(intptr_t *)Value.Data & $T5::PointerBitMask)} [{((*(intptr_t *)Value.Data >> $T5::IntShift) & $T5::IntMask)}] - - ($T1)(*(intptr_t *)Value.Data & $T5::PointerBitMask) - ((*(intptr_t *)Value.Data >> $T5::IntShift) & $T5::IntMask) - - - - - - {($T4)(*(intptr_t *)Val.Value.Data & $T2::InfoTy::PointerBitMask)} - - - {($T5)(*(intptr_t *)Val.Value.Data & $T2::InfoTy::PointerBitMask)} - - Unexpected index in PointerUnion: {(*(intptr_t *)Val.Value.Data>>$T2::InfoTy::IntShift) & $T2::InfoTy::IntMask} - - "$T4",s8b - - ($T4)(*(intptr_t *)Val.Value.Data & $T2::InfoTy::PointerBitMask) - - "$T5",s8b - - ($T5)(*(intptr_t *)Val.Value.Data & $T2::InfoTy::PointerBitMask) - - - - - - {{ empty }} - {{ head={Head} }} - - - Head - Next - this - - - - - - empty - RefPtr [1 ref] {*Obj} - RefPtr [{Obj->RefCount} refs] {*Obj} - - Obj->RefCount - Obj - - - - - {{ [Small Mode] size={NumNonEmpty}, capacity={CurArraySize} }} - {{ [Big Mode] size={NumNonEmpty}, capacity={CurArraySize} }} - - NumNonEmpty - CurArraySize - - NumNonEmpty - ($T1*)CurArray - - - - - - empty - {{ size={NumEntries}, buckets={NumBuckets} }} - - NumEntries - NumBuckets - - NumBuckets - Buckets - - - - - - {{ size={NumItems}, buckets={NumBuckets} }} - - NumItems - NumBuckets - - NumBuckets - (MapEntryTy**)TheTable - - - - - - empty - ({this+1,s8}, {second}) - - this+1,s - second - - - - - {Data} - - - - None - {Storage.value} - - Storage.value - - - - - Error - {*((storage_type *)TStorage.buffer)} - - *((storage_type *)TStorage.buffer) - *((error_type *)ErrorStorage.buffer) - - - - - - - {{little endian value = {*(($T1*)(unsigned char *)Value.buffer)} }} - - (unsigned char *)Value.buffer,1 - (unsigned char *)Value.buffer,2 - (unsigned char *)Value.buffer,4 - (unsigned char *)Value.buffer,8 - - - - - - {{ big endian value = {*(unsigned char *)Value.buffer} }} - {{ big endian value = {(($T1)(*(unsigned char *)Value.buffer) << 8) - | ($T1)(*((unsigned char *)Value.buffer+1))} }} - {{ big endian value = {(($T1)(*(unsigned char *)Value.buffer) << 24) - | (($T1)(*((unsigned char *)Value.buffer+1)) << 16) - | (($T1)(*((unsigned char 
*)Value.buffer+2)) << 8) - | ($T1)(*((unsigned char *)Value.buffer+3))} }} - {{ big endian value = {(($T1)(*(unsigned char *)Value.buffer) << 56) - | (($T1)(*((unsigned char *)Value.buffer+1)) << 48) - | (($T1)(*((unsigned char *)Value.buffer+2)) << 40) - | (($T1)(*((unsigned char *)Value.buffer+3)) << 32) - | (($T1)(*((unsigned char *)Value.buffer+4)) << 24) - | (($T1)(*((unsigned char *)Value.buffer+5)) << 16) - | (($T1)(*((unsigned char *)Value.buffer+6)) << 8) - | ($T1)(*((unsigned char *)Value.buffer+7))} }} - - (unsigned char *)Value.buffer,1 - (unsigned char *)Value.buffer,2 - (unsigned char *)Value.buffer,4 - (unsigned char *)Value.buffer,8 - - - - - {ID} - - ID - - SubclassData - - *ContainedTys - - {NumContainedTys - 1} - - - NumContainedTys - 1 - ContainedTys + 1 - - - - SubclassData == 1 - - (SubclassData & llvm::StructType::SCDB_HasBody) != 0 - (SubclassData & llvm::StructType::SCDB_Packed) != 0 - (SubclassData & llvm::StructType::SCDB_IsLiteral) != 0 - (SubclassData & llvm::StructType::SCDB_IsSized) != 0 - - {NumContainedTys} - - - NumContainedTys - ContainedTys - - - - - *ContainedTys - ((llvm::ArrayType*)this)->NumElements - - *ContainedTys - ((llvm::VectorType*)this)->ElementQuantity - - *ContainedTys - ((llvm::VectorType*)this)->ElementQuantity - - SubclassData - *ContainedTys - - Context - - - - - $(Type) {*Value} - - - - $(Type) {(llvm::ISD::NodeType)this->NodeType} - - - NumOperands - OperandList - - - - - - i{Val.BitWidth} {Val.VAL} - - - - {IDAndSubclassData >> 8}bit integer type - - - - $(Type) {*VTy} {this->getName()} {SubclassData} - $(Type) {*VTy} anon {SubclassData} - - (Instruction*)this - (User*)this - - UseList - Next - Prev.Value & 3 == 3 ? (User*)(this + 1) : (User*)(this + 2) - - - - - - - Val - - - - - - - $(Type) {*VTy} {this->getName()} {SubclassData} - $(Type) {*VTy} anon {SubclassData} - - (Value*)this,nd - *VTy - - NumUserOperands - (llvm::Use*)this - NumUserOperands - - - NumUserOperands - *((llvm::Use**)this - 1) - - - - - - {getOpcodeName(SubclassID - InstructionVal)} - - (User*)this,nd - - - - - {this->getName()} {(LinkageTypes)Linkage} {(VisibilityTypes)Visibility} {(DLLStorageClassTypes)DllStorageClass} {(llvm::GlobalValue::ThreadLocalMode) ThreadLocal} - - - - - - - this - Next - this - - - - - - - pImpl - - - - - {ModuleID,s8} {TargetTriple} - - - - $(Type) {PassID} {Kind} - - + + + + + empty + {(value_type*)BeginX,[Size]} + {Size} elements + Uninitialized + + Size + Capacity + + Size + (value_type*)BeginX + + + + + + {U.VAL} + Cannot visualize APInts longer than 64 bits + + + {Data,[Length]} + {Length} elements + Uninitialized + + Length + + Length + Data + + + + + {(const char*)BeginX,[Size]s8} + (const char*)BeginX,[Size] + + Size + Capacity + + Size + (char*)BeginX + + + + + + {First,[Last - First]s8} + + + + {Data,[Length]s8} + Data,[Length]s8 + + Length + + Length + Data + + + + + + {($T1)*(intptr_t *)Data} + + + + + + {($T1)(*(intptr_t *)Value.Data & $T6::PointerBitMask)} + {($T4)((*(intptr_t *)Value.Data >> $T6::IntShift) & $T6::IntMask)} + {$T6::IntMask}: {($T1)(*(intptr_t *)Value.Data & $T6::PointerBitMask)} [{($T4)((*(intptr_t *)Value.Data >> $T6::IntShift) & $T6::IntMask)}] + + ($T1)(*(intptr_t *)Value.Data & $T6::PointerBitMask) + ($T4)((*(intptr_t *)Value.Data >> $T6::IntShift) & $T6::IntMask) + + + + + {($T1)(*(intptr_t *)Value.Data & $T5::PointerBitMask)} + {((*(intptr_t *)Value.Data >> $T5::IntShift) & $T5::IntMask)} + {$T5::IntMask}: {($T1)(*(intptr_t *)Value.Data & $T5::PointerBitMask)} [{((*(intptr_t *)Value.Data >> 
$T5::IntShift) & $T5::IntMask)}] + + ($T1)(*(intptr_t *)Value.Data & $T5::PointerBitMask) + ((*(intptr_t *)Value.Data >> $T5::IntShift) & $T5::IntMask) + + + + + + {($T4)(*(intptr_t *)Val.Value.Data & $T2::InfoTy::PointerBitMask)} + + + {($T5)(*(intptr_t *)Val.Value.Data & $T2::InfoTy::PointerBitMask)} + + Unexpected index in PointerUnion: {(*(intptr_t *)Val.Value.Data>>$T2::InfoTy::IntShift) & $T2::InfoTy::IntMask} + + "$T4",s8b + + ($T4)(*(intptr_t *)Val.Value.Data & $T2::InfoTy::PointerBitMask) + + "$T5",s8b + + ($T5)(*(intptr_t *)Val.Value.Data & $T2::InfoTy::PointerBitMask) + + + + + + {{ empty }} + {{ head={Head} }} + + + Head + Next + this + + + + + + empty + RefPtr [1 ref] {*Obj} + RefPtr [{Obj->RefCount} refs] {*Obj} + + Obj->RefCount + Obj + + + + + {{ [Small Mode] size={NumNonEmpty}, capacity={CurArraySize} }} + {{ [Big Mode] size={NumNonEmpty}, capacity={CurArraySize} }} + + NumNonEmpty + CurArraySize + + NumNonEmpty + ($T1*)CurArray + + + + + + empty + {{ size={NumEntries}, buckets={NumBuckets} }} + + NumEntries + NumBuckets + + NumBuckets + Buckets + + + + + + {{ size={NumItems}, buckets={NumBuckets} }} + + NumItems + NumBuckets + + NumBuckets + (MapEntryTy**)TheTable + + + + + + empty + ({this+1,s8}, {second}) + + this+1,s + second + + + + + {Data} + + + + None + {Storage.value} + + Storage.value + + + + + Error + {*((storage_type *)TStorage.buffer)} + + *((storage_type *)TStorage.buffer) + *((error_type *)ErrorStorage.buffer) + + + + + + + {{little endian value = {*(($T1*)(unsigned char *)Value.buffer)} }} + + (unsigned char *)Value.buffer,1 + (unsigned char *)Value.buffer,2 + (unsigned char *)Value.buffer,4 + (unsigned char *)Value.buffer,8 + + + + + + {{ big endian value = {*(unsigned char *)Value.buffer} }} + {{ big endian value = {(($T1)(*(unsigned char *)Value.buffer) << 8) + | ($T1)(*((unsigned char *)Value.buffer+1))} }} + {{ big endian value = {(($T1)(*(unsigned char *)Value.buffer) << 24) + | (($T1)(*((unsigned char *)Value.buffer+1)) << 16) + | (($T1)(*((unsigned char *)Value.buffer+2)) << 8) + | ($T1)(*((unsigned char *)Value.buffer+3))} }} + {{ big endian value = {(($T1)(*(unsigned char *)Value.buffer) << 56) + | (($T1)(*((unsigned char *)Value.buffer+1)) << 48) + | (($T1)(*((unsigned char *)Value.buffer+2)) << 40) + | (($T1)(*((unsigned char *)Value.buffer+3)) << 32) + | (($T1)(*((unsigned char *)Value.buffer+4)) << 24) + | (($T1)(*((unsigned char *)Value.buffer+5)) << 16) + | (($T1)(*((unsigned char *)Value.buffer+6)) << 8) + | ($T1)(*((unsigned char *)Value.buffer+7))} }} + + (unsigned char *)Value.buffer,1 + (unsigned char *)Value.buffer,2 + (unsigned char *)Value.buffer,4 + (unsigned char *)Value.buffer,8 + + + + + {ID} + + ID + + SubclassData + + *ContainedTys + + {NumContainedTys - 1} + + + NumContainedTys - 1 + ContainedTys + 1 + + + + SubclassData == 1 + + (SubclassData & llvm::StructType::SCDB_HasBody) != 0 + (SubclassData & llvm::StructType::SCDB_Packed) != 0 + (SubclassData & llvm::StructType::SCDB_IsLiteral) != 0 + (SubclassData & llvm::StructType::SCDB_IsSized) != 0 + + {NumContainedTys} + + + NumContainedTys + ContainedTys + + + + + *ContainedTys + ((llvm::ArrayType*)this)->NumElements + + *ContainedTys + ((llvm::VectorType*)this)->ElementQuantity + + *ContainedTys + ((llvm::VectorType*)this)->ElementQuantity + + SubclassData + *ContainedTys + + Context + + + + + $(Type) {*Value} + + + + $(Type) {(llvm::ISD::NodeType)this->NodeType} + + + NumOperands + OperandList + + + + + + i{Val.BitWidth} {Val.VAL} + + + + {IDAndSubclassData >> 8}bit integer 
type + + + + $(Type) {*VTy} {this->getName()} {SubclassData} + $(Type) {*VTy} anon {SubclassData} + + (Instruction*)this + (User*)this + + UseList + Next + Prev.Value & 3 == 3 ? (User*)(this + 1) : (User*)(this + 2) + + + + + + + Val + + + + + + + $(Type) {*VTy} {this->getName()} {SubclassData} + $(Type) {*VTy} anon {SubclassData} + + (Value*)this,nd + *VTy + + NumUserOperands + (llvm::Use*)this - NumUserOperands + + + NumUserOperands + *((llvm::Use**)this - 1) + + + + + + {getOpcodeName(SubclassID - InstructionVal)} + + (User*)this,nd + + + + + {this->getName()} {(LinkageTypes)Linkage} {(VisibilityTypes)Visibility} {(DLLStorageClassTypes)DllStorageClass} {(llvm::GlobalValue::ThreadLocalMode) ThreadLocal} + + + + + + + this + Next + this + + + + + + + pImpl + + + + + {ModuleID,s8} {TargetTriple} + + + + $(Type) {PassID} {Kind} + + diff --git a/llvm/utils/gn/secondary/clang/unittests/Analysis/FlowSensitive/BUILD.gn b/llvm/utils/gn/secondary/clang/unittests/Analysis/FlowSensitive/BUILD.gn index 780a69f1f3299b2bbb32908233a30a87bfc46fec..1287bdd2bb8805fc9b51920620be5f2ff370fc0f 100644 --- a/llvm/utils/gn/secondary/clang/unittests/Analysis/FlowSensitive/BUILD.gn +++ b/llvm/utils/gn/secondary/clang/unittests/Analysis/FlowSensitive/BUILD.gn @@ -21,6 +21,7 @@ unittest("ClangAnalysisFlowSensitiveTests") { "ASTOpsTest.cpp", "ArenaTest.cpp", "CFGMatchSwitchTest.cpp", + "CachedConstAccessorsLatticeTest.cpp", "ChromiumCheckModelTest.cpp", "DataflowAnalysisContextTest.cpp", "DataflowEnvironmentTest.cpp", diff --git a/llvm/utils/gn/secondary/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/BUILD.gn b/llvm/utils/gn/secondary/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/BUILD.gn index 03e82576d67314190bd5b89424125ae6a2f202ce..f9249c208d99b79070ead1a43ed9bb9da98d1fb5 100644 --- a/llvm/utils/gn/secondary/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/BUILD.gn +++ b/llvm/utils/gn/secondary/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/BUILD.gn @@ -1,3 +1,14 @@ +import("//lldb/utils/TableGen/lldb_tablegen.gni") + +lldb_tablegen("DynamicLoaderDarwinProperties") { + args = [ "-gen-lldb-property-defs" ] +} + +lldb_tablegen("DynamicLoaderDarwinPropertiesEnum") { + args = [ "-gen-lldb-property-enum-defs" ] + td_file = "DynamicLoaderDarwinProperties.td" +} + static_library("MacOSX-DYLD") { output_name = "lldbPluginDynamicLoaderMacOSXDYLD" configs += [ @@ -5,6 +16,8 @@ static_library("MacOSX-DYLD") { "//llvm/utils/gn/build:lldb_code", ] deps = [ + ":DynamicLoaderDarwinProperties", + ":DynamicLoaderDarwinPropertiesEnum", "//lldb/source/Breakpoint", "//lldb/source/Core", "//lldb/source/Expression", @@ -21,6 +34,7 @@ static_library("MacOSX-DYLD") { include_dirs = [ "//lldb/source" ] sources = [ "DynamicLoaderDarwin.cpp", + "DynamicLoaderDarwinProperties.cpp", "DynamicLoaderMacOS.cpp", "DynamicLoaderMacOSXDYLD.cpp", ] diff --git a/llvm/utils/gn/secondary/llvm/lib/Transforms/Vectorize/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Transforms/Vectorize/BUILD.gn index 5146c9a0f61dddd3b665bc69938abaca14807db1..85dfd7738c17030e3b58c47ff0bc70a3520ae93a 100644 --- a/llvm/utils/gn/secondary/llvm/lib/Transforms/Vectorize/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/Transforms/Vectorize/BUILD.gn @@ -16,7 +16,9 @@ static_library("Vectorize") { "SandboxVectorizer/DependencyGraph.cpp", "SandboxVectorizer/Interval.cpp", "SandboxVectorizer/Passes/BottomUpVec.cpp", + "SandboxVectorizer/Passes/RegionsFromMetadata.cpp", "SandboxVectorizer/SandboxVectorizer.cpp", + "SandboxVectorizer/SandboxVectorizerPassBuilder.cpp", 
"SandboxVectorizer/SeedCollector.cpp", "VPlan.cpp", "VPlanAnalysis.cpp", diff --git a/llvm/utils/lit/lit/reports.py b/llvm/utils/lit/lit/reports.py index 2ac44b0c0ce86e81e48be9b2ef7b60cfe3932a70..d2d719b076bc706d7d78a43740ede3e5c9ea7f7b 100755 --- a/llvm/utils/lit/lit/reports.py +++ b/llvm/utils/lit/lit/reports.py @@ -105,12 +105,20 @@ class XunitReport(object): file.write("\n") def _write_testsuite(self, file, suite, tests): - skipped = sum(1 for t in tests if t.result.code in self.skipped_codes) - failures = sum(1 for t in tests if t.isFailure()) + skipped = 0 + failures = 0 + time = 0.0 + + for t in tests: + if t.result.code in self.skipped_codes: + skipped += 1 + if t.isFailure(): + failures += 1 + time += t.result.elapsed name = suite.config.name.replace(".", "-") file.write( - f'\n' + f'\n' ) for test in tests: self._write_test(file, test, name) diff --git a/llvm/utils/lit/tests/Inputs/shtest-shell/.gitattributes b/llvm/utils/lit/tests/Inputs/shtest-shell/.gitattributes new file mode 100644 index 0000000000000000000000000000000000000000..2df17345df5b87b8e9a00bbd055f10ced29d46c8 --- /dev/null +++ b/llvm/utils/lit/tests/Inputs/shtest-shell/.gitattributes @@ -0,0 +1 @@ +*.dos text eol=crlf diff --git a/llvm/utils/lit/tests/Inputs/shtest-shell/diff-in.dos b/llvm/utils/lit/tests/Inputs/shtest-shell/diff-in.dos index 7a0560654c5c70edafef9ccab9a628527be48514..0f25621c787ed3d56687caf1f0ff705133c8fafd 100644 --- a/llvm/utils/lit/tests/Inputs/shtest-shell/diff-in.dos +++ b/llvm/utils/lit/tests/Inputs/shtest-shell/diff-in.dos @@ -1,3 +1,3 @@ -In this file, the -sequence "\r\n" -terminates lines. +In this file, the +sequence "\r\n" +terminates lines. diff --git a/llvm/utils/lit/tests/shtest-format.py b/llvm/utils/lit/tests/shtest-format.py index 4a3d65b7bce4fd5323c62ffdb0d2e182f3a916ff..3a1959549e5d016956c551218a3f8543591a010f 100644 --- a/llvm/utils/lit/tests/shtest-format.py +++ b/llvm/utils/lit/tests/shtest-format.py @@ -107,7 +107,7 @@ # XUNIT: # XUNIT-NEXT: -# XUNIT-NEXT: +# XUNIT-NEXT: # XUNIT: # XUNIT-NEXT: diff --git a/llvm/utils/lit/tests/xunit-output.py b/llvm/utils/lit/tests/xunit-output.py index 67d99849fe36d90314bbc911fc7932f54689f8b8..392cded4653fe16474cf92a3da994d6eb40a77c6 100644 --- a/llvm/utils/lit/tests/xunit-output.py +++ b/llvm/utils/lit/tests/xunit-output.py @@ -9,7 +9,7 @@ # CHECK: # CHECK-NEXT: -# CHECK-NEXT: +# CHECK-NEXT: # CHECK-NEXT: # CHECK-NEXT: ]]]]> &"]]> # CHECK-NEXT: diff --git a/llvm/utils/release/build_llvm_release.bat b/llvm/utils/release/build_llvm_release.bat index dd041d7d384ec43ab078cdf913885ee3d8431d10..3718673ae7a28d857d8851ac6fb4aa131c522f94 100755 --- a/llvm/utils/release/build_llvm_release.bat +++ b/llvm/utils/release/build_llvm_release.bat @@ -1,515 +1,515 @@ -@echo off -setlocal enabledelayedexpansion - -goto begin - -:usage -echo Script for building the LLVM installer on Windows, -echo used for the releases at https://github.com/llvm/llvm-project/releases -echo. -echo Usage: build_llvm_release.bat --version ^ [--x86,--x64, --arm64] [--skip-checkout] [--local-python] -echo. -echo Options: -echo --version: [required] version to build -echo --help: display this help -echo --x86: build and test x86 variant -echo --x64: build and test x64 variant -echo --arm64: build and test arm64 variant -echo --skip-checkout: use local git checkout instead of downloading src.zip -echo --local-python: use installed Python and does not try to use a specific version (3.10) -echo. -echo Note: At least one variant to build is required. -echo. 
-echo Example: build_llvm_release.bat --version 15.0.0 --x86 --x64 -exit /b 1 - -:begin - -::============================================================================== -:: parse args -set version= -set help= -set x86= -set x64= -set arm64= -set skip-checkout= -set local-python= -call :parse_args %* - -if "%help%" NEQ "" goto usage - -if "%version%" == "" ( - echo --version option is required - echo ============================= - goto usage -) - -if "%arm64%" == "" if "%x64%" == "" if "%x86%" == "" ( - echo nothing to build! - echo choose one or several variants from: --x86 --x64 --arm64 - exit /b 1 -) - -::============================================================================== -:: check prerequisites -REM Note: -REM 7zip versions 21.x and higher will try to extract the symlinks in -REM llvm's git archive, which requires running as administrator. - -REM Check 7-zip version and/or administrator permissions. -for /f "delims=" %%i in ('7z.exe ^| findstr /r "2[1-9].[0-9][0-9]"') do set version_7z=%%i -if not "%version_7z%"=="" ( - REM Unique temporary filename to use by the 'mklink' command. - set "link_name=%temp%\%username%_%random%_%random%.tmp" - - REM As the 'mklink' requires elevated permissions, the symbolic link - REM creation will fail if the script is not running as administrator. - mklink /d "!link_name!" . 1>nul 2>nul - if errorlevel 1 ( - echo. - echo Script requires administrator permissions, or a 7-zip version 20.x or older. - echo Current version is "%version_7z%" - exit /b 1 - ) else ( - REM Remove the temporary symbolic link. - rd "!link_name!" - ) -) - -REM Prerequisites: -REM -REM Visual Studio 2019, CMake, Ninja, GNUWin32, SWIG, Python 3, -REM NSIS with the strlen_8192 patch, -REM Perl (for the OpenMP run-time). -REM -REM -REM For LLDB, SWIG version 4.1.1 should be used. -REM - -:: Detect Visual Studio -set vsinstall= -set vswhere=%ProgramFiles(x86)%\Microsoft Visual Studio\Installer\vswhere.exe - -if "%VSINSTALLDIR%" NEQ "" ( - echo using enabled Visual Studio installation - set "vsinstall=%VSINSTALLDIR%" -) else ( - echo using vswhere to detect Visual Studio installation - FOR /F "delims=" %%r IN ('^""%vswhere%" -nologo -latest -products "*" -all -property installationPath^"') DO set vsinstall=%%r -) -set "vsdevcmd=%vsinstall%\Common7\Tools\VsDevCmd.bat" - -if not exist "%vsdevcmd%" ( - echo Can't find any installation of Visual Studio - exit /b 1 -) -echo Using VS devcmd: %vsdevcmd% - -::============================================================================== -:: start echoing what we do -@echo on - -set python32_dir=C:\Users\%USERNAME%\AppData\Local\Programs\Python\Python310-32 -set python64_dir=C:\Users\%USERNAME%\AppData\Local\Programs\Python\Python310 -set pythonarm64_dir=C:\Users\%USERNAME%\AppData\Local\Programs\Python\Python311-arm64 - -set revision=llvmorg-%version% -set package_version=%version% -set build_dir=%cd%\llvm_package_%package_version% - -echo Revision: %revision% -echo Package version: %package_version% -echo Build dir: %build_dir% -echo. - -if exist %build_dir% ( - echo Build directory already exists: %build_dir% - exit /b 1 -) -mkdir %build_dir% -cd %build_dir% || exit /b 1 - -if "%skip-checkout%" == "true" ( - echo Using local source - set llvm_src=%~dp0..\..\.. 
-) else ( - echo Checking out %revision% - curl -L https://github.com/llvm/llvm-project/archive/%revision%.zip -o src.zip || exit /b 1 - 7z x src.zip || exit /b 1 - mv llvm-project-* llvm-project || exit /b 1 - set llvm_src=%build_dir%\llvm-project -) - -curl -O https://gitlab.gnome.org/GNOME/libxml2/-/archive/v2.9.12/libxml2-v2.9.12.tar.gz || exit /b 1 -tar zxf libxml2-v2.9.12.tar.gz - -REM Setting CMAKE_CL_SHOWINCLUDES_PREFIX to work around PR27226. -REM Common flags for all builds. -set common_compiler_flags=-DLIBXML_STATIC -set common_cmake_flags=^ - -DCMAKE_BUILD_TYPE=Release ^ - -DLLVM_ENABLE_ASSERTIONS=OFF ^ - -DLLVM_INSTALL_TOOLCHAIN_ONLY=ON ^ - -DLLVM_TARGETS_TO_BUILD="AArch64;ARM;X86" ^ - -DLLVM_BUILD_LLVM_C_DYLIB=ON ^ - -DCMAKE_INSTALL_UCRT_LIBRARIES=ON ^ - -DPython3_FIND_REGISTRY=NEVER ^ - -DPACKAGE_VERSION=%package_version% ^ - -DLLDB_RELOCATABLE_PYTHON=1 ^ - -DLLDB_EMBED_PYTHON_HOME=OFF ^ - -DCMAKE_CL_SHOWINCLUDES_PREFIX="Note: including file: " ^ - -DLLVM_ENABLE_LIBXML2=FORCE_ON ^ - -DLLDB_ENABLE_LIBXML2=OFF ^ - -DCLANG_ENABLE_LIBXML2=OFF ^ - -DCMAKE_C_FLAGS="%common_compiler_flags%" ^ - -DCMAKE_CXX_FLAGS="%common_compiler_flags%" ^ - -DLLVM_ENABLE_RPMALLOC=ON ^ - -DLLVM_ENABLE_PROJECTS="clang;clang-tools-extra;lld;compiler-rt;lldb;openmp" - -set cmake_profile_flags="" - -REM Preserve original path -set OLDPATH=%PATH% - -REM Build the 32-bits and/or 64-bits binaries. -if "%x86%" == "true" call :do_build_32 || exit /b 1 -if "%x64%" == "true" call :do_build_64 || exit /b 1 -if "%arm64%" == "true" call :do_build_arm64 || exit /b 1 -exit /b 0 - -::============================================================================== -:: Build 32-bits binaries. -::============================================================================== -:do_build_32 -call :set_environment %python32_dir% || exit /b 1 -call "%vsdevcmd%" -arch=x86 || exit /b 1 -@echo on -mkdir build32_stage0 -cd build32_stage0 -call :do_build_libxml || exit /b 1 - -REM Stage0 binaries directory; used in stage1. -set "stage0_bin_dir=%build_dir%/build32_stage0/bin" -set cmake_flags=^ - %common_cmake_flags% ^ - -DLLVM_ENABLE_RPMALLOC=OFF ^ - -DLLDB_TEST_COMPILER=%stage0_bin_dir%/clang.exe ^ - -DPYTHON_HOME=%PYTHONHOME% ^ - -DPython3_ROOT_DIR=%PYTHONHOME% ^ - -DLIBXML2_INCLUDE_DIR=%libxmldir%/include/libxml2 ^ - -DLIBXML2_LIBRARIES=%libxmldir%/lib/libxml2s.lib - -cmake -GNinja %cmake_flags% %llvm_src%\llvm || exit /b 1 -ninja || ninja || ninja || exit /b 1 -REM ninja check-llvm || ninja check-llvm || ninja check-llvm || exit /b 1 -REM ninja check-clang || ninja check-clang || ninja check-clang || exit /b 1 -ninja check-lld || ninja check-lld || ninja check-lld || exit /b 1 -ninja check-sanitizer || ninja check-sanitizer || ninja check-sanitizer || exit /b 1 -REM ninja check-clang-tools || ninja check-clang-tools || ninja check-clang-tools || exit /b 1 -cd.. - -REM CMake expects the paths that specifies the compiler and linker to be -REM with forward slash. 
-set all_cmake_flags=^ - %cmake_flags% ^ - -DCMAKE_C_COMPILER=%stage0_bin_dir%/clang-cl.exe ^ - -DCMAKE_CXX_COMPILER=%stage0_bin_dir%/clang-cl.exe ^ - -DCMAKE_LINKER=%stage0_bin_dir%/lld-link.exe ^ - -DCMAKE_AR=%stage0_bin_dir%/llvm-lib.exe ^ - -DCMAKE_RC=%stage0_bin_dir%/llvm-windres.exe -set cmake_flags=%all_cmake_flags:\=/% - -mkdir build32 -cd build32 -cmake -GNinja %cmake_flags% %llvm_src%\llvm || exit /b 1 -ninja || ninja || ninja || exit /b 1 -REM ninja check-llvm || ninja check-llvm || ninja check-llvm || exit /b 1 -REM ninja check-clang || ninja check-clang || ninja check-clang || exit /b 1 -ninja check-lld || ninja check-lld || ninja check-lld || exit /b 1 -ninja check-sanitizer || ninja check-sanitizer || ninja check-sanitizer || exit /b 1 -REM ninja check-clang-tools || ninja check-clang-tools || ninja check-clang-tools || exit /b 1 -ninja package || exit /b 1 -cd .. - -exit /b 0 -::============================================================================== - -::============================================================================== -:: Build 64-bits binaries. -::============================================================================== -:do_build_64 -call :set_environment %python64_dir% || exit /b 1 -call "%vsdevcmd%" -arch=amd64 || exit /b 1 -@echo on -mkdir build64_stage0 -cd build64_stage0 -call :do_build_libxml || exit /b 1 - -REM Stage0 binaries directory; used in stage1. -set "stage0_bin_dir=%build_dir%/build64_stage0/bin" -set cmake_flags=^ - %common_cmake_flags% ^ - -DLLDB_TEST_COMPILER=%stage0_bin_dir%/clang.exe ^ - -DPYTHON_HOME=%PYTHONHOME% ^ - -DPython3_ROOT_DIR=%PYTHONHOME% ^ - -DLIBXML2_INCLUDE_DIR=%libxmldir%/include/libxml2 ^ - -DLIBXML2_LIBRARIES=%libxmldir%/lib/libxml2s.lib - -cmake -GNinja %cmake_flags% %llvm_src%\llvm || exit /b 1 -ninja || ninja || ninja || exit /b 1 -ninja check-llvm || ninja check-llvm || ninja check-llvm || exit /b 1 -ninja check-clang || ninja check-clang || ninja check-clang || exit /b 1 -ninja check-lld || ninja check-lld || ninja check-lld || exit /b 1 -ninja check-sanitizer || ninja check-sanitizer || ninja check-sanitizer || exit /b 1 -ninja check-clang-tools || ninja check-clang-tools || ninja check-clang-tools || exit /b 1 -ninja check-clangd || ninja check-clangd || ninja check-clangd || exit /b 1 -cd.. - -REM CMake expects the paths that specifies the compiler and linker to be -REM with forward slash. 
-set all_cmake_flags=^
-  %cmake_flags% ^
-  -DCMAKE_C_COMPILER=%stage0_bin_dir%/clang-cl.exe ^
-  -DCMAKE_CXX_COMPILER=%stage0_bin_dir%/clang-cl.exe ^
-  -DCMAKE_LINKER=%stage0_bin_dir%/lld-link.exe ^
-  -DCMAKE_AR=%stage0_bin_dir%/llvm-lib.exe ^
-  -DCMAKE_RC=%stage0_bin_dir%/llvm-windres.exe
-set cmake_flags=%all_cmake_flags:\=/%
-
-
-mkdir build64
-cd build64
-call :do_generate_profile || exit /b 1
-cmake -GNinja %cmake_flags% %cmake_profile_flags% %llvm_src%\llvm || exit /b 1
-ninja || ninja || ninja || exit /b 1
-ninja check-llvm || ninja check-llvm || ninja check-llvm || exit /b 1
-ninja check-clang || ninja check-clang || ninja check-clang || exit /b 1
-ninja check-lld || ninja check-lld || ninja check-lld || exit /b 1
-ninja check-sanitizer || ninja check-sanitizer || ninja check-sanitizer || exit /b 1
-ninja check-clang-tools || ninja check-clang-tools || ninja check-clang-tools || exit /b 1
-ninja check-clangd || ninja check-clangd || ninja check-clangd || exit /b 1
-ninja package || exit /b 1
-
-:: generate tarball with install toolchain only off
-set filename=clang+llvm-%version%-x86_64-pc-windows-msvc
-cmake -GNinja %cmake_flags% %cmake_profile_flags% -DLLVM_INSTALL_TOOLCHAIN_ONLY=OFF ^
-  -DCMAKE_INSTALL_PREFIX=%build_dir%/%filename% ..\llvm-project\llvm || exit /b 1
-ninja install || exit /b 1
-:: check llvm_config is present & returns something
-%build_dir%/%filename%/bin/llvm-config.exe --bindir || exit /b 1
-cd ..
-7z a -ttar -so %filename%.tar %filename% | 7z a -txz -si %filename%.tar.xz
-
-exit /b 0
-::==============================================================================
-
-::==============================================================================
-:: Build arm64 binaries.
-::==============================================================================
-:do_build_arm64
-call :set_environment %pythonarm64_dir% || exit /b 1
-call "%vsdevcmd%" -host_arch=x64 -arch=arm64 || exit /b 1
-@echo on
-mkdir build_arm64_stage0
-cd build_arm64_stage0
-call :do_build_libxml || exit /b 1
-
-REM Stage0 binaries directory; used in stage1.
-set "stage0_bin_dir=%build_dir%/build_arm64_stage0/bin"
-set cmake_flags=^
-  %common_cmake_flags% ^
-  -DCLANG_DEFAULT_LINKER=lld ^
-  -DLIBXML2_INCLUDE_DIR=%libxmldir%/include/libxml2 ^
-  -DLIBXML2_LIBRARIES=%libxmldir%/lib/libxml2s.lib ^
-  -DPython3_ROOT_DIR=%PYTHONHOME% ^
-  -DCOMPILER_RT_BUILD_PROFILE=OFF ^
-  -DCOMPILER_RT_BUILD_SANITIZERS=OFF
-
-REM We need to build stage0 compiler-rt with clang-cl (msvc lacks some builtins).
-cmake -GNinja %cmake_flags% ^
-  -DCMAKE_C_COMPILER=clang-cl.exe ^
-  -DCMAKE_CXX_COMPILER=clang-cl.exe ^
-  %llvm_src%\llvm || exit /b 1
-ninja || exit /b 1
-::ninja check-llvm || exit /b 1
-::ninja check-clang || exit /b 1
-::ninja check-lld || exit /b 1
-::ninja check-sanitizer || exit /b 1
-::ninja check-clang-tools || exit /b 1
-::ninja check-clangd || exit /b 1
-cd..
-
-REM CMake expects the paths that specifies the compiler and linker to be
-REM with forward slash.
-REM CPACK_SYSTEM_NAME is set to have a correct name for installer generated.
-set all_cmake_flags=^
-  %cmake_flags% ^
-  -DCMAKE_C_COMPILER=%stage0_bin_dir%/clang-cl.exe ^
-  -DCMAKE_CXX_COMPILER=%stage0_bin_dir%/clang-cl.exe ^
-  -DCMAKE_LINKER=%stage0_bin_dir%/lld-link.exe ^
-  -DCMAKE_AR=%stage0_bin_dir%/llvm-lib.exe ^
-  -DCMAKE_RC=%stage0_bin_dir%/llvm-windres.exe ^
-  -DCPACK_SYSTEM_NAME=woa64
-set cmake_flags=%all_cmake_flags:\=/%
-
-mkdir build_arm64
-cd build_arm64
-cmake -GNinja %cmake_flags% %llvm_src%\llvm || exit /b 1
-ninja || exit /b 1
-REM Check but do not fail on errors.
-ninja check-lldb
-::ninja check-llvm || exit /b 1
-::ninja check-clang || exit /b 1
-::ninja check-lld || exit /b 1
-::ninja check-sanitizer || exit /b 1
-::ninja check-clang-tools || exit /b 1
-::ninja check-clangd || exit /b 1
-ninja package || exit /b 1
-cd ..
-
-exit /b 0
-::==============================================================================
-::
-::==============================================================================
-:: Set PATH and some environment variables.
-::==============================================================================
-:set_environment
-REM Restore original path
-set PATH=%OLDPATH%
-
-set python_dir=%1
-
-REM Set Python environment
-if "%local-python%" == "true" (
-  FOR /F "delims=" %%i IN ('where python.exe ^| head -1') DO set python_exe=%%i
-  set PYTHONHOME=!python_exe:~0,-11!
-) else (
-  %python_dir%/python.exe --version || exit /b 1
-  set PYTHONHOME=%python_dir%
-)
-set PATH=%PYTHONHOME%;%PATH%
-
-set "VSCMD_START_DIR=%build_dir%"
-
-exit /b 0
-
-::=============================================================================
-
-::==============================================================================
-:: Build libxml.
-::==============================================================================
-:do_build_libxml
-mkdir libxmlbuild
-cd libxmlbuild
-cmake -GNinja -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install ^
-  -DBUILD_SHARED_LIBS=OFF -DLIBXML2_WITH_C14N=OFF -DLIBXML2_WITH_CATALOG=OFF ^
-  -DLIBXML2_WITH_DEBUG=OFF -DLIBXML2_WITH_DOCB=OFF -DLIBXML2_WITH_FTP=OFF ^
-  -DLIBXML2_WITH_HTML=OFF -DLIBXML2_WITH_HTTP=OFF -DLIBXML2_WITH_ICONV=OFF ^
-  -DLIBXML2_WITH_ICU=OFF -DLIBXML2_WITH_ISO8859X=OFF -DLIBXML2_WITH_LEGACY=OFF ^
-  -DLIBXML2_WITH_LZMA=OFF -DLIBXML2_WITH_MEM_DEBUG=OFF -DLIBXML2_WITH_MODULES=OFF ^
-  -DLIBXML2_WITH_OUTPUT=ON -DLIBXML2_WITH_PATTERN=OFF -DLIBXML2_WITH_PROGRAMS=OFF ^
-  -DLIBXML2_WITH_PUSH=OFF -DLIBXML2_WITH_PYTHON=OFF -DLIBXML2_WITH_READER=OFF ^
-  -DLIBXML2_WITH_REGEXPS=OFF -DLIBXML2_WITH_RUN_DEBUG=OFF -DLIBXML2_WITH_SAX1=OFF ^
-  -DLIBXML2_WITH_SCHEMAS=OFF -DLIBXML2_WITH_SCHEMATRON=OFF -DLIBXML2_WITH_TESTS=OFF ^
-  -DLIBXML2_WITH_THREADS=ON -DLIBXML2_WITH_THREAD_ALLOC=OFF -DLIBXML2_WITH_TREE=ON ^
-  -DLIBXML2_WITH_VALID=OFF -DLIBXML2_WITH_WRITER=OFF -DLIBXML2_WITH_XINCLUDE=OFF ^
-  -DLIBXML2_WITH_XPATH=OFF -DLIBXML2_WITH_XPTR=OFF -DLIBXML2_WITH_ZLIB=OFF ^
-  -DCMAKE_MSVC_RUNTIME_LIBRARY=MultiThreaded ^
-  ../../libxml2-v2.9.12 || exit /b 1
-ninja install || exit /b 1
-set libxmldir=%cd%\install
-set "libxmldir=%libxmldir:\=/%"
-cd ..
-exit /b 0
-
-::==============================================================================
-:: Generate a PGO profile.
-::==============================================================================
-:do_generate_profile
-REM Build Clang with instrumentation.
-mkdir instrument
-cd instrument
-cmake -GNinja %cmake_flags% -DLLVM_TARGETS_TO_BUILD=Native ^
-  -DLLVM_BUILD_INSTRUMENTED=IR %llvm_src%\llvm || exit /b 1
-ninja clang || ninja clang || ninja clang || exit /b 1
-set instrumented_clang=%cd:\=/%/bin/clang-cl.exe
-cd ..
-REM Use that to build part of llvm to generate a profile.
-mkdir train
-cd train
-cmake -GNinja %cmake_flags% ^
-  -DCMAKE_C_COMPILER=%instrumented_clang% ^
-  -DCMAKE_CXX_COMPILER=%instrumented_clang% ^
-  -DLLVM_ENABLE_PROJECTS=clang ^
-  -DLLVM_TARGETS_TO_BUILD=Native ^
-  %llvm_src%\llvm || exit /b 1
-REM Drop profiles generated from running cmake; those are not representative.
-del ..\instrument\profiles\*.profraw
-ninja tools/clang/lib/Sema/CMakeFiles/obj.clangSema.dir/Sema.cpp.obj
-cd ..
-set profile=%cd:\=/%/profile.profdata
-%stage0_bin_dir%\llvm-profdata merge -output=%profile% instrument\profiles\*.profraw || exit /b 1
-set common_compiler_flags=%common_compiler_flags% -Wno-backend-plugin
-set cmake_profile_flags=-DLLVM_PROFDATA_FILE=%profile% ^
-  -DCMAKE_C_FLAGS="%common_compiler_flags%" ^
-  -DCMAKE_CXX_FLAGS="%common_compiler_flags%"
-exit /b 0
-
-::=============================================================================
-:: Parse command line arguments.
-:: The format for the arguments is:
-::   Boolean: --option
-::   Value:   --option<separator>value
-::     with <separator> being: space, colon, semicolon or equal sign
-::
-:: Command line usage example:
-::   my-batch-file.bat --build --type=release --version 123
-:: It will create 3 variables:
-::   'build' with the value 'true'
-::   'type' with the value 'release'
-::   'version' with the value '123'
-::
-:: Usage:
-::   set "build="
-::   set "type="
-::   set "version="
-::
-::   REM Parse arguments.
-::   call :parse_args %*
-::
-::   if defined build (
-::     ...
-::   )
-::   if %type%=='release' (
-::     ...
-::   )
-::   if %version%=='123' (
-::     ...
-::   )
-::=============================================================================
-:parse_args
-  set "arg_name="
-  :parse_args_start
-  if "%1" == "" (
-    :: Set a seen boolean argument.
-    if "%arg_name%" neq "" (
-      set "%arg_name%=true"
-    )
-    goto :parse_args_done
-  )
-  set aux=%1
-  if "%aux:~0,2%" == "--" (
-    :: Set a seen boolean argument.
-    if "%arg_name%" neq "" (
-      set "%arg_name%=true"
-    )
-    set "arg_name=%aux:~2,250%"
-  ) else (
-    set "%arg_name%=%1"
-    set "arg_name="
-  )
-  shift
-  goto :parse_args_start
-
-:parse_args_done
-exit /b 0
+@echo off
+setlocal enabledelayedexpansion
+
+goto begin
+
+:usage
+echo Script for building the LLVM installer on Windows,
+echo used for the releases at https://github.com/llvm/llvm-project/releases
+echo.
+echo Usage: build_llvm_release.bat --version ^<version^> [--x86,--x64, --arm64] [--skip-checkout] [--local-python]
+echo.
+echo Options:
+echo  --version: [required] version to build
+echo  --help: display this help
+echo  --x86: build and test x86 variant
+echo  --x64: build and test x64 variant
+echo  --arm64: build and test arm64 variant
+echo  --skip-checkout: use local git checkout instead of downloading src.zip
+echo  --local-python: use installed Python and does not try to use a specific version (3.10)
+echo.
+echo Note: At least one variant to build is required.
+echo.
+echo Example: build_llvm_release.bat --version 15.0.0 --x86 --x64
+exit /b 1
+
+:begin
+
+::==============================================================================
+:: parse args
+set version=
+set help=
+set x86=
+set x64=
+set arm64=
+set skip-checkout=
+set local-python=
+call :parse_args %*
+
+if "%help%" NEQ "" goto usage
+
+if "%version%" == "" (
+  echo --version option is required
+  echo =============================
+  goto usage
+)
+
+if "%arm64%" == "" if "%x64%" == "" if "%x86%" == "" (
+  echo nothing to build!
+  echo choose one or several variants from: --x86 --x64 --arm64
+  exit /b 1
+)
+
+::==============================================================================
+:: check prerequisites
+REM Note:
+REM   7zip versions 21.x and higher will try to extract the symlinks in
+REM   llvm's git archive, which requires running as administrator.
+
+REM Check 7-zip version and/or administrator permissions.
+for /f "delims=" %%i in ('7z.exe ^| findstr /r "2[1-9].[0-9][0-9]"') do set version_7z=%%i
+if not "%version_7z%"=="" (
+  REM Unique temporary filename to use by the 'mklink' command.
+  set "link_name=%temp%\%username%_%random%_%random%.tmp"
+
+  REM As the 'mklink' requires elevated permissions, the symbolic link
+  REM creation will fail if the script is not running as administrator.
+  mklink /d "!link_name!" . 1>nul 2>nul
+  if errorlevel 1 (
+    echo.
+    echo Script requires administrator permissions, or a 7-zip version 20.x or older.
+    echo Current version is "%version_7z%"
+    exit /b 1
+  ) else (
+    REM Remove the temporary symbolic link.
+    rd "!link_name!"
+  )
+)
+
+REM Prerequisites:
+REM
+REM   Visual Studio 2019, CMake, Ninja, GNUWin32, SWIG, Python 3,
+REM   NSIS with the strlen_8192 patch,
+REM   Perl (for the OpenMP run-time).
+REM
+REM
+REM   For LLDB, SWIG version 4.1.1 should be used.
+REM
+
+:: Detect Visual Studio
+set vsinstall=
+set vswhere=%ProgramFiles(x86)%\Microsoft Visual Studio\Installer\vswhere.exe
+
+if "%VSINSTALLDIR%" NEQ "" (
+  echo using enabled Visual Studio installation
+  set "vsinstall=%VSINSTALLDIR%"
+) else (
+  echo using vswhere to detect Visual Studio installation
+  FOR /F "delims=" %%r IN ('^""%vswhere%" -nologo -latest -products "*" -all -property installationPath^"') DO set vsinstall=%%r
+)
+set "vsdevcmd=%vsinstall%\Common7\Tools\VsDevCmd.bat"
+
+if not exist "%vsdevcmd%" (
+  echo Can't find any installation of Visual Studio
+  exit /b 1
+)
+echo Using VS devcmd: %vsdevcmd%
+
+::==============================================================================
+:: start echoing what we do
+@echo on
+
+set python32_dir=C:\Users\%USERNAME%\AppData\Local\Programs\Python\Python310-32
+set python64_dir=C:\Users\%USERNAME%\AppData\Local\Programs\Python\Python310
+set pythonarm64_dir=C:\Users\%USERNAME%\AppData\Local\Programs\Python\Python311-arm64
+
+set revision=llvmorg-%version%
+set package_version=%version%
+set build_dir=%cd%\llvm_package_%package_version%
+
+echo Revision: %revision%
+echo Package version: %package_version%
+echo Build dir: %build_dir%
+echo.
+
+if exist %build_dir% (
+  echo Build directory already exists: %build_dir%
+  exit /b 1
+)
+mkdir %build_dir%
+cd %build_dir% || exit /b 1
+
+if "%skip-checkout%" == "true" (
+  echo Using local source
+  set llvm_src=%~dp0..\..\..
+) else (
+  echo Checking out %revision%
+  curl -L https://github.com/llvm/llvm-project/archive/%revision%.zip -o src.zip || exit /b 1
+  7z x src.zip || exit /b 1
+  mv llvm-project-* llvm-project || exit /b 1
+  set llvm_src=%build_dir%\llvm-project
+)
+
+curl -O https://gitlab.gnome.org/GNOME/libxml2/-/archive/v2.9.12/libxml2-v2.9.12.tar.gz || exit /b 1
+tar zxf libxml2-v2.9.12.tar.gz
+
+REM Setting CMAKE_CL_SHOWINCLUDES_PREFIX to work around PR27226.
+REM Common flags for all builds.
+set common_compiler_flags=-DLIBXML_STATIC
+set common_cmake_flags=^
+  -DCMAKE_BUILD_TYPE=Release ^
+  -DLLVM_ENABLE_ASSERTIONS=OFF ^
+  -DLLVM_INSTALL_TOOLCHAIN_ONLY=ON ^
+  -DLLVM_TARGETS_TO_BUILD="AArch64;ARM;X86" ^
+  -DLLVM_BUILD_LLVM_C_DYLIB=ON ^
+  -DCMAKE_INSTALL_UCRT_LIBRARIES=ON ^
+  -DPython3_FIND_REGISTRY=NEVER ^
+  -DPACKAGE_VERSION=%package_version% ^
+  -DLLDB_RELOCATABLE_PYTHON=1 ^
+  -DLLDB_EMBED_PYTHON_HOME=OFF ^
+  -DCMAKE_CL_SHOWINCLUDES_PREFIX="Note: including file: " ^
+  -DLLVM_ENABLE_LIBXML2=FORCE_ON ^
+  -DLLDB_ENABLE_LIBXML2=OFF ^
+  -DCLANG_ENABLE_LIBXML2=OFF ^
+  -DCMAKE_C_FLAGS="%common_compiler_flags%" ^
+  -DCMAKE_CXX_FLAGS="%common_compiler_flags%" ^
+  -DLLVM_ENABLE_RPMALLOC=ON ^
+  -DLLVM_ENABLE_PROJECTS="clang;clang-tools-extra;lld;compiler-rt;lldb;openmp"
+
+set cmake_profile_flags=""
+
+REM Preserve original path
+set OLDPATH=%PATH%
+
+REM Build the 32-bits and/or 64-bits binaries.
+if "%x86%" == "true" call :do_build_32 || exit /b 1
+if "%x64%" == "true" call :do_build_64 || exit /b 1
+if "%arm64%" == "true" call :do_build_arm64 || exit /b 1
+exit /b 0
+
+::==============================================================================
+:: Build 32-bits binaries.
+::==============================================================================
+:do_build_32
+call :set_environment %python32_dir% || exit /b 1
+call "%vsdevcmd%" -arch=x86 || exit /b 1
+@echo on
+mkdir build32_stage0
+cd build32_stage0
+call :do_build_libxml || exit /b 1
+
+REM Stage0 binaries directory; used in stage1.
+set "stage0_bin_dir=%build_dir%/build32_stage0/bin"
+set cmake_flags=^
+  %common_cmake_flags% ^
+  -DLLVM_ENABLE_RPMALLOC=OFF ^
+  -DLLDB_TEST_COMPILER=%stage0_bin_dir%/clang.exe ^
+  -DPYTHON_HOME=%PYTHONHOME% ^
+  -DPython3_ROOT_DIR=%PYTHONHOME% ^
+  -DLIBXML2_INCLUDE_DIR=%libxmldir%/include/libxml2 ^
+  -DLIBXML2_LIBRARIES=%libxmldir%/lib/libxml2s.lib
+
+cmake -GNinja %cmake_flags% %llvm_src%\llvm || exit /b 1
+ninja || ninja || ninja || exit /b 1
+REM ninja check-llvm || ninja check-llvm || ninja check-llvm || exit /b 1
+REM ninja check-clang || ninja check-clang || ninja check-clang || exit /b 1
+ninja check-lld || ninja check-lld || ninja check-lld || exit /b 1
+ninja check-sanitizer || ninja check-sanitizer || ninja check-sanitizer || exit /b 1
+REM ninja check-clang-tools || ninja check-clang-tools || ninja check-clang-tools || exit /b 1
+cd..
+
+REM CMake expects the paths that specify the compiler and linker to use
+REM forward slashes.
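REM ---------------------------------------------------------------------------
REM [Editor's note] A minimal sketch, not part of the patch, of the cmd.exe
REM substitution idiom the stage1 configuration below relies on: %var:\=/%
REM replaces every backslash in var with a forward slash. The variable name
REM example_path is illustrative only:
REM
REM   set "example_path=C:\llvm\build32_stage0\bin"
REM   set "example_path=%example_path:\=/%"
REM   echo %example_path%      -- prints C:/llvm/build32_stage0/bin
REM ---------------------------------------------------------------------------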
+set all_cmake_flags=^
+  %cmake_flags% ^
+  -DCMAKE_C_COMPILER=%stage0_bin_dir%/clang-cl.exe ^
+  -DCMAKE_CXX_COMPILER=%stage0_bin_dir%/clang-cl.exe ^
+  -DCMAKE_LINKER=%stage0_bin_dir%/lld-link.exe ^
+  -DCMAKE_AR=%stage0_bin_dir%/llvm-lib.exe ^
+  -DCMAKE_RC=%stage0_bin_dir%/llvm-windres.exe
+set cmake_flags=%all_cmake_flags:\=/%
+
+mkdir build32
+cd build32
+cmake -GNinja %cmake_flags% %llvm_src%\llvm || exit /b 1
+ninja || ninja || ninja || exit /b 1
+REM ninja check-llvm || ninja check-llvm || ninja check-llvm || exit /b 1
+REM ninja check-clang || ninja check-clang || ninja check-clang || exit /b 1
+ninja check-lld || ninja check-lld || ninja check-lld || exit /b 1
+ninja check-sanitizer || ninja check-sanitizer || ninja check-sanitizer || exit /b 1
+REM ninja check-clang-tools || ninja check-clang-tools || ninja check-clang-tools || exit /b 1
+ninja package || exit /b 1
+cd ..
+
+exit /b 0
+::==============================================================================
+
+::==============================================================================
+:: Build 64-bits binaries.
+::==============================================================================
+:do_build_64
+call :set_environment %python64_dir% || exit /b 1
+call "%vsdevcmd%" -arch=amd64 || exit /b 1
+@echo on
+mkdir build64_stage0
+cd build64_stage0
+call :do_build_libxml || exit /b 1
+
+REM Stage0 binaries directory; used in stage1.
+set "stage0_bin_dir=%build_dir%/build64_stage0/bin"
+set cmake_flags=^
+  %common_cmake_flags% ^
+  -DLLDB_TEST_COMPILER=%stage0_bin_dir%/clang.exe ^
+  -DPYTHON_HOME=%PYTHONHOME% ^
+  -DPython3_ROOT_DIR=%PYTHONHOME% ^
+  -DLIBXML2_INCLUDE_DIR=%libxmldir%/include/libxml2 ^
+  -DLIBXML2_LIBRARIES=%libxmldir%/lib/libxml2s.lib
+
+cmake -GNinja %cmake_flags% %llvm_src%\llvm || exit /b 1
+ninja || ninja || ninja || exit /b 1
+ninja check-llvm || ninja check-llvm || ninja check-llvm || exit /b 1
+ninja check-clang || ninja check-clang || ninja check-clang || exit /b 1
+ninja check-lld || ninja check-lld || ninja check-lld || exit /b 1
+ninja check-sanitizer || ninja check-sanitizer || ninja check-sanitizer || exit /b 1
+ninja check-clang-tools || ninja check-clang-tools || ninja check-clang-tools || exit /b 1
+ninja check-clangd || ninja check-clangd || ninja check-clangd || exit /b 1
+cd..
+
+REM CMake expects the paths that specify the compiler and linker to use
+REM forward slashes.
+set all_cmake_flags=^
+  %cmake_flags% ^
+  -DCMAKE_C_COMPILER=%stage0_bin_dir%/clang-cl.exe ^
+  -DCMAKE_CXX_COMPILER=%stage0_bin_dir%/clang-cl.exe ^
+  -DCMAKE_LINKER=%stage0_bin_dir%/lld-link.exe ^
+  -DCMAKE_AR=%stage0_bin_dir%/llvm-lib.exe ^
+  -DCMAKE_RC=%stage0_bin_dir%/llvm-windres.exe
+set cmake_flags=%all_cmake_flags:\=/%
+
+
+mkdir build64
+cd build64
+call :do_generate_profile || exit /b 1
+cmake -GNinja %cmake_flags% %cmake_profile_flags% %llvm_src%\llvm || exit /b 1
+ninja || ninja || ninja || exit /b 1
+ninja check-llvm || ninja check-llvm || ninja check-llvm || exit /b 1
+ninja check-clang || ninja check-clang || ninja check-clang || exit /b 1
+ninja check-lld || ninja check-lld || ninja check-lld || exit /b 1
+ninja check-sanitizer || ninja check-sanitizer || ninja check-sanitizer || exit /b 1
+ninja check-clang-tools || ninja check-clang-tools || ninja check-clang-tools || exit /b 1
+ninja check-clangd || ninja check-clangd || ninja check-clangd || exit /b 1
+ninja package || exit /b 1
+
+:: generate tarball with install toolchain only off
+set filename=clang+llvm-%version%-x86_64-pc-windows-msvc
+cmake -GNinja %cmake_flags% %cmake_profile_flags% -DLLVM_INSTALL_TOOLCHAIN_ONLY=OFF ^
+  -DCMAKE_INSTALL_PREFIX=%build_dir%/%filename% ..\llvm-project\llvm || exit /b 1
+ninja install || exit /b 1
+:: check llvm_config is present & returns something
+%build_dir%/%filename%/bin/llvm-config.exe --bindir || exit /b 1
+cd ..
+7z a -ttar -so %filename%.tar %filename% | 7z a -txz -si %filename%.tar.xz
+
+exit /b 0
+::==============================================================================
+
+::==============================================================================
+:: Build arm64 binaries.
+::==============================================================================
+:do_build_arm64
+call :set_environment %pythonarm64_dir% || exit /b 1
+call "%vsdevcmd%" -host_arch=x64 -arch=arm64 || exit /b 1
+@echo on
+mkdir build_arm64_stage0
+cd build_arm64_stage0
+call :do_build_libxml || exit /b 1
+
+REM Stage0 binaries directory; used in stage1.
+set "stage0_bin_dir=%build_dir%/build_arm64_stage0/bin"
+set cmake_flags=^
+  %common_cmake_flags% ^
+  -DCLANG_DEFAULT_LINKER=lld ^
+  -DLIBXML2_INCLUDE_DIR=%libxmldir%/include/libxml2 ^
+  -DLIBXML2_LIBRARIES=%libxmldir%/lib/libxml2s.lib ^
+  -DPython3_ROOT_DIR=%PYTHONHOME% ^
+  -DCOMPILER_RT_BUILD_PROFILE=OFF ^
+  -DCOMPILER_RT_BUILD_SANITIZERS=OFF
+
+REM We need to build stage0 compiler-rt with clang-cl (msvc lacks some builtins).
+cmake -GNinja %cmake_flags% ^
+  -DCMAKE_C_COMPILER=clang-cl.exe ^
+  -DCMAKE_CXX_COMPILER=clang-cl.exe ^
+  %llvm_src%\llvm || exit /b 1
+ninja || exit /b 1
+::ninja check-llvm || exit /b 1
+::ninja check-clang || exit /b 1
+::ninja check-lld || exit /b 1
+::ninja check-sanitizer || exit /b 1
+::ninja check-clang-tools || exit /b 1
+::ninja check-clangd || exit /b 1
+cd..
+
+REM CMake expects the paths that specify the compiler and linker to use
+REM forward slashes.
+REM CPACK_SYSTEM_NAME is set so the generated installer gets a correct name.
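REM ---------------------------------------------------------------------------
REM [Editor's note] For context, not part of the patch: by default CPack names
REM packages <package>-<version>-<CPACK_SYSTEM_NAME>, so passing
REM -DCPACK_SYSTEM_NAME=woa64 below yields an installer whose file name carries
REM a "woa64" suffix (e.g. LLVM-15.0.0-woa64.exe; exact name illustrative).
REM ---------------------------------------------------------------------------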
+set all_cmake_flags=^
+  %cmake_flags% ^
+  -DCMAKE_C_COMPILER=%stage0_bin_dir%/clang-cl.exe ^
+  -DCMAKE_CXX_COMPILER=%stage0_bin_dir%/clang-cl.exe ^
+  -DCMAKE_LINKER=%stage0_bin_dir%/lld-link.exe ^
+  -DCMAKE_AR=%stage0_bin_dir%/llvm-lib.exe ^
+  -DCMAKE_RC=%stage0_bin_dir%/llvm-windres.exe ^
+  -DCPACK_SYSTEM_NAME=woa64
+set cmake_flags=%all_cmake_flags:\=/%
+
+mkdir build_arm64
+cd build_arm64
+cmake -GNinja %cmake_flags% %llvm_src%\llvm || exit /b 1
+ninja || exit /b 1
+REM Check but do not fail on errors.
+ninja check-lldb
+::ninja check-llvm || exit /b 1
+::ninja check-clang || exit /b 1
+::ninja check-lld || exit /b 1
+::ninja check-sanitizer || exit /b 1
+::ninja check-clang-tools || exit /b 1
+::ninja check-clangd || exit /b 1
+ninja package || exit /b 1
+cd ..
+
+exit /b 0
+::==============================================================================
+::
+::==============================================================================
+:: Set PATH and some environment variables.
+::==============================================================================
+:set_environment
+REM Restore original path
+set PATH=%OLDPATH%
+
+set python_dir=%1
+
+REM Set Python environment
+if "%local-python%" == "true" (
+  FOR /F "delims=" %%i IN ('where python.exe ^| head -1') DO set python_exe=%%i
+  set PYTHONHOME=!python_exe:~0,-11!
+) else (
+  %python_dir%/python.exe --version || exit /b 1
+  set PYTHONHOME=%python_dir%
+)
+set PATH=%PYTHONHOME%;%PATH%
+
+set "VSCMD_START_DIR=%build_dir%"
+
+exit /b 0
+
+::=============================================================================
+
+::==============================================================================
+:: Build libxml.
+::==============================================================================
+:do_build_libxml
+mkdir libxmlbuild
+cd libxmlbuild
+cmake -GNinja -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install ^
+  -DBUILD_SHARED_LIBS=OFF -DLIBXML2_WITH_C14N=OFF -DLIBXML2_WITH_CATALOG=OFF ^
+  -DLIBXML2_WITH_DEBUG=OFF -DLIBXML2_WITH_DOCB=OFF -DLIBXML2_WITH_FTP=OFF ^
+  -DLIBXML2_WITH_HTML=OFF -DLIBXML2_WITH_HTTP=OFF -DLIBXML2_WITH_ICONV=OFF ^
+  -DLIBXML2_WITH_ICU=OFF -DLIBXML2_WITH_ISO8859X=OFF -DLIBXML2_WITH_LEGACY=OFF ^
+  -DLIBXML2_WITH_LZMA=OFF -DLIBXML2_WITH_MEM_DEBUG=OFF -DLIBXML2_WITH_MODULES=OFF ^
+  -DLIBXML2_WITH_OUTPUT=ON -DLIBXML2_WITH_PATTERN=OFF -DLIBXML2_WITH_PROGRAMS=OFF ^
+  -DLIBXML2_WITH_PUSH=OFF -DLIBXML2_WITH_PYTHON=OFF -DLIBXML2_WITH_READER=OFF ^
+  -DLIBXML2_WITH_REGEXPS=OFF -DLIBXML2_WITH_RUN_DEBUG=OFF -DLIBXML2_WITH_SAX1=OFF ^
+  -DLIBXML2_WITH_SCHEMAS=OFF -DLIBXML2_WITH_SCHEMATRON=OFF -DLIBXML2_WITH_TESTS=OFF ^
+  -DLIBXML2_WITH_THREADS=ON -DLIBXML2_WITH_THREAD_ALLOC=OFF -DLIBXML2_WITH_TREE=ON ^
+  -DLIBXML2_WITH_VALID=OFF -DLIBXML2_WITH_WRITER=OFF -DLIBXML2_WITH_XINCLUDE=OFF ^
+  -DLIBXML2_WITH_XPATH=OFF -DLIBXML2_WITH_XPTR=OFF -DLIBXML2_WITH_ZLIB=OFF ^
+  -DCMAKE_MSVC_RUNTIME_LIBRARY=MultiThreaded ^
+  ../../libxml2-v2.9.12 || exit /b 1
+ninja install || exit /b 1
+set libxmldir=%cd%\install
+set "libxmldir=%libxmldir:\=/%"
+cd ..
+exit /b 0
+
+::==============================================================================
+:: Generate a PGO profile.
+::==============================================================================
+:do_generate_profile
+REM Build Clang with instrumentation.
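REM ---------------------------------------------------------------------------
REM [Editor's note] A summary of the PGO flow implemented below; this adds no
REM behavior and only restates what the following commands do:
REM   1. Build an IR-instrumented clang-cl (LLVM_BUILD_INSTRUMENTED=IR).
REM   2. Use it to compile a representative workload (clangSema's Sema.cpp),
REM      which writes *.profraw files under instrument\profiles.
REM   3. Merge the raw profiles into profile.profdata with llvm-profdata.
REM   4. Hand the merged profile to the final build via LLVM_PROFDATA_FILE.
REM ---------------------------------------------------------------------------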
+mkdir instrument
+cd instrument
+cmake -GNinja %cmake_flags% -DLLVM_TARGETS_TO_BUILD=Native ^
+  -DLLVM_BUILD_INSTRUMENTED=IR %llvm_src%\llvm || exit /b 1
+ninja clang || ninja clang || ninja clang || exit /b 1
+set instrumented_clang=%cd:\=/%/bin/clang-cl.exe
+cd ..
+REM Use that to build part of llvm to generate a profile.
+mkdir train
+cd train
+cmake -GNinja %cmake_flags% ^
+  -DCMAKE_C_COMPILER=%instrumented_clang% ^
+  -DCMAKE_CXX_COMPILER=%instrumented_clang% ^
+  -DLLVM_ENABLE_PROJECTS=clang ^
+  -DLLVM_TARGETS_TO_BUILD=Native ^
+  %llvm_src%\llvm || exit /b 1
+REM Drop profiles generated from running cmake; those are not representative.
+del ..\instrument\profiles\*.profraw
+ninja tools/clang/lib/Sema/CMakeFiles/obj.clangSema.dir/Sema.cpp.obj
+cd ..
+set profile=%cd:\=/%/profile.profdata
+%stage0_bin_dir%\llvm-profdata merge -output=%profile% instrument\profiles\*.profraw || exit /b 1
+set common_compiler_flags=%common_compiler_flags% -Wno-backend-plugin
+set cmake_profile_flags=-DLLVM_PROFDATA_FILE=%profile% ^
+  -DCMAKE_C_FLAGS="%common_compiler_flags%" ^
+  -DCMAKE_CXX_FLAGS="%common_compiler_flags%"
+exit /b 0
+
+::=============================================================================
+:: Parse command line arguments.
+:: The format for the arguments is:
+::   Boolean: --option
+::   Value:   --option<separator>value
+::     with <separator> being: space, colon, semicolon or equal sign
+::
+:: Command line usage example:
+::   my-batch-file.bat --build --type=release --version 123
+:: It will create 3 variables:
+::   'build' with the value 'true'
+::   'type' with the value 'release'
+::   'version' with the value '123'
+::
+:: Usage:
+::   set "build="
+::   set "type="
+::   set "version="
+::
+::   REM Parse arguments.
+::   call :parse_args %*
+::
+::   if defined build (
+::     ...
+::   )
+::   if %type%=='release' (
+::     ...
+::   )
+::   if %version%=='123' (
+::     ...
+::   )
+::=============================================================================
+:parse_args
+  set "arg_name="
+  :parse_args_start
+  if "%1" == "" (
+    :: Set a seen boolean argument.
+    if "%arg_name%" neq "" (
+      set "%arg_name%=true"
+    )
+    goto :parse_args_done
+  )
+  set aux=%1
+  if "%aux:~0,2%" == "--" (
+    :: Set a seen boolean argument.
+    if "%arg_name%" neq "" (
+      set "%arg_name%=true"
+    )
+    set "arg_name=%aux:~2,250%"
+  ) else (
+    set "%arg_name%=%1"
+    set "arg_name="
+  )
+  shift
+  goto :parse_args_start
+
+:parse_args_done
+exit /b 0
diff --git a/llvm/utils/revert_checker.py b/llvm/utils/revert_checker.py
index da80bdff868571dc51bf1403a1e3b6ff03f6a894..b1c6e228e4d4199801e1d425c27e1f81803b5dd5 100755
--- a/llvm/utils/revert_checker.py
+++ b/llvm/utils/revert_checker.py
@@ -45,35 +45,78 @@ import logging
 import re
 import subprocess
 import sys
-from typing import Generator, List, NamedTuple, Iterable
+from typing import Dict, Generator, Iterable, List, NamedTuple, Optional, Tuple

 assert sys.version_info >= (3, 6), "Only Python 3.6+ is supported."

 # People are creative with their reverts, and heuristics are a bit difficult.
-# Like 90% of of reverts have "This reverts commit ${full_sha}".
-# Some lack that entirely, while others have many of them specified in ad-hoc
-# ways, while others use short SHAs and whatever.
+# At a glance, most reverts have "This reverts commit ${full_sha}". Many others
+# have `Reverts llvm/llvm-project#${PR_NUMBER}`.
 #
-# The 90% case is trivial to handle (and 100% free + automatic). The extra 10%
-# starts involving human intervention, which is probably not worth it for now.
+# By their powers combined, we should be able to automatically catch something
+# like 80% of reverts with reasonable confidence. At some point, human
+# intervention will always be required (e.g., I saw
+# ```
+# This reverts commit ${commit_sha_1} and
+# also ${commit_sha_2_shorthand}
+# ```
+# during my sample)
+
+_CommitMessageReverts = NamedTuple(
+    "_CommitMessageReverts",
+    [
+        ("potential_shas", List[str]),
+        ("potential_pr_numbers", List[int]),
+    ],
+)
+
+
+def _try_parse_reverts_from_commit_message(
+    commit_message: str,
+) -> _CommitMessageReverts:
+    """Tries to parse revert SHAs and LLVM PR numbers from the commit message.

-def _try_parse_reverts_from_commit_message(commit_message: str) -> List[str]:
+    Returns:
+        A namedtuple containing:
+        - A list of potentially reverted SHAs
+        - A list of potentially reverted LLVM PR numbers
+    """
     if not commit_message:
-        return []
+        return _CommitMessageReverts([], [])

-    results = re.findall(r"This reverts commit ([a-f0-9]{40})\b", commit_message)
+    sha_reverts = re.findall(
+        r"This reverts commit ([a-f0-9]{40})\b",
+        commit_message,
+    )

     first_line = commit_message.splitlines()[0]
     initial_revert = re.match(r'Revert ([a-f0-9]{6,}) "', first_line)
     if initial_revert:
-        results.append(initial_revert.group(1))
-    return results
+        sha_reverts.append(initial_revert.group(1))
+
+    pr_numbers = [
+        int(x)
+        for x in re.findall(
+            r"Reverts llvm/llvm-project#(\d+)",
+            commit_message,
+        )
+    ]
+
+    return _CommitMessageReverts(
+        potential_shas=sha_reverts,
+        potential_pr_numbers=pr_numbers,
+    )

-def _stream_stdout(command: List[str]) -> Generator[str, None, None]:
+
+def _stream_stdout(
+    command: List[str], cwd: Optional[str] = None
+) -> Generator[str, None, None]:
     with subprocess.Popen(
-        command, stdout=subprocess.PIPE, encoding="utf-8", errors="replace"
+        command,
+        cwd=cwd,
+        stdout=subprocess.PIPE,
+        encoding="utf-8",
+        errors="replace",
     ) as p:
         assert p.stdout is not None  # for mypy's happiness.
         yield from p.stdout
@@ -175,10 +218,43 @@ def _find_common_parent_commit(git_dir: str, ref_a: str, ref_b: str) -> str:
     ).strip()


-def find_reverts(git_dir: str, across_ref: str, root: str) -> List[Revert]:
+def _load_pr_commit_mappings(
+    git_dir: str, root: str, min_ref: str
+) -> Dict[int, List[str]]:
+    git_log = ["git", "log", "--format=%H %s", f"{min_ref}..{root}"]
+    results = collections.defaultdict(list)
+    pr_regex = re.compile(r"\s\(#(\d+)\)$")
+    for line in _stream_stdout(git_log, cwd=git_dir):
+        m = pr_regex.search(line)
+        if not m:
+            continue
+
+        pr_number = int(m.group(1))
+        sha = line.split(None, 1)[0]
+        # N.B., these are kept in log (read: reverse chronological) order,
+        # which is what's expected by `find_reverts`.
+        results[pr_number].append(sha)
+    return results
+
+
+# N.B., max_pr_lookback's default of 20K commits is arbitrary, but should be
+# enough for the 99% case of reverts: rarely should someone land a cleanish
+# revert of a >6 month old change...
+def find_reverts(
+    git_dir: str, across_ref: str, root: str, max_pr_lookback: int = 20000
+) -> List[Revert]:
     """Finds reverts across `across_ref` in `git_dir`, starting from `root`.

     These reverts are returned in order of oldest reverts first.
+
+    Args:
+        git_dir: git directory to find reverts in.
+        across_ref: the ref to find reverts across.
+        root: the 'main' ref to look for reverts on.
+        max_pr_lookback: this function uses heuristics to map PR numbers to
+            SHAs. These heuristics require that commit history from `root` to
+            `some_parent_of_root` is loaded in memory. `max_pr_lookback` is how
+            many commits behind `across_ref` should be loaded in memory.
     """
     across_sha = _rev_parse(git_dir, across_ref)
     root_sha = _rev_parse(git_dir, root)
@@ -201,8 +277,41 @@ def find_reverts(git_dir: str, across_ref: str, root: str) -> List[Revert]:
     )

     all_reverts = []
+    # Lazily load PR <-> commit mappings, since it can be expensive.
+    pr_commit_mappings = None
     for sha, commit_message in _log_stream(git_dir, root_sha, across_sha):
-        reverts = _try_parse_reverts_from_commit_message(commit_message)
+        reverts, pr_reverts = _try_parse_reverts_from_commit_message(
+            commit_message,
+        )
+        if pr_reverts:
+            if pr_commit_mappings is None:
+                logging.info(
+                    "Loading PR <-> commit mappings. This may take a moment..."
+                )
+                pr_commit_mappings = _load_pr_commit_mappings(
+                    git_dir, root_sha, f"{across_sha}~{max_pr_lookback}"
+                )
+                logging.info(
+                    "Loaded %d PR <-> commit mappings", len(pr_commit_mappings)
+                )
+
+            for reverted_pr_number in pr_reverts:
+                reverted_shas = pr_commit_mappings.get(reverted_pr_number)
+                if not reverted_shas:
+                    logging.warning(
+                        "No SHAs for reverted PR %d (commit %s)",
+                        reverted_pr_number,
+                        sha,
+                    )
+                    continue
+                logging.debug(
+                    "Inferred SHAs %s for reverted PR %d (commit %s)",
+                    reverted_shas,
+                    reverted_pr_number,
+                    sha,
+                )
+                reverts.extend(reverted_shas)
+
         if not reverts:
             continue
diff --git a/llvm/utils/revert_checker_test.py b/llvm/utils/revert_checker_test.py
index 9d992663c5be8afef71f3b8d77f227b4af6a029d..c149be8dc0dd1955a4206a835dd6fc3894ab53e0 100755
--- a/llvm/utils/revert_checker_test.py
+++ b/llvm/utils/revert_checker_test.py
@@ -96,6 +96,7 @@ class Test(unittest.TestCase):
             git_dir=get_llvm_project_path(),
             across_ref="c9944df916e41b1014dff5f6f75d52297b48ecdc~",
             root="c9944df916e41b1014dff5f6f75d52297b48ecdc",
+            max_pr_lookback=50,
         )
         self.assertEqual(reverts, [])

@@ -113,6 +114,7 @@ class Test(unittest.TestCase):
             git_dir=get_llvm_project_path(),
             across_ref="c47f971694be0159ffddfee8a75ae515eba91439",
             root="9f981e9adf9c8d29bb80306daf08d2770263ade6",
+            max_pr_lookback=50,
         )
         self.assertEqual(
             reverts,
@@ -128,6 +130,27 @@ class Test(unittest.TestCase):
             ],
         )

+    def test_pr_based_revert_works(self) -> None:
+        reverts = revert_checker.find_reverts(
+            git_dir=get_llvm_project_path(),
+            # This SHA is a direct child of the reverted SHA expected below.
+            across_ref="2d5f3b0a61fb171617012a2c3ba05fd31fb3bb1d",
+            # This SHA is a direct child of the revert SHA listed below.
+            root="2c01b278580212914ec037bb5dd9b73702dfe7f1",
+            max_pr_lookback=50,
+        )
+        self.assertEqual(
+            reverts,
+            [
+                revert_checker.Revert(
+                    # This SHA is a `Reverts ${PR}` for #111004.
+                    sha="50866e84d1da8462aeb96607bf6d9e5bbd5869c5",
+                    # ...And this was the commit for #111004.
+                    reverted_sha="67160c5ab5f5b7fd5fa7851abcfde367c8a9f91b",
+                ),
+            ],
+        )
+

 if __name__ == "__main__":
     unittest.main()
diff --git a/mlir/include/mlir/Dialect/Arith/Transforms/Passes.h b/mlir/include/mlir/Dialect/Arith/Transforms/Passes.h
index aee64475171a4321cbf6ac154cce4683ae038f3c..e866ac518dbbcb20088561e6dc125332c689b86a 100644
--- a/mlir/include/mlir/Dialect/Arith/Transforms/Passes.h
+++ b/mlir/include/mlir/Dialect/Arith/Transforms/Passes.h
@@ -70,6 +70,10 @@ std::unique_ptr<Pass> createArithUnsignedWhenEquivalentPass();
 void populateIntRangeOptimizationsPatterns(RewritePatternSet &patterns,
                                            DataFlowSolver &solver);

+/// Replace signed ops with unsigned ones where they are proven equivalent.
+void populateUnsignedWhenEquivalentPatterns(RewritePatternSet &patterns,
+                                            DataFlowSolver &solver);
+
 /// Create a pass which do optimizations based on integer range analysis.
 std::unique_ptr<Pass> createIntRangeOptimizationsPass();
diff --git a/mlir/include/mlir/Dialect/ArmSME/IR/ArmSMEIntrinsicOps.td b/mlir/include/mlir/Dialect/ArmSME/IR/ArmSMEIntrinsicOps.td
index 0e38325f9891ac81e107509087e93e74054c77f6..e81db32bcaad0336be4a59dd5800f69dddb041a6 100644
--- a/mlir/include/mlir/Dialect/ArmSME/IR/ArmSMEIntrinsicOps.td
+++ b/mlir/include/mlir/Dialect/ArmSME/IR/ArmSMEIntrinsicOps.td
@@ -71,6 +71,7 @@ class ArmSME_IntrOp
+        /*bit requiresOpBundles=*/0,
         /*list immArgPositions=*/immArgPositions,
         /*list immArgAttrNames=*/immArgAttrNames>;
diff --git a/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.td b/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.td
index a683a905cd2d6bf2f60372dd079ccf4d483f5343..cc5463ea968fc3d3341a5a8c06c8361fbb40b9b3 100644
--- a/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.td
+++ b/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.td
@@ -536,6 +536,8 @@ def OneShotBufferize : Pass<"one-shot-bufferize", "ModuleOp"> {
     Option<"unknownTypeConversion", "unknown-type-conversion", "std::string",
            /*default=*/"\"fully-dynamic-layout-map\"",
            "Controls layout maps for non-inferrable memref types.">,
+    Option<"bufferAlignment", "buffer-alignment", "uint64_t", /*default=*/"64",
+           "Sets the alignment of newly allocated buffers.">,
   ];
   let constructor = "mlir::bufferization::createOneShotBufferizePass()";
diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMDialect.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMDialect.td
index 27a2b418aadb2a4b4cf394e79bd9d29b4ffad727..ea82f7f7b8e124852940bf608bd02ddc9ccb1bfd 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/LLVMDialect.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMDialect.td
@@ -59,6 +59,8 @@ def LLVM_Dialect : Dialect {
     static StringRef getStructRetAttrName() { return "llvm.sret"; }
     static StringRef getWriteOnlyAttrName() { return "llvm.writeonly"; }
    static StringRef getZExtAttrName() { return "llvm.zeroext"; }
+    static StringRef getOpBundleSizesAttrName() { return "op_bundle_sizes"; }
+    static StringRef getOpBundleTagsAttrName() { return "op_bundle_tags"; }

     // TODO Restrict the usage of this to parameter attributes once there is an
     // alternative way of modeling memory effects on FunctionOpInterface.
     /// Name of the attribute that will cause the creation of a readnone memory
diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMIntrinsicOps.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMIntrinsicOps.td
index ab40c8ec4b65885b667fd5847dd73b02d3d846f6..845c88b1be775095fcdc7662bc5259aeab540545 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/LLVMIntrinsicOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMIntrinsicOps.td
@@ -120,7 +120,8 @@ def LLVM_Log2Op : LLVM_UnaryIntrOpF<"log2">;
 def LLVM_LogOp : LLVM_UnaryIntrOpF<"log">;
 def LLVM_Prefetch : LLVM_ZeroResultIntrOp<"prefetch", [0], /*traits=*/[],
     /*requiresAccessGroup=*/0, /*requiresAliasAnalysis=*/0,
-    /*immArgPositions=*/[1, 2, 3], /*immArgAttrNames=*/["rw", "hint", "cache"]
+    /*requiresOpBundles=*/0, /*immArgPositions=*/[1, 2, 3],
+    /*immArgAttrNames=*/["rw", "hint", "cache"]
 > {
   let arguments = (ins LLVM_AnyPointer:$addr, I32Attr:$rw, I32Attr:$hint, I32Attr:$cache);
 }
@@ -176,7 +177,8 @@ class LLVM_MemcpyIntrOpBase :
       DeclareOpInterfaceMethods,
       DeclareOpInterfaceMethods],
     /*requiresAccessGroup=*/1, /*requiresAliasAnalysis=*/1,
-    /*immArgPositions=*/[3], /*immArgAttrNames=*/["isVolatile"]> {
+    /*requiresOpBundles=*/0, /*immArgPositions=*/[3],
+    /*immArgAttrNames=*/["isVolatile"]> {
   dag args = (ins Arg:$dst,
                   Arg:$src,
                   AnySignlessInteger:$len, I1Attr:$isVolatile);
@@ -206,7 +208,8 @@ def LLVM_MemcpyInlineOp :
       DeclareOpInterfaceMethods,
       DeclareOpInterfaceMethods],
     /*requiresAccessGroup=*/1, /*requiresAliasAnalysis=*/1,
-    /*immArgPositions=*/[2, 3], /*immArgAttrNames=*/["len", "isVolatile"]> {
+    /*requiresOpBundles=*/0, /*immArgPositions=*/[2, 3],
+    /*immArgAttrNames=*/["len", "isVolatile"]> {
   dag args = (ins Arg:$dst,
                   Arg:$src,
                   APIntAttr:$len, I1Attr:$isVolatile);
@@ -232,7 +235,8 @@ def LLVM_MemsetOp : LLVM_ZeroResultIntrOp<"memset", [0, 2],
       DeclareOpInterfaceMethods,
       DeclareOpInterfaceMethods],
     /*requiresAccessGroup=*/1, /*requiresAliasAnalysis=*/1,
-    /*immArgPositions=*/[3], /*immArgAttrNames=*/["isVolatile"]> {
+    /*requiresOpBundles=*/0, /*immArgPositions=*/[3],
+    /*immArgAttrNames=*/["isVolatile"]> {
   dag args = (ins Arg:$dst, I8:$val, AnySignlessInteger:$len, I1Attr:$isVolatile);
   // Append the alias attributes defined by LLVM_IntrOpBase.
@@ -286,7 +290,8 @@ def LLVM_NoAliasScopeDeclOp
 class LLVM_LifetimeBaseOp : LLVM_ZeroResultIntrOp],
     /*requiresAccessGroup=*/0, /*requiresAliasAnalysis=*/0,
-    /*immArgPositions=*/[0], /*immArgAttrNames=*/["size"]> {
+    /*requiresOpBundles=*/0, /*immArgPositions=*/[0],
+    /*immArgAttrNames=*/["size"]> {
   let arguments = (ins I64Attr:$size, LLVM_AnyPointer:$ptr);
   let assemblyFormat = "$size `,` $ptr attr-dict `:` qualified(type($ptr))";
 }
@@ -306,7 +311,8 @@ def LLVM_InvariantStartOp : LLVM_OneResultIntrOp<"invariant.start", [], [1],
 def LLVM_InvariantEndOp : LLVM_ZeroResultIntrOp<"invariant.end", [2],
     [DeclareOpInterfaceMethods],
     /*requiresAccessGroup=*/0, /*requiresAliasAnalysis=*/0,
-    /*immArgPositions=*/[1], /*immArgAttrNames=*/["size"]> {
+    /*requiresOpBundles=*/0, /*immArgPositions=*/[1],
+    /*immArgAttrNames=*/["size"]> {
   let arguments = (ins LLVM_DefaultPointer:$start,
                        I64Attr:$size,
                        LLVM_AnyPointer:$ptr);
@@ -368,7 +374,7 @@ class LLVM_ConstrainedIntr
     SmallVector mlirOperands;
     SmallVector mlirAttrs;
     if (failed(moduleImport.convertIntrinsicArguments(
-            llvmOperands.take_front( }] # numArgs # [{),
+            llvmOperands.take_front( }] # numArgs # [{), {}, false, {}, {},
             mlirOperands, mlirAttrs))) {
       return failure();
     }
@@ -429,7 +435,26 @@ def LLVM_USHLSat : LLVM_BinarySameArgsIntrOpI<"ushl.sat">;
 //
 def LLVM_AssumeOp
-    : LLVM_ZeroResultIntrOp<"assume", []>, Arguments<(ins I1:$cond)>;
+    : LLVM_ZeroResultIntrOp<"assume", /*overloadedOperands=*/[], /*traits=*/[],
+                            /*requiresAccessGroup=*/0,
+                            /*requiresAliasAnalysis=*/0,
+                            /*requiresOpBundles=*/1> {
+  dag args = (ins I1:$cond);
+  let arguments = !con(args, opBundleArgs);
+
+  let assemblyFormat = [{
+    $cond
+    ( custom($op_bundle_operands, type($op_bundle_operands),
+             $op_bundle_tags)^ )?
+    `:` type($cond) attr-dict
+  }];
+
+  let builders = [
+    OpBuilder<(ins "Value":$cond)>
+  ];
+
+  let hasVerifier = 1;
+}

 def LLVM_SSACopyOp : LLVM_OneResultIntrOp<"ssa.copy", [], [0],
     [Pure, SameOperandsAndResultType]> {
@@ -992,7 +1017,8 @@ def LLVM_DebugTrap : LLVM_ZeroResultIntrOp<"debugtrap">;
 def LLVM_UBSanTrap : LLVM_ZeroResultIntrOp<"ubsantrap",
     /*overloadedOperands=*/[], /*traits=*/[],
     /*requiresAccessGroup=*/0, /*requiresAliasAnalysis=*/0,
-    /*immArgPositions=*/[0], /*immArgAttrNames=*/["failureKind"]> {
+    /*requiresOpBundles=*/0, /*immArgPositions=*/[0],
+    /*immArgAttrNames=*/["failureKind"]> {
   let arguments = (ins I8Attr:$failureKind);
 }
diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMOpBase.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMOpBase.td
index c3d352d8d0dd4827f17571a003383b07f40aa97c..a38dafa4d9cf3413d94d136dd1c1180b4d70b6b9 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/LLVMOpBase.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMOpBase.td
@@ -291,7 +291,7 @@ class LLVM_IntrOpBase overloadedResults,
                       list overloadedOperands, list traits, int numResults,
                       bit requiresAccessGroup = 0,
                       bit requiresAliasAnalysis = 0,
-                      bit requiresFastmath = 0,
+                      bit requiresFastmath = 0, bit requiresOpBundles = 0,
                       list immArgPositions = [],
                       list immArgAttrNames = []>
     : LLVM_OpBase:$noalias_scopes,
             OptionalAttr:$tbaa),
        (ins )));
+  dag opBundleArgs = !if(!gt(requiresOpBundles, 0),
+       (ins VariadicOfVariadic:$op_bundle_operands,
+            DenseI32ArrayAttr:$op_bundle_sizes,
+            OptionalAttr:$op_bundle_tags),
+       (ins ));
   string llvmEnumName = enumName;
   string overloadedResultsCpp = "{" # !interleave(overloadedResults, ", ") # "}";
   string overloadedOperandsCpp = "{" # !interleave(overloadedOperands, ", ") # "}";
@@ -336,6 +342,8 @@ class LLVM_IntrOpBase
     SmallVector mlirAttrs;
     if (failed(moduleImport.convertIntrinsicArguments(
         llvmOperands,
+        llvmOpBundles,
+        }] # !if(!gt(requiresOpBundles, 0), "true", "false") # [{,
         }] # immArgPositionsCpp # [{,
        }] # immArgAttrNamesCpp # [{,
         mlirOperands,
@@ -381,12 +389,14 @@ class LLVM_IntrOp overloadedResults,
               list overloadedOperands, list traits, int numResults,
               bit requiresAccessGroup = 0, bit requiresAliasAnalysis = 0,
               bit requiresFastmath = 0,
+              bit requiresOpBundles = 0,
               list immArgPositions = [],
               list immArgAttrNames = []>
     : LLVM_IntrOpBase;
+                     requiresFastmath, requiresOpBundles, immArgPositions,
+                     immArgAttrNames>;

 // Base class for LLVM intrinsic operations returning no results. Places the
 // intrinsic into the LLVM dialect and prefixes its name with "intr.".
@@ -406,11 +416,13 @@ class LLVM_ZeroResultIntrOp overloadedOperands = [],
                             list traits = [],
                             bit requiresAccessGroup = 0,
                             bit requiresAliasAnalysis = 0,
+                            bit requiresOpBundles = 0,
                             list immArgPositions = [],
                             list immArgAttrNames = []>
     : LLVM_IntrOp;
+                  /*requiresFastMath=*/0, requiresOpBundles, immArgPositions,
+                  immArgAttrNames>;

 // Base class for LLVM intrinsic operations returning one result. Places the
 // intrinsic into the LLVM dialect and prefixes its name with "intr.". This is
@@ -422,11 +434,12 @@ class LLVM_OneResultIntrOp overloadedResults = [],
                            list overloadedOperands = [],
                            list traits = [],
                            bit requiresFastmath = 0,
-                           list immArgPositions = [],
-                           list immArgAttrNames = []>
+                           list immArgPositions = [],
+                           list immArgAttrNames = []>
     : LLVM_IntrOp;
+                  requiresFastmath, /*requiresOpBundles=*/0, immArgPositions,
+                  immArgAttrNames>;

 def LLVM_OneResultOpBuilder :
   OpBuilder<(ins "Type":$resultType, "ValueRange":$operands,
diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td
index bbca7bc7286acb4c5266dc40faf9ffd2652ddbf6..d5def510a904d3de71dac1587153150711406dc7 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td
@@ -559,11 +559,7 @@ def LLVM_InvokeOp : LLVM_Op<"invoke", [
                    VariadicOfVariadic:$op_bundle_operands,
                    DenseI32ArrayAttr:$op_bundle_sizes,
-                   DefaultValuedProperty<
-                     ArrayProperty,
-                     "ArrayRef{}",
-                     "SmallVector{}"
-                   >:$op_bundle_tags);
+                   OptionalAttr:$op_bundle_tags);
   let results = (outs Optional:$result);
   let successors = (successor AnySuccessor:$normalDest,
                               AnySuccessor:$unwindDest);
@@ -678,11 +674,7 @@ def LLVM_CallOp : LLVM_MemAccessOpBase<"call",
                   VariadicOfVariadic:$op_bundle_operands,
                   DenseI32ArrayAttr:$op_bundle_sizes,
-                  DefaultValuedProperty<
-                    ArrayProperty,
-                    "ArrayRef{}",
-                    "SmallVector{}"
-                  >:$op_bundle_tags);
+                  OptionalAttr:$op_bundle_tags);
   // Append the aliasing related attributes defined in LLVM_MemAccessOpBase.
   let arguments = !con(args, aliasAttrs);
   let results = (outs Optional:$result);
@@ -1930,11 +1922,7 @@ def LLVM_CallIntrinsicOp
                    VariadicOfVariadic:$op_bundle_operands,
                    DenseI32ArrayAttr:$op_bundle_sizes,
-                   DefaultValuedProperty<
-                     ArrayProperty,
-                     "ArrayRef{}",
-                     "SmallVector{}"
-                   >:$op_bundle_tags);
+                   OptionalAttr:$op_bundle_tags);
   let results = (outs Optional:$results);
   let llvmBuilder = [{
     return convertCallLLVMIntrinsicOp(op, builder, moduleTranslation);
diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
index 152715f281088e243dc6373272cff3221b822684..5806295cedb198c414813ac4bd36ea54ef99a84b 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
@@ -139,9 +139,22 @@ class NVVM_SpecialRangeableRegisterOp traits = []>
 }

 //===----------------------------------------------------------------------===//
-// Lane index and range
+// Lane, Warp, SM, Grid index and range
 def NVVM_LaneIdOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.laneid">;
 def NVVM_WarpSizeOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.warpsize">;
+def NVVM_WarpIdOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.warpid">;
+def NVVM_WarpDimOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.nwarpid">;
+def NVVM_SmIdOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.smid">;
+def NVVM_SmDimOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.nsmid">;
+def NVVM_GridIdOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.gridid">;
+
+//===----------------------------------------------------------------------===//
+// Lane Mask Comparison Ops
+def NVVM_LaneMaskEqOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.lanemask.eq">;
+def NVVM_LaneMaskLeOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.lanemask.le">;
+def NVVM_LaneMaskLtOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.lanemask.lt">;
+def NVVM_LaneMaskGeOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.lanemask.ge">;
+def NVVM_LaneMaskGtOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.lanemask.gt">;

 //===----------------------------------------------------------------------===//
 // Thread index and range
@@ -189,6 +202,13 @@ def NVVM_ClusterDim : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.cluster.nct
 // Clock registers
 def NVVM_ClockOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.clock">;
 def NVVM_Clock64Op : NVVM_SpecialRegisterOp<"read.ptx.sreg.clock64">;
+def NVVM_GlobalTimerOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.globaltimer">;
+
+//===----------------------------------------------------------------------===//
+// envreg registers
+foreach index = !range(0, 32) in {
+  def NVVM_EnvReg # index # Op : NVVM_SpecialRegisterOp<"read.ptx.sreg.envreg" # index>;
+}

 //===----------------------------------------------------------------------===//
 // NVVM approximate op definitions
diff --git a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
index b80d9ae88910c43c27cda72e774b0cb9fc01b818..3695708439d91f1008fcddd17031beb67b76c85f 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
@@ -98,7 +98,7 @@ class ROCDL_IntrOp overloadedResults,
     LLVM_IntrOpBase;
+                   requiresAliasAnalysis, 0, 0, immArgPositions, immArgAttrNames>;

 //===----------------------------------------------------------------------===//
 // ROCDL special register op definitions
@@ -297,6 +297,24 @@ def ROCDL_SchedBarrier : ROCDL_IntrOp<"sched.barrier", [], [], [], 0>,
     "createIntrinsicCall(builder,
      llvm::Intrinsic::amdgcn_sched_barrier,builder.getInt32(op.getMask()));";
 }

+def ROCDL_SchedGroupBarrier : ROCDL_IntrOp<"sched.group.barrier", [], [], [], 0>,
+  Arguments<(ins I32Attr:$mask, I32Attr:$size, I32Attr:$groupId)> {
+  let results = (outs);
+  let assemblyFormat = "$mask `,` $size `,` $groupId attr-dict";
+  string llvmBuilder = [{
+    createIntrinsicCall(builder,
+        llvm::Intrinsic::amdgcn_sched_group_barrier,
+        {builder.getInt32(op.getMask()), builder.getInt32(op.getSize()), builder.getInt32(op.getGroupId())});
+  }];
+}
+
+def ROCDL_IglpOpt : ROCDL_IntrOp<"iglp.opt", [], [], [], 0>,
+  Arguments<(ins I32Attr:$variant)> {
+  let results = (outs);
+  let assemblyFormat = "$variant attr-dict";
+  string llvmBuilder =
+    "createIntrinsicCall(builder, llvm::Intrinsic::amdgcn_iglp_opt,builder.getInt32(op.getVariant()));";
+}

 //===---------------------------------------------------------------------===//
 // Xdlops intrinsics
diff --git a/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td b/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td
index f9036cf96e9a1d29ebd6709740581ba8b610902d..0915bbde3072b01442d9675aecab531148b34827 100644
--- a/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td
+++ b/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td
@@ -1028,7 +1028,7 @@ def PadOp : Op:$pad_to_multiple_of,
                        DefaultValuedOptionalAttr:
                           $static_pad_to_multiple_of,
-                       DefaultValuedAttr:$pack_paddings,
+                       DefaultValuedAttr:$nofold_flags,
                        DefaultValuedAttr<
                           TypedArrayAttrBase,
                           "{}">:$transpose_paddings,
@@ -1055,13 +1055,13 @@ def PadOp : Op":$paddingDimensions,
                    CArg<"ArrayRef", "{}">:$staticPadToMultipleOf,
-                   CArg<"ArrayRef", "{}">:$packPaddings,
+                   CArg<"ArrayRef", "{}">:$nofoldFlags,
                    CArg<"ArrayRef", "{}">:$transposePaddings,
                    CArg<"StringRef", "::mlir::bufferization::MaterializeInDestinationOp::getOperationName()">:$copyBackOp)>,
    OpBuilder<(ins "Value":$target,
                   "ArrayRef":$paddingDimensions,
                   "ArrayRef":$mixedPadToMultipleOf,
-                  CArg<"ArrayRef", "{}">:$packPaddings,
+                  CArg<"ArrayRef", "{}">:$nofoldFlags,
                   CArg<"ArrayRef", "{}">:$transposePaddings,
                   CArg<"StringRef", "::mlir::bufferization::MaterializeInDestinationOp::getOperationName()">:$copyBackOp)>
   ];
diff --git a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
index 0693e31b4f70afa1ab692bff8393f8d6e5e6f7a9..70b086641bdc18d81c4aa4576da31205cab6546d 100644
--- a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
+++ b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
@@ -296,9 +296,9 @@ struct LinalgPaddingOptions {
   }
   /// A flag for every operand to mark the PadOp as nofold which enables
   /// packing for statically shaped operands.
-  SmallVector packPaddings;
-  LinalgPaddingOptions &setPackPaddings(ArrayRef pp) {
-    packPaddings.assign(pp.begin(), pp.end());
+  SmallVector nofoldFlags;
+  LinalgPaddingOptions &setNofoldFlags(ArrayRef pp) {
+    nofoldFlags.assign(pp.begin(), pp.end());
     return *this;
   }
   /// A number of loops to hoist the PadOp out for every operand.
@@ -530,7 +530,7 @@ void peelLoops(RewriterBase &rewriter, ArrayRef loops);
 ///
 /// * "options.padToMultipleOf" indicates that each padding dimension should be
 ///   padded to the specified multiple.
-/// * Use "options.paddingValues" and "options.packPaddings" to set padding
+/// * Use "options.paddingValues" and "options.nofoldFlags" to set padding
 ///   value and nofold attribute of the created tensor::PadOps, respectively.
 /// * The unpadded results (extracted slice of the cloned operation) are
 ///   returned via `replacements`.
diff --git a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td
index 11649ef2d03370a19524f4c50d6a46832262e1fd..45313200d4f0b9746d0767056e471b991cb55b87 100644
--- a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td
+++ b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td
@@ -137,7 +137,7 @@ def PrivateClauseOp : OpenMP_Op<"private", [IsolatedFromAbove, RecipeInterface]>
     }
   }];

-  let hasVerifier = 1;
+  let hasRegionVerifier = 1;
 }

 //===----------------------------------------------------------------------===//
@@ -175,6 +175,7 @@ def ParallelOp : OpenMP_Op<"parallel", traits = [
   }];

   let hasVerifier = 1;
+  let hasRegionVerifier = 1;
 }

 def TerminatorOp : OpenMP_Op<"terminator", [Terminator, Pure]> {
@@ -426,6 +427,7 @@ def WsloopOp : OpenMP_Op<"wsloop", traits = [
   }];

   let hasVerifier = 1;
+  let hasRegionVerifier = 1;
 }

 //===----------------------------------------------------------------------===//
@@ -479,6 +481,7 @@ def SimdOp : OpenMP_Op<"simd", traits = [
   }];

   let hasVerifier = 1;
+  let hasRegionVerifier = 1;
 }


@@ -556,6 +559,7 @@ def DistributeOp : OpenMP_Op<"distribute", traits = [
   }];

   let hasVerifier = 1;
+  let hasRegionVerifier = 1;
 }

 //===----------------------------------------------------------------------===//
@@ -693,6 +697,7 @@ def TaskloopOp : OpenMP_Op<"taskloop", traits = [
   }] # clausesExtraClassDeclaration;

   let hasVerifier = 1;
+  let hasRegionVerifier = 1;
 }

 def TaskgroupOp : OpenMP_Op<"taskgroup", traits = [
diff --git a/mlir/include/mlir/Dialect/OpenMP/OpenMPOpsInterfaces.td b/mlir/include/mlir/Dialect/OpenMP/OpenMPOpsInterfaces.td
index 22521b08637cf8c17e2fd9f7a5218369b6c31987..8b72689dc3fd87f66b82afaa2c939dee720c3412 100644
--- a/mlir/include/mlir/Dialect/OpenMP/OpenMPOpsInterfaces.td
+++ b/mlir/include/mlir/Dialect/OpenMP/OpenMPOpsInterfaces.td
@@ -258,6 +258,7 @@ def LoopWrapperInterface : OpInterface<"LoopWrapperInterface"> {
   let verify = [{
     return ::llvm::cast<::mlir::omp::LoopWrapperInterface>($_op).verifyImpl();
   }];
+  let verifyWithRegions = 1;
 }

 def ComposableOpInterface : OpInterface<"ComposableOpInterface"> {
diff --git a/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td b/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td
index 07402c8695b382e04b5b16a869bab1ce3978c4ed..3bb5ceb0f4695ba2a54413db7b2492f0d3bd1f05 100644
--- a/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td
+++ b/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td
@@ -1877,21 +1877,23 @@ def Tosa_RescaleOp: Tosa_Op<"rescale", [Pure,
   let description = [{
     Rescale quantized values into a new domain.

     Supported rescalings are:
-    Mode                    Input   Output
-    signed 8 to 8           int8    int8
-    signed 8 to 16          int8    int16
-    signed 8 to 32          int8    int32
-    signed 16 to 8          int16   int8
-    signed 16 to 16         int16   int16
-    signed 16 to 32         int16   int32
-    signed 32 to 8          int32   int8
-    signed 32 to 16         int32   int16
-    signed 32 to 32         int32   int32
-    signed 48 to 8          int48   int8
-    signed 48 to 16         int48   int16
-    signed 48 to 32         int48   int32
-    unsigned 8 to signed 8  uint8   int8
-    signed 8 to unsigned 8  int8    uint8
+
+    | Mode                   | Input | Output |
+    |------------------------|-------|--------|
+    | signed 8 to 8          | int8  | int8   |
+    | signed 8 to 16         | int8  | int16  |
+    | signed 8 to 32         | int8  | int32  |
+    | signed 16 to 8         | int16 | int8   |
+    | signed 16 to 16        | int16 | int16  |
+    | signed 16 to 32        | int16 | int32  |
+    | signed 32 to 8         | int32 | int8   |
+    | signed 32 to 16        | int32 | int16  |
+    | signed 32 to 32        | int32 | int32  |
+    | signed 48 to 8         | int48 | int8   |
+    | signed 48 to 16        | int48 | int16  |
+    | signed 48 to 32        | int48 | int32  |
+    | unsigned 8 to signed 8 | uint8 | int8   |
+    | signed 8 to unsigned 8 | int8  | uint8  |
   }];

   let arguments = (ins
diff --git a/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td b/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td
index b0de7c11b9d43692a6b3a956e49c015d94d5158c..c02b16ea931706dde60504cf0e109bbe7c44e7fc 100644
--- a/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td
+++ b/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td
@@ -2808,7 +2808,7 @@ def Vector_SplatOp : Vector_Op<"splat", [

     ```mlir
     %s = arith.constant 10.1 : f32
-    %t = vector.splat %s : vector<8x16xi32>
+    %t = vector.splat %s : vector<8x16xf32>
     ```
   }];
diff --git a/mlir/include/mlir/IR/BuiltinAttributes.td b/mlir/include/mlir/IR/BuiltinAttributes.td
index 530ba7d2f11e5cc8b0b80e9f962d666f959e81ca..492b8309a5ea3356a8dcbe09c324771216fcfb1c 100644
--- a/mlir/include/mlir/IR/BuiltinAttributes.td
+++ b/mlir/include/mlir/IR/BuiltinAttributes.td
@@ -702,6 +702,7 @@ def Builtin_IntegerAttr : Builtin_Attr<"Integer", "integer",
       }

       // TODO: Avoid implicit trunc?
+      // See https://github.com/llvm/llvm-project/issues/112510.
       IntegerType intTy = ::llvm::cast(type);
       APInt apValue(intTy.getWidth(), value, intTy.isSignedInteger(),
                     /*implicitTrunc=*/true);
diff --git a/mlir/include/mlir/IR/Diagnostics.h b/mlir/include/mlir/IR/Diagnostics.h
index cb30bb3f59688aefca29a7eabdf6915fbd0abb33..8429325412dc97897c55b57d160deaf878d26c42 100644
--- a/mlir/include/mlir/IR/Diagnostics.h
+++ b/mlir/include/mlir/IR/Diagnostics.h
@@ -183,7 +183,8 @@ public:
   Diagnostic &operator<<(StringAttr val);

   /// Stream in a string literal.
-  Diagnostic &operator<<(const char *val) {
+  template <int n>
+  Diagnostic &operator<<(const char (&val)[n]) {
     arguments.push_back(DiagnosticArgument(val));
     return *this;
   }
diff --git a/mlir/include/mlir/IR/MLIRContext.h b/mlir/include/mlir/IR/MLIRContext.h
index d17bbac81655b59f3cdd21441c101c5a19db34fd..ef8dab87f131a1d0fa408fe38bda29fb749fcca9 100644
--- a/mlir/include/mlir/IR/MLIRContext.h
+++ b/mlir/include/mlir/IR/MLIRContext.h
@@ -197,6 +197,11 @@ public:
   /// operations.
   ArrayRef getRegisteredOperations();

+  /// Return a sorted array containing the information for registered operations
+  /// filtered by dialect name.
+  ArrayRef
+  getRegisteredOperationsByDialect(StringRef dialectName);
+
   /// Return true if this operation name is registered in this context.
   bool isOperationRegistered(StringRef name);
diff --git a/mlir/include/mlir/Interfaces/InferTypeOpInterface.h b/mlir/include/mlir/Interfaces/InferTypeOpInterface.h
index 47bcfc9bbd4f96ad86ee76bb48276faf92ac04b6..4fcbeff9df560934bc0cb5170472e304bc825009 100644
--- a/mlir/include/mlir/Interfaces/InferTypeOpInterface.h
+++ b/mlir/include/mlir/Interfaces/InferTypeOpInterface.h
@@ -244,6 +244,10 @@ inferReturnTensorTypes(ArrayRef retComponents,
 /// Verifies that the inferred result types match the actual result types for
 /// the op. Precondition: op implements InferTypeOpInterface.
 LogicalResult verifyInferredResultTypes(Operation *op);
+
+/// Report a fatal error indicating that the result types could not be
+/// inferred.
+void reportFatalInferReturnTypesError(OperationState &state);
 } // namespace detail

 namespace OpTrait {
diff --git a/mlir/include/mlir/Target/LLVMIR/ModuleImport.h b/mlir/include/mlir/Target/LLVMIR/ModuleImport.h
index 9f300bcafea53737d71ba0809555cfdcf5af8164..bbb7af58d273937053d5e78814a81018070181eb 100644
--- a/mlir/include/mlir/Target/LLVMIR/ModuleImport.h
+++ b/mlir/include/mlir/Target/LLVMIR/ModuleImport.h
@@ -243,6 +243,8 @@ public:
   /// corresponding MLIR attribute names.
   LogicalResult
   convertIntrinsicArguments(ArrayRef values,
+                            ArrayRef opBundles,
+                            bool requiresOpBundles,
                             ArrayRef immArgPositions,
                             ArrayRef immArgAttrNames,
                             SmallVectorImpl &valuesOut,
diff --git a/mlir/lib/Analysis/FlatLinearValueConstraints.cpp b/mlir/lib/Analysis/FlatLinearValueConstraints.cpp
index e628fb152b52f835805baa9993102a1b1e047dcf..0d6ff2fd908db9d849425bf4dd6193b0091ed4f6 100644
--- a/mlir/lib/Analysis/FlatLinearValueConstraints.cpp
+++ b/mlir/lib/Analysis/FlatLinearValueConstraints.cpp
@@ -892,8 +892,8 @@ FlatLinearValueConstraints::FlatLinearValueConstraints(IntegerSet set,
           set.getNumDims() + set.getNumSymbols() + 1,
           set.getNumDims(), set.getNumSymbols(),
           /*numLocals=*/0) {
-  assert(operands.empty() ||
-         set.getNumInputs() == operands.size() && "operand count mismatch");
+  assert((operands.empty() || set.getNumInputs() == operands.size()) &&
+         "operand count mismatch");

   // Set the values for the non-local variables.
   for (unsigned i = 0, e = operands.size(); i < e; ++i)
     setValue(i, operands[i]);
diff --git a/mlir/lib/Dialect/Arith/Transforms/EmulateNarrowType.cpp b/mlir/lib/Dialect/Arith/Transforms/EmulateNarrowType.cpp
index 4be0e06fe2a5e58a2e69ad525baaf32fe2691d1f..fddd7c51bfbc8745a65f6b5c4a8968309c83799d 100644
--- a/mlir/lib/Dialect/Arith/Transforms/EmulateNarrowType.cpp
+++ b/mlir/lib/Dialect/Arith/Transforms/EmulateNarrowType.cpp
@@ -40,11 +40,11 @@ arith::NarrowTypeEmulationConverter::NarrowTypeEmulationConverter(
   addConversion([this](FunctionType ty) -> std::optional {
     SmallVector inputs;
     if (failed(convertTypes(ty.getInputs(), inputs)))
-      return std::nullopt;
+      return nullptr;

     SmallVector results;
     if (failed(convertTypes(ty.getResults(), results)))
-      return std::nullopt;
+      return nullptr;

     return FunctionType::get(ty.getContext(), inputs, results);
   });
diff --git a/mlir/lib/Dialect/Arith/Transforms/UnsignedWhenEquivalent.cpp b/mlir/lib/Dialect/Arith/Transforms/UnsignedWhenEquivalent.cpp
index 4edce84bafd4160fe0a7a4d6d358229b3b374686..bebe0b5a7c0b616b0d3b81c518e5fe8ebdb764dc 100644
--- a/mlir/lib/Dialect/Arith/Transforms/UnsignedWhenEquivalent.cpp
+++ b/mlir/lib/Dialect/Arith/Transforms/UnsignedWhenEquivalent.cpp
@@ -13,7 +13,8 @@
 #include "mlir/Analysis/DataFlow/DeadCodeAnalysis.h"
 #include "mlir/Analysis/DataFlow/IntegerRangeAnalysis.h"
 #include "mlir/Dialect/Arith/IR/Arith.h"
-#include "mlir/Transforms/DialectConversion.h"
+#include "mlir/IR/PatternMatch.h"
+#include "mlir/Transforms/GreedyPatternRewriteDriver.h"

 namespace mlir {
 namespace arith {
@@ -29,6 +30,9 @@ using namespace mlir::dataflow;
 /// Succeeds when a value is statically non-negative in that it has a lower
 /// bound on its value (if it is treated as signed) and that bound is
 /// non-negative.
+// TODO: IntegerRangeAnalysis internally assumes index is 64bit and this pattern
+// relies on this. These transformations may not be valid for 32bit index,
+// need more investigation.
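// [Editor's note] An illustrative example, not taken from the patch, of the
// rewrite these patterns perform: when integer range analysis proves both
// operands non-negative,
//
//   %q = arith.divsi %a, %b : i32   // both operands known to be >= 0
//
// can be rewritten as
//
//   %q = arith.divui %a, %b : i32
//
// because signed and unsigned division agree on non-negative inputs.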
static LogicalResult staticallyNonNegative(DataFlowSolver &solver, Value v) { auto *result = solver.lookupState(v); if (!result || result->getValue().isUninitialized()) @@ -85,35 +89,60 @@ static CmpIPredicate toUnsignedPred(CmpIPredicate pred) { } namespace { +class DataFlowListener : public RewriterBase::Listener { +public: + DataFlowListener(DataFlowSolver &s) : s(s) {} + +protected: + void notifyOperationErased(Operation *op) override { + s.eraseState(s.getProgramPointAfter(op)); + for (Value res : op->getResults()) + s.eraseState(res); + } + + DataFlowSolver &s; +}; + template -struct ConvertOpToUnsigned : OpConversionPattern { - using OpConversionPattern::OpConversionPattern; +struct ConvertOpToUnsigned final : OpRewritePattern { + ConvertOpToUnsigned(MLIRContext *context, DataFlowSolver &s) + : OpRewritePattern(context), solver(s) {} - LogicalResult matchAndRewrite(Signed op, typename Signed::Adaptor adaptor, - ConversionPatternRewriter &rw) const override { - rw.replaceOpWithNewOp(op, op->getResultTypes(), - adaptor.getOperands(), op->getAttrs()); + LogicalResult matchAndRewrite(Signed op, PatternRewriter &rw) const override { + if (failed( + staticallyNonNegative(this->solver, static_cast(op)))) + return failure(); + + rw.replaceOpWithNewOp(op, op->getResultTypes(), op->getOperands(), + op->getAttrs()); return success(); } + +private: + DataFlowSolver &solver; }; -struct ConvertCmpIToUnsigned : OpConversionPattern { - using OpConversionPattern::OpConversionPattern; +struct ConvertCmpIToUnsigned final : OpRewritePattern { + ConvertCmpIToUnsigned(MLIRContext *context, DataFlowSolver &s) + : OpRewritePattern(context), solver(s) {} + + LogicalResult matchAndRewrite(CmpIOp op, PatternRewriter &rw) const override { + if (failed(isCmpIConvertable(this->solver, op))) + return failure(); - LogicalResult matchAndRewrite(CmpIOp op, CmpIOpAdaptor adaptor, - ConversionPatternRewriter &rw) const override { rw.replaceOpWithNewOp(op, toUnsignedPred(op.getPredicate()), op.getLhs(), op.getRhs()); return success(); } + +private: + DataFlowSolver &solver; }; struct ArithUnsignedWhenEquivalentPass : public arith::impl::ArithUnsignedWhenEquivalentBase< ArithUnsignedWhenEquivalentPass> { - /// Implementation structure: first find all equivalent ops and collect them, - /// then perform all the rewrites in a second pass over the target op. This - /// ensures that analysis results are not invalidated during rewriting. 
+ void runOnOperation() override { Operation *op = getOperation(); MLIRContext *ctx = op->getContext(); @@ -123,35 +152,32 @@ struct ArithUnsignedWhenEquivalentPass if (failed(solver.initializeAndRun(op))) return signalPassFailure(); - ConversionTarget target(*ctx); - target.addLegalDialect(); - target.addDynamicallyLegalOp( - [&solver](Operation *op) -> std::optional { - return failed(staticallyNonNegative(solver, op)); - }); - target.addDynamicallyLegalOp( - [&solver](CmpIOp op) -> std::optional { - return failed(isCmpIConvertable(solver, op)); - }); + DataFlowListener listener(solver); RewritePatternSet patterns(ctx); - patterns.add, - ConvertOpToUnsigned, - ConvertOpToUnsigned, - ConvertOpToUnsigned, - ConvertOpToUnsigned, - ConvertOpToUnsigned, - ConvertOpToUnsigned, ConvertCmpIToUnsigned>( - ctx); - - if (failed(applyPartialConversion(op, target, std::move(patterns)))) { + populateUnsignedWhenEquivalentPatterns(patterns, solver); + + GreedyRewriteConfig config; + config.listener = &listener; + + if (failed(applyPatternsAndFoldGreedily(op, std::move(patterns), config))) signalPassFailure(); - } } }; } // end anonymous namespace +void mlir::arith::populateUnsignedWhenEquivalentPatterns( + RewritePatternSet &patterns, DataFlowSolver &solver) { + patterns.add, + ConvertOpToUnsigned, + ConvertOpToUnsigned, + ConvertOpToUnsigned, + ConvertOpToUnsigned, + ConvertOpToUnsigned, + ConvertOpToUnsigned, ConvertCmpIToUnsigned>( + patterns.getContext(), solver); +} + std::unique_ptr mlir::arith::createArithUnsignedWhenEquivalentPass() { return std::make_unique(); } diff --git a/mlir/lib/Dialect/Bufferization/Transforms/Bufferize.cpp b/mlir/lib/Dialect/Bufferization/Transforms/Bufferize.cpp index 875d8c40e92cc1612e96a03c98ec21ee7148d882..1d009b03754c52b3b288bf26352632d30c8c6aca 100644 --- a/mlir/lib/Dialect/Bufferization/Transforms/Bufferize.cpp +++ b/mlir/lib/Dialect/Bufferization/Transforms/Bufferize.cpp @@ -224,6 +224,7 @@ struct OneShotBufferizePass }; } opt.printConflicts = printConflicts; + opt.bufferAlignment = bufferAlignment; opt.testAnalysisOnly = testAnalysisOnly; opt.bufferizeFunctionBoundaries = bufferizeFunctionBoundaries; opt.checkParallelRegions = checkParallelRegions; diff --git a/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp index 12ed8cc88ae7b7c4d78b5c26eb911b0156520a4d..cc73878a64ff67eaf5912fe4a934f8e09af9f8b5 100644 --- a/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp +++ b/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp @@ -241,13 +241,18 @@ static void printOneOpBundle(OpAsmPrinter &p, OperandRange operands, static void printOpBundles(OpAsmPrinter &p, Operation *op, OperandRangeRange opBundleOperands, TypeRangeRange opBundleOperandTypes, - ArrayRef opBundleTags) { + std::optional opBundleTags) { + if (opBundleOperands.empty()) + return; + assert(opBundleTags && "expect operand bundle tags"); + p << "["; llvm::interleaveComma( - llvm::zip(opBundleOperands, opBundleOperandTypes, opBundleTags), p, + llvm::zip(opBundleOperands, opBundleOperandTypes, *opBundleTags), p, [&p](auto bundle) { + auto bundleTag = cast(std::get<2>(bundle)).getValue(); printOneOpBundle(p, std::get<0>(bundle), std::get<1>(bundle), - std::get<2>(bundle)); + bundleTag); }); p << "]"; } @@ -256,7 +261,7 @@ static ParseResult parseOneOpBundle( OpAsmParser &p, SmallVector> &opBundleOperands, SmallVector> &opBundleOperandTypes, - SmallVector &opBundleTags) { + SmallVector &opBundleTags) { SMLoc currentParserLoc = p.getCurrentLocation(); SmallVector operands; SmallVector types; @@ 
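Aside: the UnsignedWhenEquivalent change above swaps applyPartialConversion for the greedy rewrite driver and uses a rewriter listener to drop solver state for erased ops. The sketch below condenses that pairing into one helper; `SolverSyncListener` and `runWithSolver` are illustrative names, not MLIR APIs, though the calls they make all appear in the diff itself.

```cpp
#include "mlir/Analysis/DataFlowFramework.h"
#include "mlir/IR/PatternMatch.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"

using namespace mlir;

namespace {
// Erase solver state for anything the driver deletes so later pattern
// applications never consult dangling analysis results.
struct SolverSyncListener : RewriterBase::Listener {
  SolverSyncListener(DataFlowSolver &solver) : solver(solver) {}
  void notifyOperationErased(Operation *op) override {
    solver.eraseState(solver.getProgramPointAfter(op));
    for (Value result : op->getResults())
      solver.eraseState(result);
  }
  DataFlowSolver &solver;
};
} // namespace

// Run analysis-driven patterns under the greedy driver while keeping the
// solver coherent across op erasure.
LogicalResult runWithSolver(Operation *root, RewritePatternSet &&patterns,
                            DataFlowSolver &solver) {
  SolverSyncListener listener(solver);
  GreedyRewriteConfig config;
  config.listener = &listener;
  return applyPatternsAndFoldGreedily(root, std::move(patterns), config);
}
```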
diff --git a/mlir/lib/Dialect/Bufferization/Transforms/Bufferize.cpp b/mlir/lib/Dialect/Bufferization/Transforms/Bufferize.cpp
index 875d8c40e92cc1612e96a03c98ec21ee7148d882..1d009b03754c52b3b288bf26352632d30c8c6aca 100644
--- a/mlir/lib/Dialect/Bufferization/Transforms/Bufferize.cpp
+++ b/mlir/lib/Dialect/Bufferization/Transforms/Bufferize.cpp
@@ -224,6 +224,7 @@ struct OneShotBufferizePass
       };
     }
     opt.printConflicts = printConflicts;
+    opt.bufferAlignment = bufferAlignment;
     opt.testAnalysisOnly = testAnalysisOnly;
     opt.bufferizeFunctionBoundaries = bufferizeFunctionBoundaries;
     opt.checkParallelRegions = checkParallelRegions;
diff --git a/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp
index 12ed8cc88ae7b7c4d78b5c26eb911b0156520a4d..cc73878a64ff67eaf5912fe4a934f8e09af9f8b5 100644
--- a/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp
+++ b/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp
@@ -241,13 +241,18 @@ static void printOneOpBundle(OpAsmPrinter &p, OperandRange operands,
 static void printOpBundles(OpAsmPrinter &p, Operation *op,
                            OperandRangeRange opBundleOperands,
                            TypeRangeRange opBundleOperandTypes,
-                           ArrayRef<std::string> opBundleTags) {
+                           std::optional<ArrayAttr> opBundleTags) {
+  if (opBundleOperands.empty())
+    return;
+  assert(opBundleTags && "expect operand bundle tags");
+
   p << "[";
   llvm::interleaveComma(
-      llvm::zip(opBundleOperands, opBundleOperandTypes, opBundleTags), p,
+      llvm::zip(opBundleOperands, opBundleOperandTypes, *opBundleTags), p,
       [&p](auto bundle) {
+        auto bundleTag = cast<StringAttr>(std::get<2>(bundle)).getValue();
         printOneOpBundle(p, std::get<0>(bundle), std::get<1>(bundle),
-                         std::get<2>(bundle));
+                         bundleTag);
       });
   p << "]";
 }
@@ -256,7 +261,7 @@ static ParseResult parseOneOpBundle(
     OpAsmParser &p,
     SmallVector<SmallVector<OpAsmParser::UnresolvedOperand>> &opBundleOperands,
     SmallVector<SmallVector<Type>> &opBundleOperandTypes,
-    SmallVector<std::string> &opBundleTags) {
+    SmallVector<Attribute> &opBundleTags) {
   SMLoc currentParserLoc = p.getCurrentLocation();
   SmallVector<OpAsmParser::UnresolvedOperand> operands;
   SmallVector<Type> types;
@@ -276,7 +281,7 @@ static ParseResult parseOneOpBundle(
 
   opBundleOperands.push_back(std::move(operands));
   opBundleOperandTypes.push_back(std::move(types));
-  opBundleTags.push_back(std::move(tag));
+  opBundleTags.push_back(StringAttr::get(p.getContext(), tag));
 
   return success();
 }
@@ -285,16 +290,17 @@ static std::optional<ParseResult> parseOpBundles(
     OpAsmParser &p,
     SmallVector<SmallVector<OpAsmParser::UnresolvedOperand>> &opBundleOperands,
     SmallVector<SmallVector<Type>> &opBundleOperandTypes,
-    SmallVector<std::string> &opBundleTags) {
+    ArrayAttr &opBundleTags) {
   if (p.parseOptionalLSquare())
     return std::nullopt;
 
   if (succeeded(p.parseOptionalRSquare()))
     return success();
 
+  SmallVector<Attribute> opBundleTagAttrs;
   auto bundleParser = [&] {
     return parseOneOpBundle(p, opBundleOperands, opBundleOperandTypes,
-                            opBundleTags);
+                            opBundleTagAttrs);
   };
   if (p.parseCommaSeparatedList(bundleParser))
     return failure();
@@ -302,6 +308,8 @@ static std::optional<ParseResult> parseOpBundles(
   if (p.parseRSquare())
     return failure();
 
+  opBundleTags = ArrayAttr::get(p.getContext(), opBundleTagAttrs);
+
   return success();
 }
@@ -1039,7 +1047,7 @@ void CallOp::build(OpBuilder &builder, OperationState &state, TypeRange results,
         /*CConv=*/nullptr, /*TailCallKind=*/nullptr,
         /*memory_effects=*/nullptr,
         /*convergent=*/nullptr, /*no_unwind=*/nullptr, /*will_return=*/nullptr,
-        /*op_bundle_operands=*/{}, /*op_bundle_tags=*/std::nullopt,
+        /*op_bundle_operands=*/{}, /*op_bundle_tags=*/{},
        /*access_groups=*/nullptr, /*alias_scopes=*/nullptr,
         /*noalias_scopes=*/nullptr, /*tbaa=*/nullptr);
 }
@@ -1066,7 +1074,7 @@ void CallOp::build(OpBuilder &builder, OperationState &state,
         /*TailCallKind=*/nullptr, /*memory_effects=*/nullptr,
         /*convergent=*/nullptr, /*no_unwind=*/nullptr, /*will_return=*/nullptr,
-        /*op_bundle_operands=*/{}, /*op_bundle_tags=*/std::nullopt,
+        /*op_bundle_operands=*/{}, /*op_bundle_tags=*/{},
         /*access_groups=*/nullptr, /*alias_scopes=*/nullptr,
         /*noalias_scopes=*/nullptr, /*tbaa=*/nullptr);
 }
@@ -1079,7 +1087,7 @@ void CallOp::build(OpBuilder &builder, OperationState &state,
         /*fastmathFlags=*/nullptr, /*branch_weights=*/nullptr, /*CConv=*/nullptr,
         /*TailCallKind=*/nullptr, /*memory_effects=*/nullptr,
         /*convergent=*/nullptr, /*no_unwind=*/nullptr, /*will_return=*/nullptr,
-        /*op_bundle_operands=*/{}, /*op_bundle_tags=*/std::nullopt,
+        /*op_bundle_operands=*/{}, /*op_bundle_tags=*/{},
         /*access_groups=*/nullptr, /*alias_scopes=*/nullptr,
         /*noalias_scopes=*/nullptr, /*tbaa=*/nullptr);
 }
@@ -1092,7 +1100,7 @@ void CallOp::build(OpBuilder &builder, OperationState &state, LLVMFuncOp func,
         /*fastmathFlags=*/nullptr, /*branch_weights=*/nullptr, /*CConv=*/nullptr,
         /*TailCallKind=*/nullptr, /*memory_effects=*/nullptr,
         /*convergent=*/nullptr, /*no_unwind=*/nullptr, /*will_return=*/nullptr,
-        /*op_bundle_operands=*/{}, /*op_bundle_tags=*/std::nullopt,
+        /*op_bundle_operands=*/{}, /*op_bundle_tags=*/{},
         /*access_groups=*/nullptr, /*alias_scopes=*/nullptr,
         /*noalias_scopes=*/nullptr, /*tbaa=*/nullptr);
 }
@@ -1192,12 +1200,20 @@ LogicalResult verifyCallOpVarCalleeType(OpTy callOp) {
 template <typename OpType>
 static LogicalResult verifyOperandBundles(OpType &op) {
   OperandRangeRange opBundleOperands = op.getOpBundleOperands();
-  ArrayRef<std::string> opBundleTags = op.getOpBundleTags();
+  std::optional<ArrayAttr> opBundleTags = op.getOpBundleTags();
 
-  if (opBundleTags.size() != opBundleOperands.size())
+  auto isStringAttr = [](Attribute tagAttr) {
+    return isa<StringAttr>(tagAttr);
+  };
+  if (opBundleTags && !llvm::all_of(*opBundleTags, isStringAttr))
+    return op.emitError("operand bundle tag must be a StringAttr");
+
+  size_t numOpBundles = opBundleOperands.size();
+  size_t numOpBundleTags = opBundleTags ? opBundleTags->size() : 0;
+  if (numOpBundles != numOpBundleTags)
     return op.emitError("expected ")
-           << opBundleOperands.size()
-           << " operand bundle tags, but actually got " << opBundleTags.size();
+           << numOpBundles << " operand bundle tags, but actually got "
+           << numOpBundleTags;
 
   return success();
 }
@@ -1329,7 +1345,8 @@ void CallOp::print(OpAsmPrinter &p) {
                           {getCalleeAttrName(), getTailCallKindAttrName(),
                            getVarCalleeTypeAttrName(), getCConvAttrName(),
                            getOperandSegmentSizesAttrName(),
-                           getOpBundleSizesAttrName()});
+                           getOpBundleSizesAttrName(),
+                           getOpBundleTagsAttrName()});
 
   p << " : ";
   if (!isDirect)
@@ -1437,7 +1454,7 @@ ParseResult CallOp::parse(OpAsmParser &parser, OperationState &result) {
   SmallVector<OpAsmParser::UnresolvedOperand> operands;
   SmallVector<SmallVector<OpAsmParser::UnresolvedOperand>> opBundleOperands;
   SmallVector<SmallVector<Type>> opBundleOperandTypes;
-  SmallVector<std::string> opBundleTags;
+  ArrayAttr opBundleTags;
 
   // Default to C Calling Convention if no keyword is provided.
   result.addAttribute(
@@ -1483,9 +1500,9 @@ ParseResult CallOp::parse(OpAsmParser &parser, OperationState &result) {
           parser, opBundleOperands, opBundleOperandTypes, opBundleTags);
       result && failed(*result))
     return failure();
-  if (!opBundleTags.empty())
-    result.getOrAddProperties<CallOp::Properties>().op_bundle_tags =
-        std::move(opBundleTags);
+  if (opBundleTags && !opBundleTags.empty())
+    result.addAttribute(CallOp::getOpBundleTagsAttrName(result.name).getValue(),
+                        opBundleTags);
 
   if (parser.parseOptionalAttrDict(result.attributes))
     return failure();
@@ -1525,8 +1542,7 @@ void InvokeOp::build(OpBuilder &builder, OperationState &state, LLVMFuncOp func,
   auto calleeType = func.getFunctionType();
   build(builder, state, getCallOpResultTypes(calleeType),
         getCallOpVarCalleeType(calleeType), SymbolRefAttr::get(func), ops,
-        normalOps, unwindOps, nullptr, nullptr, {}, std::nullopt, normal,
-        unwind);
+        normalOps, unwindOps, nullptr, nullptr, {}, {}, normal, unwind);
 }
 
 void InvokeOp::build(OpBuilder &builder, OperationState &state, TypeRange tys,
@@ -1535,7 +1551,7 @@ void InvokeOp::build(OpBuilder &builder, OperationState &state, TypeRange tys,
                      ValueRange unwindOps) {
   build(builder, state, tys,
         /*var_callee_type=*/nullptr, callee, ops, normalOps, unwindOps, nullptr,
-        nullptr, {}, std::nullopt, normal, unwind);
+        nullptr, {}, {}, normal, unwind);
 }
 
 void InvokeOp::build(OpBuilder &builder, OperationState &state,
@@ -1544,7 +1560,7 @@ void InvokeOp::build(OpBuilder &builder, OperationState &state,
                      Block *unwind, ValueRange unwindOps) {
   build(builder, state, getCallOpResultTypes(calleeType),
         getCallOpVarCalleeType(calleeType), callee, ops, normalOps, unwindOps,
-        nullptr, nullptr, {}, std::nullopt, normal, unwind);
+        nullptr, nullptr, {}, {}, normal, unwind);
 }
 
 SuccessorOperands InvokeOp::getSuccessorOperands(unsigned index) {
@@ -1634,7 +1650,8 @@ void InvokeOp::print(OpAsmPrinter &p) {
   p.printOptionalAttrDict((*this)->getAttrs(),
                           {getCalleeAttrName(), getOperandSegmentSizeAttr(),
                            getCConvAttrName(), getVarCalleeTypeAttrName(),
-                           getOpBundleSizesAttrName()});
+                           getOpBundleSizesAttrName(),
+                           getOpBundleTagsAttrName()});
 
   p << " : ";
   if (!isDirect)
@@ -1657,7 +1674,7 @@ ParseResult InvokeOp::parse(OpAsmParser &parser, OperationState &result) {
   TypeAttr varCalleeType;
   SmallVector<SmallVector<OpAsmParser::UnresolvedOperand>> opBundleOperands;
   SmallVector<SmallVector<Type>> opBundleOperandTypes;
-  SmallVector<std::string> opBundleTags;
+  ArrayAttr opBundleTags;
   Block *normalDest, *unwindDest;
   SmallVector<OpAsmParser::UnresolvedOperand> normalOperands, unwindOperands;
   Builder &builder = parser.getBuilder();
@@ -1703,9 +1720,10 @@ ParseResult InvokeOp::parse(OpAsmParser &parser, OperationState &result) {
           parser, opBundleOperands, opBundleOperandTypes, opBundleTags);
       result && failed(*result))
     return failure();
-  if (!opBundleTags.empty())
-    result.getOrAddProperties<InvokeOp::Properties>().op_bundle_tags =
-        std::move(opBundleTags);
+  if (opBundleTags && !opBundleTags.empty())
+    result.addAttribute(
+        InvokeOp::getOpBundleTagsAttrName(result.name).getValue(),
+        opBundleTags);
 
   if (parser.parseOptionalAttrDict(result.attributes))
     return failure();
@@ -3333,7 +3351,7 @@ void CallIntrinsicOp::build(OpBuilder &builder, OperationState &state,
                             mlir::StringAttr intrin, mlir::ValueRange args) {
   build(builder, state, /*resultTypes=*/TypeRange{}, intrin, args,
         FastmathFlagsAttr{},
-        /*op_bundle_operands=*/{});
+        /*op_bundle_operands=*/{}, /*op_bundle_tags=*/{});
 }
 
 void CallIntrinsicOp::build(OpBuilder &builder, OperationState &state,
@@ -3341,14 +3359,14 @@ void CallIntrinsicOp::build(OpBuilder &builder, OperationState &state,
                             mlir::LLVM::FastmathFlagsAttr fastMathFlags) {
   build(builder, state, /*resultTypes=*/TypeRange{}, intrin, args,
         fastMathFlags,
-        /*op_bundle_operands=*/{});
+        /*op_bundle_operands=*/{}, /*op_bundle_tags=*/{});
 }
 
 void CallIntrinsicOp::build(OpBuilder &builder, OperationState &state,
                             mlir::Type resultType, mlir::StringAttr intrin,
                             mlir::ValueRange args) {
   build(builder, state, {resultType}, intrin, args, FastmathFlagsAttr{},
-        /*op_bundle_operands=*/{});
+        /*op_bundle_operands=*/{}, /*op_bundle_tags=*/{});
 }
 
 void CallIntrinsicOp::build(OpBuilder &builder, OperationState &state,
@@ -3356,7 +3374,7 @@ void CallIntrinsicOp::build(OpBuilder &builder, OperationState &state,
                             mlir::StringAttr intrin, mlir::ValueRange args,
                             mlir::LLVM::FastmathFlagsAttr fastMathFlags) {
   build(builder, state, resultTypes, intrin, args, fastMathFlags,
-        /*op_bundle_operands=*/{});
+        /*op_bundle_operands=*/{}, /*op_bundle_tags=*/{});
 }
 
 //===----------------------------------------------------------------------===//
@@ -3413,6 +3431,18 @@ void InlineAsmOp::getEffects(
   }
 }
 
+//===----------------------------------------------------------------------===//
+// AssumeOp (intrinsic)
+//===----------------------------------------------------------------------===//
+
+void LLVM::AssumeOp::build(OpBuilder &builder, OperationState &state,
+                           mlir::Value cond) {
+  return build(builder, state, cond, /*op_bundle_operands=*/{},
+               /*op_bundle_tags=*/{});
+}
+
+LogicalResult LLVM::AssumeOp::verify() { return verifyOperandBundles(*this); }
+
 //===----------------------------------------------------------------------===//
 // masked_gather (intrinsic)
 //===----------------------------------------------------------------------===//
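Aside: the verifier above enforces exactly one StringAttr tag per operand-bundle group. A standalone C++ analogue of that invariant (container and function names here are illustrative, not the dialect's API):

```cpp
#include <cassert>
#include <string>
#include <vector>

struct OpBundles {
  std::vector<std::vector<int>> operandGroups; // one group per bundle
  std::vector<std::string> tags;               // one tag per bundle
};

// Mirrors the "expected N operand bundle tags, but actually got M" check.
bool verifyBundles(const OpBundles &b) {
  return b.operandGroups.size() == b.tags.size();
}

int main() {
  OpBundles ok{{{1, 2}, {3}}, {"deopt", "funclet"}};
  OpBundles bad{{{1, 2}}, {}}; // one group, zero tags: rejected
  assert(verifyBundles(ok));
  assert(!verifyBundles(bad));
}
```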
diff --git a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp
index 8e7621754f76bff6c45b527b831f45a73e2fe3ac..ad72b5d7beccde739c58168c9e2c711a9eacfa17 100644
--- a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp
+++ b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp
@@ -1727,7 +1727,7 @@ transform::PackTransposeOp::apply(transform::TransformRewriter &rewriter,
 void transform::PadOp::build(OpBuilder &b, OperationState &result, Value target,
                              ArrayRef<int64_t> paddingDimensions,
                              ArrayRef<int64_t> padToMultipleOf,
-                             ArrayRef<int64_t> packPaddings,
+                             ArrayRef<int64_t> nofoldFlags,
                              ArrayRef<Attribute> transposePaddings,
                              StringRef copyBackOp) {
   auto resultType = transform::AnyOpType::get(b.getContext());
@@ -1742,7 +1742,7 @@ void transform::PadOp::build(OpBuilder &b, OperationState &result, Value target,
                 (padToMultipleOf.empty()
                      ? DenseI64ArrayAttr()
                      : b.getDenseI64ArrayAttr(padToMultipleOf)),
-                /*packPaddings=*/b.getI64ArrayAttr(packPaddings),
+                /*nofoldFlags=*/b.getI64ArrayAttr(nofoldFlags),
                 /*transposePaddings=*/b.getArrayAttr(transposePaddings),
                 /*copyBackOp=*/b.getStringAttr(copyBackOp));
 }
@@ -1750,7 +1750,7 @@ void transform::PadOp::build(OpBuilder &b, OperationState &result, Value target,
 void transform::PadOp::build(OpBuilder &b, OperationState &result, Value target,
                              ArrayRef<int64_t> paddingDimensions,
                              ArrayRef<OpFoldResult> mixedPadToMultipleOf,
-                             ArrayRef<int64_t> packPaddings,
+                             ArrayRef<int64_t> nofoldFlags,
                              ArrayRef<Attribute> transposePaddings,
                              StringRef copyBackOp) {
   auto resultType = transform::AnyOpType::get(b.getContext());
@@ -1766,7 +1766,7 @@ void transform::PadOp::build(OpBuilder &b, OperationState &result, Value target,
       /*paddingDimensions=*/b.getI64ArrayAttr(paddingDimensions),
       /*padToMultipleOf=*/dynamicPadToMultipleOf,
       /*padToMultipleOf=*/staticPadToMultipleOf,
-      /*packPaddings=*/b.getI64ArrayAttr(packPaddings),
+      /*nofoldFlags=*/b.getI64ArrayAttr(nofoldFlags),
       /*transposePaddings=*/b.getArrayAttr(transposePaddings),
       /*copyBackOp=*/b.getStringAttr(copyBackOp));
 }
@@ -1800,10 +1800,10 @@ transform::PadOp::apply(transform::TransformRewriter &rewriter,
   }
 
   // Convert the integer packing flags to booleans.
-  SmallVector<bool> packPaddings;
+  SmallVector<bool> nofoldFlags;
   for (int64_t packPadding :
-       extractFromIntegerArrayAttr<int64_t>(getPackPaddings()))
-    packPaddings.push_back(static_cast<bool>(packPadding));
+       extractFromIntegerArrayAttr<int64_t>(getNofoldFlags()))
+    nofoldFlags.push_back(static_cast<bool>(packPadding));
 
   // Convert the padding values to attributes.
   SmallVector<Attribute> paddingValues;
@@ -1861,7 +1861,7 @@ transform::PadOp::apply(transform::TransformRewriter &rewriter,
     options.padToMultipleOf = padToMultipleOf;
   options.paddingValues = paddingValues;
-  options.packPaddings = packPaddings;
+  options.nofoldFlags = nofoldFlags;
   if (getCopyBackOp() ==
       bufferization::MaterializeInDestinationOp::getOperationName()) {
     options.copyBackOp = LinalgPaddingOptions::CopyBackOp::
@@ -1907,14 +1907,14 @@ transform::PadOp::apply(transform::TransformRewriter &rewriter,
 }
 
 LogicalResult transform::PadOp::verify() {
-  SmallVector<int64_t> packPaddings =
-      extractFromIntegerArrayAttr<int64_t>(getPackPaddings());
-  if (any_of(packPaddings, [](int64_t packPadding) {
+  SmallVector<int64_t> nofoldFlags =
+      extractFromIntegerArrayAttr<int64_t>(getNofoldFlags());
+  if (any_of(nofoldFlags, [](int64_t packPadding) {
         return packPadding != 0 && packPadding != 1;
       })) {
     return emitOpError()
-           << "expects pack_paddings to contain booleans (0/1), found "
-           << getPackPaddings();
+           << "expects nofold_flags to contain booleans (0/1), found "
+           << getNofoldFlags();
   }
 
   SmallVector<int64_t> paddingDimensions =
diff --git a/mlir/lib/Dialect/Linalg/Transforms/Padding.cpp b/mlir/lib/Dialect/Linalg/Transforms/Padding.cpp
index a066c44408915e19de1f54dd3bdda106ad5e09ab..9a685f6dc96acc82171c29acf3b790ce407bbbd8 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/Padding.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/Padding.cpp
@@ -88,7 +88,7 @@ static LogicalResult computePaddedShape(linalg::LinalgOp opToPad,
 }
 
 /// Pad the `opOperand` in the "paddingDimensions" using the padding value and
-/// the nofold flag found in "paddingValues" and "packPaddings", respectively.
+/// the nofold flag found in "paddingValues" and "nofoldFlags", respectively.
 ///
 /// Exit early and return the `opOperand` value if it already has the requested
 /// shape. i.e.:
@@ -117,8 +117,8 @@ static FailureOr<Value> padOperandToSmallestStaticBoundingBox(
 
   // Return the unpadded operand if padding to a static shape is not needed and
   // if the nofold flag is not set.
-  bool nofold = opOperand->getOperandNumber() < options.packPaddings.size()
-                    ? options.packPaddings[opOperand->getOperandNumber()]
+  bool nofold = opOperand->getOperandNumber() < options.nofoldFlags.size()
+                    ? bool(options.nofoldFlags[opOperand->getOperandNumber()])
                     : false;
   if (!nofold && alreadyHasRequestedShape)
     return opOperand->get();
diff --git a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
index 09c6b2683b438805d0196597fdca1d7028875e8e..635273bcbc0208508d095095907242b90ee83517 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
@@ -840,11 +840,11 @@ enum VectorMemoryAccessKind { ScalarBroadcast, Contiguous, Gather };
 /// TODO: Statically shaped loops + vector masking
 static uint64_t getTrailingNonUnitLoopDimIdx(LinalgOp linalgOp) {
   SmallVector<int64_t> loopRanges = linalgOp.getStaticLoopRanges();
-  assert(linalgOp.hasDynamicShape() ||
-         llvm::count_if(loopRanges, [](int64_t dim) { return dim != 1; }) ==
-                 1 &&
-             "For statically shaped Linalg Ops, only one "
-             "non-unit loop dim is expected");
+  assert(
+      (linalgOp.hasDynamicShape() ||
+       llvm::count_if(loopRanges, [](int64_t dim) { return dim != 1; }) == 1) &&
+      "For statically shaped Linalg Ops, only one "
+      "non-unit loop dim is expected");
 
   size_t idx = loopRanges.size() - 1;
   for (; idx >= 0; idx--)
diff --git a/mlir/lib/Dialect/MLProgram/Transforms/PipelineGlobalOps.cpp b/mlir/lib/Dialect/MLProgram/Transforms/PipelineGlobalOps.cpp
index 40c83487fd47d5eb7de036cc0289dff08f7ac20d..27e89d69e214d67350a2cdfc83e34f7fa8e86b87 100644
--- a/mlir/lib/Dialect/MLProgram/Transforms/PipelineGlobalOps.cpp
+++ b/mlir/lib/Dialect/MLProgram/Transforms/PipelineGlobalOps.cpp
@@ -148,8 +148,9 @@ void MLProgramPipelineGlobals::processBlock(
     if (auto store = mlir::dyn_cast<GlobalStoreOp>(op)) {
       auto ref = store.getGlobal();
       symbolStore.insert(ref);
-      if (previousStores.contains(ref)) {
-        toDelete.push_back(previousStores.find(ref)->getSecond());
+      auto it = previousStores.find(ref);
+      if (it != previousStores.end()) {
+        toDelete.push_back(it->getSecond());
       }
 
       previousLoads[ref] = store.getValue();
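Aside: the PipelineGlobalOps hunk replaces a contains()/find() pair with a single find(), saving a duplicate hash lookup. A minimal standalone illustration of the same idiom:

```cpp
#include <cassert>
#include <string>
#include <unordered_map>
#include <vector>

int main() {
  std::unordered_map<std::string, int> previousStores = {{"g", 7}};
  std::vector<int> toDelete;

  // One lookup instead of contains() followed by find().
  if (auto it = previousStores.find("g"); it != previousStores.end())
    toDelete.push_back(it->second);

  assert(toDelete.size() == 1 && toDelete[0] == 7);
}
```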
diff --git a/mlir/lib/Dialect/MemRef/Transforms/EmulateNarrowType.cpp b/mlir/lib/Dialect/MemRef/Transforms/EmulateNarrowType.cpp
index 9efea066a03c854801de66a8e7a25a6da7058b68..28f9061d9873b7272ff2465dace7e4024ce4e8d1 100644
--- a/mlir/lib/Dialect/MemRef/Transforms/EmulateNarrowType.cpp
+++ b/mlir/lib/Dialect/MemRef/Transforms/EmulateNarrowType.cpp
@@ -169,8 +169,9 @@ struct ConvertMemRefAllocation final : OpConversionPattern<OpTy> {
                       std::is_same<OpTy, memref::AllocaOp>(),
                   "expected only memref::AllocOp or memref::AllocaOp");
     auto currentType = cast<MemRefType>(op.getMemref().getType());
-    auto newResultType = dyn_cast<MemRefType>(
-        this->getTypeConverter()->convertType(op.getType()));
+    auto newResultType =
+        this->getTypeConverter()->template convertType<MemRefType>(
+            op.getType());
     if (!newResultType) {
       return rewriter.notifyMatchFailure(
           op->getLoc(),
@@ -378,7 +379,7 @@ struct ConvertMemRefReinterpretCast final
   matchAndRewrite(memref::ReinterpretCastOp op, OpAdaptor adaptor,
                   ConversionPatternRewriter &rewriter) const override {
     MemRefType newTy =
-        dyn_cast<MemRefType>(getTypeConverter()->convertType(op.getType()));
+        getTypeConverter()->convertType<MemRefType>(op.getType());
     if (!newTy) {
       return rewriter.notifyMatchFailure(
           op->getLoc(),
@@ -466,8 +467,8 @@ struct ConvertMemRefSubview final : OpConversionPattern<memref::SubViewOp> {
   LogicalResult
   matchAndRewrite(memref::SubViewOp subViewOp, OpAdaptor adaptor,
                   ConversionPatternRewriter &rewriter) const override {
-    MemRefType newTy = dyn_cast<MemRefType>(
-        getTypeConverter()->convertType(subViewOp.getType()));
+    MemRefType newTy =
+        getTypeConverter()->convertType<MemRefType>(subViewOp.getType());
     if (!newTy) {
       return rewriter.notifyMatchFailure(
           subViewOp->getLoc(),
@@ -632,14 +633,14 @@ void memref::populateMemRefNarrowTypeEmulationConversions(
         SmallVector<int64_t> strides;
         int64_t offset;
         if (failed(getStridesAndOffset(ty, strides, offset)))
-          return std::nullopt;
+          return nullptr;
         if (!strides.empty() && strides.back() != 1)
-          return std::nullopt;
+          return nullptr;
 
         auto newElemTy = IntegerType::get(ty.getContext(), loadStoreWidth,
                                           intTy.getSignedness());
         if (!newElemTy)
-          return std::nullopt;
+          return nullptr;
 
         StridedLayoutAttr layoutAttr;
         // If the offset is 0, we do not need a strided layout as the stride is
diff --git a/mlir/lib/Dialect/MemRef/Transforms/EmulateWideInt.cpp b/mlir/lib/Dialect/MemRef/Transforms/EmulateWideInt.cpp
index bc4535f97acf04bc4ba50b731cdc2c3a8b1bced6..49b71625291db94075fb80574ee86a0fdd89ef63 100644
--- a/mlir/lib/Dialect/MemRef/Transforms/EmulateWideInt.cpp
+++ b/mlir/lib/Dialect/MemRef/Transforms/EmulateWideInt.cpp
@@ -159,7 +159,7 @@ void memref::populateMemRefWideIntEmulationConversions(
 
         Type newElemTy = typeConverter.convertType(intTy);
         if (!newElemTy)
-          return std::nullopt;
+          return nullptr;
 
         return ty.cloneWith(std::nullopt, newElemTy);
       });
diff --git a/mlir/lib/Dialect/MemRef/Transforms/ResolveShapedTypeResultDims.cpp b/mlir/lib/Dialect/MemRef/Transforms/ResolveShapedTypeResultDims.cpp
index fb2921fec9f79d055b3718ba8d71b14cf94cbe86..792e7229183064ea1f59f753b872146ed6e82033 100644
--- a/mlir/lib/Dialect/MemRef/Transforms/ResolveShapedTypeResultDims.cpp
+++ b/mlir/lib/Dialect/MemRef/Transforms/ResolveShapedTypeResultDims.cpp
@@ -18,6 +18,7 @@
 #include "mlir/Dialect/Arith/Utils/Utils.h"
 #include "mlir/Dialect/MemRef/IR/MemRef.h"
 #include "mlir/Dialect/MemRef/Transforms/Transforms.h"
+#include "mlir/Dialect/SCF/IR/SCF.h"
 #include "mlir/Dialect/Tensor/IR/Tensor.h"
 #include "mlir/Interfaces/InferTypeOpInterface.h"
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
@@ -131,11 +132,25 @@ struct IterArgsToInitArgs : public OpRewritePattern<tensor::DimOp> {
     auto blockArg = dyn_cast<BlockArgument>(dimOp.getSource());
     if (!blockArg)
       return failure();
-    auto loopLikeOp =
-        dyn_cast<LoopLikeOpInterface>(blockArg.getParentBlock()->getParentOp());
-    if (!loopLikeOp)
+    // TODO: Enable this for loopLikeInterface. Restricting for scf.for
+    // because the init args shape might change in the loop body.
+    // For e.g.:
+    // ```
+    // %0 = tensor.empty(%c1) : tensor<?xf32>
+    // %r = scf.for %iv = %c0 to %c10 step %c1 iter_args(%arg0 = %0) ->
+    // tensor<?xf32> {
+    //   %1 = tensor.dim %arg0, %c0 : tensor<?xf32>
+    //   %2 = arith.addi %c1, %1 : index
+    //   %3 = tensor.empty(%2) : tensor<?xf32>
+    //   scf.yield %3 : tensor<?xf32>
+    // }
+    //
+    // ```
+    auto forAllOp =
+        dyn_cast<scf::ForallOp>(blockArg.getParentBlock()->getParentOp());
+    if (!forAllOp)
       return failure();
-    Value initArg = loopLikeOp.getTiedLoopInit(blockArg)->get();
+    Value initArg = forAllOp.getTiedLoopInit(blockArg)->get();
     rewriter.modifyOpInPlace(
         dimOp, [&]() { dimOp.getSourceMutable().assign(initArg); });
     return success();
diff --git a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp
index c6c6edb8f999fc724b183184aa0dae33a185cbb5..e1df647d6a3c717798b4637c075431432088aed0 100644
--- a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp
+++ b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp
@@ -1760,6 +1760,18 @@ static LogicalResult verifyPrivateVarList(OpType &op) {
 }
 
 LogicalResult ParallelOp::verify() {
+  if (getAllocateVars().size() != getAllocatorVars().size())
+    return emitError(
+        "expected equal sizes for allocate and allocator variables");
+
+  if (failed(verifyPrivateVarList(*this)))
+    return failure();
+
+  return verifyReductionVarList(*this, getReductionSyms(), getReductionVars(),
+                                getReductionByref());
+}
+
+LogicalResult ParallelOp::verifyRegions() {
   auto distributeChildOps = getOps<DistributeOp>();
   if (!distributeChildOps.empty()) {
     if (!isComposite())
@@ -1780,16 +1792,7 @@ LogicalResult ParallelOp::verify() {
       return emitError()
              << "'omp.composite' attribute present in non-composite operation";
   }
-
-  if (getAllocateVars().size() != getAllocatorVars().size())
-    return emitError(
-        "expected equal sizes for allocate and allocator variables");
-
-  if (failed(verifyPrivateVarList(*this)))
-    return failure();
-
-  return verifyReductionVarList(*this, getReductionSyms(), getReductionVars(),
-                                getReductionByref());
+  return success();
 }
 
 //===----------------------------------------------------------------------===//
@@ -1979,6 +1982,11 @@ void WsloopOp::build(OpBuilder &builder, OperationState &state,
 }
 
 LogicalResult WsloopOp::verify() {
+  return verifyReductionVarList(*this, getReductionSyms(), getReductionVars(),
+                                getReductionByref());
+}
+
+LogicalResult WsloopOp::verifyRegions() {
   bool isCompositeChildLeaf =
       llvm::dyn_cast_if_present<LoopWrapperInterface>((*this)->getParentOp());
 
@@ -2000,8 +2008,7 @@ LogicalResult WsloopOp::verify() {
            << "'omp.composite' attribute missing from composite wrapper";
   }
 
-  return verifyReductionVarList(*this, getReductionSyms(), getReductionVars(),
-                                getReductionByref());
+  return success();
 }
 
 //===----------------------------------------------------------------------===//
@@ -2012,14 +2019,16 @@ void SimdOp::build(OpBuilder &builder, OperationState &state,
                    const SimdOperands &clauses) {
   MLIRContext *ctx = builder.getContext();
   // TODO Store clauses in op: linearVars, linearStepVars, privateVars,
-  // privateSyms, reductionVars, reductionByref, reductionSyms.
+  // privateSyms.
   SimdOp::build(builder, state, clauses.alignedVars,
                 makeArrayAttr(ctx, clauses.alignments), clauses.ifExpr,
                 /*linear_vars=*/{}, /*linear_step_vars=*/{},
                 clauses.nontemporalVars, clauses.order, clauses.orderMod,
                 /*private_vars=*/{}, /*private_syms=*/nullptr,
-                /*reduction_vars=*/{}, /*reduction_byref=*/nullptr,
-                /*reduction_syms=*/nullptr, clauses.safelen, clauses.simdlen);
+                clauses.reductionVars,
+                makeDenseBoolArrayAttr(ctx, clauses.reductionByref),
+                makeArrayAttr(ctx, clauses.reductionSyms), clauses.safelen,
+                clauses.simdlen);
 }
 
 LogicalResult SimdOp::verify() {
@@ -2035,9 +2044,6 @@ LogicalResult SimdOp::verify() {
   if (verifyNontemporalClause(*this, getNontemporalVars()).failed())
     return failure();
 
-  if (getNestedWrapper())
-    return emitOpError() << "must wrap an 'omp.loop_nest' directly";
-
   bool isCompositeChildLeaf =
       llvm::dyn_cast_if_present<LoopWrapperInterface>((*this)->getParentOp());
 
@@ -2052,6 +2058,13 @@ LogicalResult SimdOp::verify() {
   return success();
 }
 
+LogicalResult SimdOp::verifyRegions() {
+  if (getNestedWrapper())
+    return emitOpError() << "must wrap an 'omp.loop_nest' directly";
+
+  return success();
+}
+
 //===----------------------------------------------------------------------===//
 // Distribute construct [2.9.4.1]
 //===----------------------------------------------------------------------===//
@@ -2074,6 +2087,10 @@ LogicalResult DistributeOp::verify() {
     return emitError(
         "expected equal sizes for allocate and allocator variables");
 
+  return success();
+}
+
+LogicalResult DistributeOp::verifyRegions() {
   if (LoopWrapperInterface nested = getNestedWrapper()) {
     if (!isComposite())
       return emitError()
@@ -2279,6 +2296,10 @@ LogicalResult TaskloopOp::verify() {
                        "may not appear on the same taskloop directive");
   }
 
+  return success();
+}
+
+LogicalResult TaskloopOp::verifyRegions() {
   if (LoopWrapperInterface nested = getNestedWrapper()) {
     if (!isComposite())
       return emitError()
@@ -2723,7 +2744,7 @@ void PrivateClauseOp::build(OpBuilder &odsBuilder, OperationState &odsState,
                             DataSharingClauseType::Private));
 }
 
-LogicalResult PrivateClauseOp::verify() {
+LogicalResult PrivateClauseOp::verifyRegions() {
   Type symType = getType();
 
   auto verifyTerminator = [&](Operation *terminator,
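Aside: the OpenMPDialect hunks above repeatedly move checks from verify() into verifyRegions(). A standalone C++ model of the assumed two-phase contract (MLIR runs verify() before an op's regions are verified and verifyRegions() after, so only the second phase may rely on nested ops being well-formed); the Node type and field names are purely illustrative:

```cpp
#include <cassert>
#include <vector>

struct Node {
  bool selfOk;         // local invariants (operands, attributes)
  bool mustWrapChild;  // structural requirement on the region
  std::vector<Node> children;
};

// Phase 1: local invariants only.
bool verifySelf(const Node &n) { return n.selfOk; }

// Phase 2: structural invariants over children, which are known-valid here.
bool verifyRegions(const Node &n) {
  return !n.mustWrapChild || !n.children.empty();
}

bool verifyTree(const Node &n) {
  if (!verifySelf(n))
    return false;
  for (const Node &c : n.children)
    if (!verifyTree(c)) // children fully verified first...
      return false;
  return verifyRegions(n); // ...then structural checks may rely on them
}

int main() {
  Node wrapper{true, true, {Node{true, false, {}}}};
  assert(verifyTree(wrapper));
  assert(!verifyTree(Node{true, true, {}})); // empty wrapper rejected late
}
```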
diff --git a/mlir/lib/Dialect/SCF/Transforms/LoopPipelining.cpp b/mlir/lib/Dialect/SCF/Transforms/LoopPipelining.cpp
index 83c9cf69ba03647173fd0a627a1316f905c1f491..1b458f410af6014e685ce56f76ca30263a18301b 100644
--- a/mlir/lib/Dialect/SCF/Transforms/LoopPipelining.cpp
+++ b/mlir/lib/Dialect/SCF/Transforms/LoopPipelining.cpp
@@ -642,22 +642,25 @@ LogicalResult LoopPipelinerInternal::emitEpilogue(
     RewriterBase &rewriter, llvm::SmallVector<Value> &returnValues) {
   Location loc = forOp.getLoc();
+  Type t = lb.getType();
+
   // Emit different versions of the induction variable. They will be
   // removed by dead code if not used.
-  // bounds_range = ub - lb
-  // total_iterations = (bounds_range + step - 1) / step
-  Type t = lb.getType();
-  Value zero =
-      rewriter.create<arith::ConstantOp>(loc, rewriter.getIntegerAttr(t, 0));
-  Value one =
-      rewriter.create<arith::ConstantOp>(loc, rewriter.getIntegerAttr(t, 1));
-  Value minusOne =
-      rewriter.create<arith::ConstantOp>(loc, rewriter.getIntegerAttr(t, -1));
+  auto createConst = [&](int v) {
+    return rewriter.create<arith::ConstantOp>(loc,
+                                              rewriter.getIntegerAttr(t, v));
+  };
+
+  // total_iterations = cdiv(range_diff, step);
+  // - range_diff = ub - lb
+  // - total_iterations = (range_diff + step + (step < 0 ? 1 : -1)) / step
+  Value zero = createConst(0);
+  Value one = createConst(1);
   Value stepLessZero = rewriter.create<arith::CmpIOp>(
       loc, arith::CmpIPredicate::slt, step, zero);
   Value stepDecr =
-      rewriter.create<arith::SelectOp>(loc, stepLessZero, one, minusOne);
+      rewriter.create<arith::SelectOp>(loc, stepLessZero, one, createConst(-1));
 
   Value rangeDiff = rewriter.create<arith::SubIOp>(loc, ub, lb);
   Value rangeIncrStep = rewriter.create<arith::AddIOp>(loc, rangeDiff, step);
   Value rangeDecr =
@@ -665,25 +668,31 @@ LoopPipelinerInternal::emitEpilogue(
       rewriter.create<arith::AddIOp>(loc, rangeIncrStep, stepDecr);
   Value totalIterations = rewriter.create<arith::DivSIOp>(loc, rangeDecr, step);
 
+  // If total_iters < max_stage, start the epilogue at zero to match the
+  // ramp-up in the prologue.
+  // start_iter = max(0, total_iters - max_stage)
+  Value iterI = rewriter.create<arith::SubIOp>(loc, totalIterations,
+                                               createConst(maxStage));
+  iterI = rewriter.create<arith::MaxSIOp>(loc, zero, iterI);
+
+  // Capture predicates for dynamic loops.
   SmallVector<Value> predicates(maxStage + 1);
-  for (int64_t i = 0; i < maxStage; i++) {
-    // iterI = total_iters - 1 - i
-    // May go negative...
-    Value minusI =
-        rewriter.create<arith::ConstantOp>(loc, rewriter.getIntegerAttr(t, -i));
-    Value iterI = rewriter.create<arith::AddIOp>(
-        loc, rewriter.create<arith::AddIOp>(loc, totalIterations, minusOne),
-        minusI);
+
+  for (int64_t i = 1; i <= maxStage; i++) {
     // newLastIter = lb + step * iterI
     Value newlastIter = rewriter.create<arith::AddIOp>(
         loc, lb, rewriter.create<arith::MulIOp>(loc, step, iterI));
 
-    setValueMapping(forOp.getInductionVar(), newlastIter, maxStage - i);
+    setValueMapping(forOp.getInductionVar(), newlastIter, i);
+
+    // increment to next iterI
+    iterI = rewriter.create<arith::AddIOp>(loc, iterI, one);
 
     if (dynamicLoop) {
-      // pred = iterI >= 0
-      predicates[i + 1] = rewriter.create<arith::CmpIOp>(
-          loc, arith::CmpIPredicate::sge, iterI, zero);
+      // Disable stages when `i` is greater than total_iters.
+      // pred = total_iters >= i
+      predicates[i] = rewriter.create<arith::CmpIOp>(
+          loc, arith::CmpIPredicate::sge, totalIterations, createConst(i));
     }
   }
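Aside: the rewritten epilogue computes the trip count with a sign-aware ceiling division so that negative steps are handled too. A standalone check of that formula on a few sample bounds:

```cpp
#include <cassert>

// total_iterations = (ub - lb + step + (step < 0 ? 1 : -1)) / step
long long totalIterations(long long lb, long long ub, long long step) {
  long long adjust = step < 0 ? 1 : -1;
  return (ub - lb + step + adjust) / step;
}

int main() {
  assert(totalIterations(0, 10, 1) == 10);
  assert(totalIterations(0, 10, 3) == 4);  // 0, 3, 6, 9
  assert(totalIterations(10, 0, -3) == 4); // 10, 7, 4, 1
  assert(totalIterations(0, 0, 1) == 0);   // empty range
}
```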
diff --git a/mlir/lib/Dialect/Tensor/Utils/Utils.cpp b/mlir/lib/Dialect/Tensor/Utils/Utils.cpp
index e0b91f323b0e64f3e5c7281a770cd713c77c7690..5c16e538ac242048bdd90a1b8b65e3af95df6f09 100644
--- a/mlir/lib/Dialect/Tensor/Utils/Utils.cpp
+++ b/mlir/lib/Dialect/Tensor/Utils/Utils.cpp
@@ -27,9 +27,9 @@ PadOp mlir::tensor::createPadHighOp(RankedTensorType resType, Value source,
                                     OpBuilder &b,
                                     SmallVector<Value> dynOutDims) {
-  assert((resType.getNumDynamicDims() == dynOutDims.size()) ||
-         dynOutDims.empty() &&
-             "Either none or all output dynamic dims must be specified!");
+  assert(((resType.getNumDynamicDims() == dynOutDims.size()) ||
+          dynOutDims.empty()) &&
+         "Either none or all output dynamic dims must be specified!");
 
   // Init "low" and "high" padding values ("low" is kept as is, "high" is
   // computed below).
diff --git a/mlir/lib/IR/Builders.cpp b/mlir/lib/IR/Builders.cpp
index 5359432a04f3e572db76709e4cf16b467536dda6..5397fbabc5c95eafb84ac6df03ee123ea6972a95 100644
--- a/mlir/lib/IR/Builders.cpp
+++ b/mlir/lib/IR/Builders.cpp
@@ -269,6 +269,7 @@ IntegerAttr Builder::getIntegerAttr(Type type, int64_t value) {
   if (type.isIndex())
     return IntegerAttr::get(type, APInt(64, value));
   // TODO: Avoid implicit trunc?
+  // See https://github.com/llvm/llvm-project/issues/112510.
   return IntegerAttr::get(type, APInt(type.getIntOrFloatBitWidth(), value,
                                       type.isSignedInteger(),
                                       /*implicitTrunc=*/true));
diff --git a/mlir/lib/IR/MLIRContext.cpp b/mlir/lib/IR/MLIRContext.cpp
index f05666fcde207b82fdbc9559df8a9649f696c1d0..d33340f4aefc85efbe14fa51c3e7d6ccf70750ea 100644
--- a/mlir/lib/IR/MLIRContext.cpp
+++ b/mlir/lib/IR/MLIRContext.cpp
@@ -711,6 +711,30 @@ ArrayRef<RegisteredOperationName> MLIRContext::getRegisteredOperations() {
   return impl->sortedRegisteredOperations;
 }
 
+/// Return information for registered operations by dialect.
+ArrayRef<RegisteredOperationName>
+MLIRContext::getRegisteredOperationsByDialect(StringRef dialectName) {
+  auto lowerBound =
+      std::lower_bound(impl->sortedRegisteredOperations.begin(),
+                       impl->sortedRegisteredOperations.end(), dialectName,
+                       [](auto &lhs, auto &rhs) {
+                         return lhs.getDialect().getNamespace().compare(rhs) < 0;
+                       });
+
+  if (lowerBound == impl->sortedRegisteredOperations.end() ||
+      lowerBound->getDialect().getNamespace() != dialectName)
+    return ArrayRef<RegisteredOperationName>();
+
+  auto upperBound =
+      std::upper_bound(lowerBound, impl->sortedRegisteredOperations.end(),
+                       dialectName, [](auto &lhs, auto &rhs) {
+                         return lhs.compare(rhs.getDialect().getNamespace()) < 0;
+                       });
+
+  size_t count = std::distance(lowerBound, upperBound);
+  return ArrayRef<RegisteredOperationName>(&*lowerBound, count);
+}
+
 bool MLIRContext::isOperationRegistered(StringRef name) {
   return RegisteredOperationName::lookup(name, this).has_value();
 }
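Aside: getRegisteredOperationsByDialect relies on the registry being sorted by full op name ("dialect.opname"), so all ops of one dialect form a contiguous range that two binary searches can delimit. A standalone sketch of the same idea over plain strings:

```cpp
#include <algorithm>
#include <cassert>
#include <iterator>
#include <string>
#include <vector>

int main() {
  // Sorted the way sortedRegisteredOperations is (by full op name).
  std::vector<std::string> ops = {"arith.addi", "arith.subi", "func.call",
                                  "func.return", "scf.for"};
  auto dialectOf = [](const std::string &op) {
    return op.substr(0, op.find('.'));
  };

  std::string dialect = "func";
  auto lo = std::lower_bound(ops.begin(), ops.end(), dialect,
                             [&](const std::string &op, const std::string &d) {
                               return dialectOf(op) < d;
                             });
  auto hi = std::upper_bound(lo, ops.end(), dialect,
                             [&](const std::string &d, const std::string &op) {
                               return d < dialectOf(op);
                             });
  assert(std::distance(lo, hi) == 2); // func.call, func.return
}
```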
)"; + emitRemark(state.location, "location of op"); + llvm::report_fatal_error(llvm::StringRef(buffer)); +} diff --git a/mlir/lib/Target/LLVMIR/Dialect/LLVMIR/LLVMIRToLLVMTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/LLVMIR/LLVMIRToLLVMTranslation.cpp index d034e576dfc579875837fe4557fcb262e1ed8fb7..4fd043c7c93e687552bb2201f7e990bb826a0d65 100644 --- a/mlir/lib/Target/LLVMIR/Dialect/LLVMIR/LLVMIRToLLVMTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/Dialect/LLVMIR/LLVMIRToLLVMTranslation.cpp @@ -68,6 +68,12 @@ static LogicalResult convertIntrinsicImpl(OpBuilder &odsBuilder, if (isConvertibleIntrinsic(intrinsicID)) { SmallVector args(inst->args()); ArrayRef llvmOperands(args); + + SmallVector llvmOpBundles; + llvmOpBundles.reserve(inst->getNumOperandBundles()); + for (unsigned i = 0; i < inst->getNumOperandBundles(); ++i) + llvmOpBundles.push_back(inst->getOperandBundleAt(i)); + #include "mlir/Dialect/LLVMIR/LLVMIntrinsicFromLLVMIRConversions.inc" } diff --git a/mlir/lib/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.cpp index a8595d14ccf2e5104c8a1e46942ee54d2e749cf8..2084e527773ca82dd95790dc5f450fe8be0fcaf7 100644 --- a/mlir/lib/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.cpp @@ -114,17 +114,27 @@ convertOperandBundle(OperandRange bundleOperands, StringRef bundleTag, } static SmallVector -convertOperandBundles(OperandRangeRange bundleOperands, - ArrayRef bundleTags, +convertOperandBundles(OperandRangeRange bundleOperands, ArrayAttr bundleTags, LLVM::ModuleTranslation &moduleTranslation) { SmallVector bundles; bundles.reserve(bundleOperands.size()); - for (auto [operands, tag] : llvm::zip_equal(bundleOperands, bundleTags)) + for (auto [operands, tagAttr] : llvm::zip_equal(bundleOperands, bundleTags)) { + StringRef tag = cast(tagAttr).getValue(); bundles.push_back(convertOperandBundle(operands, tag, moduleTranslation)); + } return bundles; } +static SmallVector +convertOperandBundles(OperandRangeRange bundleOperands, + std::optional bundleTags, + LLVM::ModuleTranslation &moduleTranslation) { + if (!bundleTags) + return {}; + return convertOperandBundles(bundleOperands, *bundleTags, moduleTranslation); +} + /// Builder for LLVM_CallIntrinsicOp static LogicalResult convertCallLLVMIntrinsicOp(CallIntrinsicOp op, llvm::IRBuilderBase &builder, diff --git a/mlir/lib/Target/LLVMIR/Dialect/NVVM/LLVMIRToNVVMTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/NVVM/LLVMIRToNVVMTranslation.cpp index bc830a77f3c580f693b2b1b47ff55c32f1ac8dd6..2c0b665ad0d8339917143b657940c81bc31b5c11 100644 --- a/mlir/lib/Target/LLVMIR/Dialect/NVVM/LLVMIRToNVVMTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/Dialect/NVVM/LLVMIRToNVVMTranslation.cpp @@ -50,6 +50,12 @@ static LogicalResult convertIntrinsicImpl(OpBuilder &odsBuilder, if (isConvertibleIntrinsic(intrinsicID)) { SmallVector args(inst->args()); ArrayRef llvmOperands(args); + + SmallVector llvmOpBundles; + llvmOpBundles.reserve(inst->getNumOperandBundles()); + for (unsigned i = 0; i < inst->getNumOperandBundles(); ++i) + llvmOpBundles.push_back(inst->getOperandBundleAt(i)); + #include "mlir/Dialect/LLVMIR/NVVMFromLLVMIRConversions.inc" } diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp index 4a575f4e57706263e22d6f11016e07d83263d46e..7c45e89cd8ac4ba03c39486cbb9c4ca0313a0897 100644 --- 
a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp @@ -371,20 +371,46 @@ convertOmpCritical(Operation &opInst, llvm::IRBuilderBase &builder, return success(); } -/// Populates `reductions` with reduction declarations used in the given loop. +/// Looks up from the operation from and returns the PrivateClauseOp with +/// name symbolName +static omp::PrivateClauseOp findPrivatizer(Operation *from, + SymbolRefAttr symbolName) { + omp::PrivateClauseOp privatizer = + SymbolTable::lookupNearestSymbolFrom(from, + symbolName); + assert(privatizer && "privatizer not found in the symbol table"); + return privatizer; +} + +/// Populates `privatizations` with privatization declarations used for the +/// given op. +/// TODO: generalise beyond ParallelOp +static void collectPrivatizationDecls( + omp::ParallelOp op, SmallVectorImpl &privatizations) { + std::optional attr = op.getPrivateSyms(); + if (!attr) + return; + + privatizations.reserve(privatizations.size() + attr->size()); + for (auto symbolRef : attr->getAsRange()) { + privatizations.push_back(findPrivatizer(op, symbolRef)); + } +} + +/// Populates `reductions` with reduction declarations used in the given op. template static void -collectReductionDecls(T loop, +collectReductionDecls(T op, SmallVectorImpl &reductions) { - std::optional attr = loop.getReductionSyms(); + std::optional attr = op.getReductionSyms(); if (!attr) return; - reductions.reserve(reductions.size() + loop.getNumReductionVars()); + reductions.reserve(reductions.size() + op.getNumReductionVars()); for (auto symbolRef : attr->getAsRange()) { reductions.push_back( SymbolTable::lookupNearestSymbolFrom( - loop, symbolRef)); + op, symbolRef)); } } @@ -609,7 +635,7 @@ static LogicalResult allocReductionVars(T loop, ArrayRef reductionArgs, llvm::IRBuilderBase &builder, LLVM::ModuleTranslation &moduleTranslation, - llvm::OpenMPIRBuilder::InsertPointTy &allocaIP, + const llvm::OpenMPIRBuilder::InsertPointTy &allocaIP, SmallVectorImpl &reductionDecls, SmallVectorImpl &privateReductionVariables, DenseMap &reductionVariableMap, @@ -1317,76 +1343,11 @@ convertOmpWsloop(Operation &opInst, llvm::IRBuilderBase &builder, privateReductionVariables, isByRef); } -/// A RAII class that on construction replaces the region arguments of the -/// parallel op (which correspond to private variables) with the actual private -/// variables they correspond to. This prepares the parallel op so that it -/// matches what is expected by the OMPIRBuilder. -/// -/// On destruction, it restores the original state of the operation so that on -/// the MLIR side, the op is not affected by conversion to LLVM IR. 
-class OmpParallelOpConversionManager { -public: - OmpParallelOpConversionManager(omp::ParallelOp opInst) - : region(opInst.getRegion()), - privateBlockArgs(cast(*opInst) - .getPrivateBlockArgs()), - privateVars(opInst.getPrivateVars()) { - for (auto [blockArg, var] : llvm::zip_equal(privateBlockArgs, privateVars)) - mlir::replaceAllUsesInRegionWith(blockArg, var, region); - } - - ~OmpParallelOpConversionManager() { - for (auto [blockArg, var] : llvm::zip_equal(privateBlockArgs, privateVars)) - mlir::replaceAllUsesInRegionWith(var, blockArg, region); - } - -private: - Region ®ion; - llvm::MutableArrayRef privateBlockArgs; - OperandRange privateVars; -}; - -// Looks up from the operation from and returns the PrivateClauseOp with -// name symbolName -static omp::PrivateClauseOp findPrivatizer(Operation *from, - SymbolRefAttr symbolName) { - omp::PrivateClauseOp privatizer = - SymbolTable::lookupNearestSymbolFrom(from, - symbolName); - assert(privatizer && "privatizer not found in the symbol table"); - return privatizer; -} -// clones the given privatizer. The original privatizer is used as -// the insert point for the clone. -static omp::PrivateClauseOp -clonePrivatizer(LLVM::ModuleTranslation &moduleTranslation, - omp::PrivateClauseOp privatizer, Operation *fromOperation) { - MLIRContext &context = moduleTranslation.getContext(); - mlir::IRRewriter opCloner(&context); - opCloner.setInsertionPoint(privatizer); - auto clone = - llvm::cast(opCloner.clone(*privatizer)); - - // Unique the clone name to avoid clashes in the symbol table. - unsigned counter = 0; - SmallString<256> cloneName = SymbolTable::generateSymbolName<256>( - privatizer.getSymName(), - [&](llvm::StringRef candidate) { - return SymbolTable::lookupNearestSymbolFrom( - fromOperation, StringAttr::get(&context, candidate)) != - nullptr; - }, - counter); - - clone.setSymName(cloneName); - return clone; -} /// Converts the OpenMP parallel operation to LLVM IR. 
static LogicalResult convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder, LLVM::ModuleTranslation &moduleTranslation) { using InsertPointTy = llvm::OpenMPIRBuilder::InsertPointTy; - OmpParallelOpConversionManager raii(opInst); ArrayRef isByRef = getIsByRef(opInst.getReductionByref()); assert(isByRef.size() == opInst.getNumReductionVars()); @@ -1395,6 +1356,15 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder, LogicalResult bodyGenStatus = success(); llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder(); + // Collect delayed privatization declarations + MutableArrayRef privateBlockArgs = + cast(*opInst).getPrivateBlockArgs(); + SmallVector llvmPrivateVars; + SmallVector privateDecls; + llvmPrivateVars.reserve(privateBlockArgs.size()); + privateDecls.reserve(privateBlockArgs.size()); + collectPrivatizationDecls(opInst, privateDecls); + // Collect reduction declarations SmallVector reductionDecls; collectReductionDecls(opInst, reductionDecls); @@ -1403,6 +1373,66 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder, SmallVector deferredStores; auto bodyGenCB = [&](InsertPointTy allocaIP, InsertPointTy codeGenIP) { + // Allocate private vars + llvm::BranchInst *allocaTerminator = + llvm::cast(allocaIP.getBlock()->getTerminator()); + builder.SetInsertPoint(allocaTerminator); + assert(allocaTerminator->getNumSuccessors() == 1 && + "This is an unconditional branch created by OpenMPIRBuilder"); + llvm::BasicBlock *afterAllocas = allocaTerminator->getSuccessor(0); + + // FIXME: Some of the allocation regions do more than just allocating. + // They read from their block argument (amongst other non-alloca things). + // When OpenMPIRBuilder outlines the parallel region into a different + // function it places the loads for live in-values (such as these block + // arguments) at the end of the entry block (because the entry block is + // assumed to contain only allocas). Therefore, if we put these complicated + // alloc blocks in the entry block, these will not dominate the availability + // of the live-in values they are using. Fix this by adding a latealloc + // block after the entry block to put these in (this also helps to avoid + // mixing non-alloca code with allocas). + // Alloc regions which do not use the block argument can still be placed in + // the entry block (therefore keeping the allocas together). + llvm::BasicBlock *privAllocBlock = nullptr; + if (!privateBlockArgs.empty()) + privAllocBlock = splitBB(builder, true, "omp.private.latealloc"); + for (unsigned i = 0; i < privateBlockArgs.size(); ++i) { + Region &allocRegion = privateDecls[i].getAllocRegion(); + + // map allocation region block argument + llvm::Value *nonPrivateVar = + moduleTranslation.lookupValue(opInst.getPrivateVars()[i]); + assert(nonPrivateVar); + moduleTranslation.mapValue(privateDecls[i].getAllocMoldArg(), + nonPrivateVar); + + // in-place convert the private allocation region + SmallVector phis; + if (privateDecls[i].getAllocMoldArg().getUses().empty()) { + // TODO this should use + // allocaIP.getBlock()->getFirstNonPHIOrDbgOrAlloca() so it goes before + // the code for fetching the thread id. Not doing this for now to avoid + // test churn. 
+ builder.SetInsertPoint(allocaIP.getBlock()->getTerminator()); + } else { + builder.SetInsertPoint(privAllocBlock->getTerminator()); + } + if (failed(inlineConvertOmpRegions(allocRegion, "omp.private.alloc", + builder, moduleTranslation, &phis))) { + bodyGenStatus = failure(); + return; + } + assert(phis.size() == 1 && "expected one allocation to be yielded"); + + moduleTranslation.mapValue(privateBlockArgs[i], phis[0]); + llvmPrivateVars.push_back(phis[0]); + + // clear alloc region block argument mapping in case it needs to be + // re-created with a different source for another use of the same + // reduction decl + moduleTranslation.forgetMapping(allocRegion); + } + // Allocate reduction vars DenseMap reductionVariableMap; @@ -1419,12 +1449,64 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder, deferredStores, isByRef))) bodyGenStatus = failure(); + // Apply copy region for firstprivate. + bool needsFirstprivate = + llvm::any_of(privateDecls, [](omp::PrivateClauseOp &privOp) { + return privOp.getDataSharingType() == + omp::DataSharingClauseType::FirstPrivate; + }); + if (needsFirstprivate) { + // Find the end of the allocation blocks + assert(afterAllocas->getSinglePredecessor()); + builder.SetInsertPoint( + afterAllocas->getSinglePredecessor()->getTerminator()); + llvm::BasicBlock *copyBlock = + splitBB(builder, /*CreateBranch=*/true, "omp.private.copy"); + builder.SetInsertPoint(copyBlock->getFirstNonPHIOrDbgOrAlloca()); + } + for (unsigned i = 0; i < privateBlockArgs.size(); ++i) { + if (privateDecls[i].getDataSharingType() != + omp::DataSharingClauseType::FirstPrivate) + continue; + + // copyRegion implements `lhs = rhs` + Region ©Region = privateDecls[i].getCopyRegion(); + + // map copyRegion rhs arg + llvm::Value *nonPrivateVar = + moduleTranslation.lookupValue(opInst.getPrivateVars()[i]); + assert(nonPrivateVar); + moduleTranslation.mapValue(privateDecls[i].getCopyMoldArg(), + nonPrivateVar); + + // map copyRegion lhs arg + moduleTranslation.mapValue(privateDecls[i].getCopyPrivateArg(), + llvmPrivateVars[i]); + + // in-place convert copy region + builder.SetInsertPoint(builder.GetInsertBlock()->getTerminator()); + if (failed(inlineConvertOmpRegions(copyRegion, "omp.private.copy", + builder, moduleTranslation))) { + bodyGenStatus = failure(); + return; + } + + // ignore unused value yielded from copy region + + // clear copy region block argument mapping in case it needs to be + // re-created with different sources for reuse of the same reduction + // decl + moduleTranslation.forgetMapping(copyRegion); + } + // Initialize reduction vars - builder.restoreIP(allocaIP); + builder.SetInsertPoint(builder.GetInsertBlock()->getTerminator()); llvm::BasicBlock *initBlock = splitBB(builder, true, "omp.reduction.init"); allocaIP = InsertPointTy(allocaIP.getBlock(), allocaIP.getBlock()->getTerminator()->getIterator()); + + builder.restoreIP(allocaIP); SmallVector byRefVars(opInst.getNumReductionVars()); for (unsigned i = 0; i < opInst.getNumReductionVars(); ++i) { if (isByRef[i]) { @@ -1534,183 +1616,11 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder, } }; - SmallVector mlirPrivatizerClones; - SmallVector llvmPrivateVars; - - // TODO: Perform appropriate actions according to the data-sharing - // attribute (shared, private, firstprivate, ...) of variables. - // Currently shared and private are supported. 
- auto privCB = [&](InsertPointTy allocaIP, InsertPointTy codeGenIP, - llvm::Value &, llvm::Value &llvmOmpRegionInput, - llvm::Value *&llvmReplacementValue) -> InsertPointTy { - llvmReplacementValue = &llvmOmpRegionInput; - - // If this is a private value, this lambda will return the corresponding - // mlir value and its `PrivateClauseOp`. Otherwise, empty values are - // returned. - auto [mlirPrivVar, mlirPrivatizerClone] = - [&]() -> std::pair { - if (!opInst.getPrivateVars().empty()) { - auto mlirPrivVars = opInst.getPrivateVars(); - auto mlirPrivSyms = opInst.getPrivateSyms(); - - // Try to find a privatizer that corresponds to the LLVM value being - // privatized. - for (auto [mlirPrivVar, mlirPrivatizerAttr] : - llvm::zip_equal(mlirPrivVars, *mlirPrivSyms)) { - // Find the MLIR private variable corresponding to the LLVM value - // being privatized. - llvm::Value *mlirToLLVMPrivVar = - moduleTranslation.lookupValue(mlirPrivVar); - - // Check if the LLVM value being privatized matches the LLVM value - // mapped to privVar. In some cases, this is not trivial ... - auto isMatch = [&]() { - if (mlirToLLVMPrivVar == nullptr) - return false; - - // If both values are trivially equal, we found a match. - if (mlirToLLVMPrivVar == &llvmOmpRegionInput) - return true; - - // Otherwise, we check if both llvmOmpRegionInputPtr and - // mlirToLLVMPrivVar refer to the same memory (through a load/store - // pair). This happens if a struct (i.e. multi-field value) is being - // privatized. - // - // For example, if the privatized value is defined by: - // ``` - // %priv_val = alloca { ptr, i64 }, align 8 - // ``` - // - // The initialization of this value (outside the omp region) will be - // something like this: - // - // clang-format off - // ``` - // %partially_init_priv_val = insertvalue { ptr, i64 } undef, - // ptr %some_ptr, 0 - // %fully_init_priv_val = insertvalue { ptr, i64 } %partially_init_priv_val, - // i64 %some_i64, 1 - // ... - // store { ptr, i64 } %fully_init_priv_val, ptr %priv_val, align 8 - // ``` - // clang-format on - // - // Now, we enter the OMP region, in order to access this privatized - // value, we need to load from the allocated memory: - // ``` - // omp.par.entry: - // %priv_val_load = load { ptr, i64 }, ptr %priv_val, align 8 - // ``` - // - // The 2 LLVM values tracked here map as follows: - // - `mlirToLLVMPrivVar` -> `%fully_init_priv_val` - // - `llvmOmpRegionInputPtr` -> `%priv_val_load` - // - // Even though they eventually refer to the same memory reference - // (the memory being privatized), they are 2 distinct LLVM values. - // Therefore, we need to discover their correspondence by finding - // out if they store into and load from the same mem ref. - auto *llvmOmpRegionInputPtrLoad = - llvm::dyn_cast_if_present(&llvmOmpRegionInput); - - if (llvmOmpRegionInputPtrLoad == nullptr) - return false; - - for (auto &use : mlirToLLVMPrivVar->uses()) { - auto *mlirToLLVMPrivVarStore = - llvm::dyn_cast_if_present(use.getUser()); - if (mlirToLLVMPrivVarStore && - (llvmOmpRegionInputPtrLoad->getPointerOperand() == - mlirToLLVMPrivVarStore->getPointerOperand())) - return true; - } - - return false; - }; - - if (!isMatch()) - continue; - - SymbolRefAttr privSym = llvm::cast(mlirPrivatizerAttr); - omp::PrivateClauseOp privatizer = findPrivatizer(opInst, privSym); - - // Clone the privatizer in case it is used by more than one parallel - // region. 
The privatizer is processed in-place (see below) before it - // gets inlined in the parallel region and therefore processing the - // original op is dangerous. - return {mlirPrivVar, - clonePrivatizer(moduleTranslation, privatizer, opInst)}; - } - } - - return {mlir::Value(), omp::PrivateClauseOp()}; - }(); - - if (mlirPrivVar) { - Region &allocRegion = mlirPrivatizerClone.getAllocRegion(); - - // If this is a `firstprivate` clause, prepare the `omp.private` op by: - if (mlirPrivatizerClone.getDataSharingType() == - omp::DataSharingClauseType::FirstPrivate) { - auto oldAllocBackBlock = std::prev(allocRegion.end()); - omp::YieldOp oldAllocYieldOp = - llvm::cast(oldAllocBackBlock->getTerminator()); - - Region ©Region = mlirPrivatizerClone.getCopyRegion(); - - mlir::IRRewriter copyCloneBuilder(&moduleTranslation.getContext()); - // 1. Cloning the `copy` region to the end of the `alloc` region. - copyCloneBuilder.cloneRegionBefore(copyRegion, allocRegion, - allocRegion.end()); - - auto newCopyRegionFrontBlock = std::next(oldAllocBackBlock); - // 2. Merging the last `alloc` block with the first block in the `copy` - // region clone. - // 3. Re-mapping the first argument of the `copy` region to be the - // argument of the `alloc` region and the second argument of the `copy` - // region to be the yielded value of the `alloc` region (this is the - // private clone of the privatized value). - copyCloneBuilder.mergeBlocks(&*newCopyRegionFrontBlock, - &*oldAllocBackBlock, - {mlirPrivatizerClone.getAllocMoldArg(), - oldAllocYieldOp.getOperand(0)}); - - // 4. The old terminator of the `alloc` region is not needed anymore, so - // delete it. - oldAllocYieldOp.erase(); - } - - // Replace the privatizer block argument with mlir value being privatized. - // This way, the body of the privatizer will be changed from using the - // region/block argument to the value being privatized. - replaceAllUsesInRegionWith(mlirPrivatizerClone.getAllocMoldArg(), - mlirPrivVar, allocRegion); - - auto oldIP = builder.saveIP(); - builder.restoreIP(allocaIP); - - SmallVector yieldedValues; - if (failed(inlineConvertOmpRegions(allocRegion, "omp.privatizer", builder, - moduleTranslation, &yieldedValues))) { - opInst.emitError("failed to inline `alloc` region of an `omp.private` " - "op in the parallel region"); - bodyGenStatus = failure(); - mlirPrivatizerClone.erase(); - } else { - assert(yieldedValues.size() == 1); - llvmReplacementValue = yieldedValues.front(); - - // Keep the LLVM replacement value and the op clone in case we need to - // emit cleanup (i.e. deallocation) logic. - llvmPrivateVars.push_back(llvmReplacementValue); - mlirPrivatizerClones.push_back(mlirPrivatizerClone); - } - - builder.restoreIP(oldIP); - } - + auto privCB = [](InsertPointTy allocaIP, InsertPointTy codeGenIP, + llvm::Value &, llvm::Value &val, llvm::Value *&replVal) { + // tell OpenMPIRBuilder not to do anything. We handled Privatisation in + // bodyGenCB. 
+ replVal = &val; return codeGenIP; }; @@ -1733,8 +1643,7 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder, bodyGenStatus = failure(); SmallVector privateCleanupRegions; - llvm::transform(mlirPrivatizerClones, - std::back_inserter(privateCleanupRegions), + llvm::transform(privateDecls, std::back_inserter(privateCleanupRegions), [](omp::PrivateClauseOp privatizer) { return &privatizer.getDeallocRegion(); }); @@ -1767,9 +1676,6 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder, ompBuilder->createParallel(ompLoc, allocaIP, bodyGenCB, privCB, finiCB, ifCond, numThreads, pbKind, isCancellable)); - for (mlir::omp::PrivateClauseOp privatizerClone : mlirPrivatizerClones) - privatizerClone.erase(); - return bodyGenStatus; } @@ -1785,6 +1691,20 @@ convertOrderKind(std::optional o) { llvm_unreachable("Unknown ClauseOrderKind kind"); } +static LogicalResult simdOpSupported(omp::SimdOp op) { + if (!op.getLinearVars().empty() || !op.getLinearStepVars().empty()) + return op.emitError("linear clause not yet supported"); + + if (!op.getPrivateVars().empty() || op.getPrivateSyms()) + return op.emitError("privatization clauses not yet supported"); + + if (!op.getReductionVars().empty() || op.getReductionByref() || + op.getReductionSyms()) + return op.emitError("reduction clause not yet supported"); + + return success(); +} + /// Converts an OpenMP simd loop into LLVM IR using OpenMPIRBuilder. static LogicalResult convertOmpSimd(Operation &opInst, llvm::IRBuilderBase &builder, @@ -1792,11 +1712,8 @@ convertOmpSimd(Operation &opInst, llvm::IRBuilderBase &builder, auto simdOp = cast(opInst); auto loopOp = cast(simdOp.getWrappedLoop()); - if (!simdOp.getLinearVars().empty() || !simdOp.getLinearStepVars().empty() || - !simdOp.getPrivateVars().empty() || simdOp.getPrivateSyms() || - !simdOp.getReductionVars().empty() || simdOp.getReductionByref() || - simdOp.getReductionSyms()) - return opInst.emitError("unhandled clauses for translation to LLVM IR"); + if (failed(simdOpSupported(simdOp))) + return failure(); llvm::OpenMPIRBuilder::LocationDescription ompLoc(builder); diff --git a/mlir/lib/Target/LLVMIR/ModuleImport.cpp b/mlir/lib/Target/LLVMIR/ModuleImport.cpp index bd861f3a69e53c73963ecf11e4566b607075aea9..6e97b2a50af8a11836274d77931be8c30e441184 100644 --- a/mlir/lib/Target/LLVMIR/ModuleImport.cpp +++ b/mlir/lib/Target/LLVMIR/ModuleImport.cpp @@ -1311,7 +1311,8 @@ ModuleImport::convertValues(ArrayRef values) { } LogicalResult ModuleImport::convertIntrinsicArguments( - ArrayRef values, ArrayRef immArgPositions, + ArrayRef values, ArrayRef opBundles, + bool requiresOpBundles, ArrayRef immArgPositions, ArrayRef immArgAttrNames, SmallVectorImpl &valuesOut, SmallVectorImpl &attrsOut) { assert(immArgPositions.size() == immArgAttrNames.size() && @@ -1341,6 +1342,35 @@ LogicalResult ModuleImport::convertIntrinsicArguments( valuesOut.push_back(*mlirValue); } + SmallVector opBundleSizes; + SmallVector opBundleTagAttrs; + if (requiresOpBundles) { + opBundleSizes.reserve(opBundles.size()); + opBundleTagAttrs.reserve(opBundles.size()); + + for (const llvm::OperandBundleUse &bundle : opBundles) { + opBundleSizes.push_back(bundle.Inputs.size()); + opBundleTagAttrs.push_back(StringAttr::get(context, bundle.getTagName())); + + for (const llvm::Use &opBundleOperand : bundle.Inputs) { + auto operandMlirValue = convertValue(opBundleOperand.get()); + if (failed(operandMlirValue)) + return failure(); + valuesOut.push_back(*operandMlirValue); + } + } + + auto opBundleSizesAttr = 
DenseI32ArrayAttr::get(context, opBundleSizes);
+    auto opBundleSizesAttrNameAttr =
+        StringAttr::get(context, LLVMDialect::getOpBundleSizesAttrName());
+    attrsOut.push_back({opBundleSizesAttrNameAttr, opBundleSizesAttr});
+
+    auto opBundleTagsAttr = ArrayAttr::get(context, opBundleTagAttrs);
+    auto opBundleTagsAttrNameAttr =
+        StringAttr::get(context, LLVMDialect::getOpBundleTagsAttrName());
+    attrsOut.push_back({opBundleTagsAttrNameAttr, opBundleTagsAttr});
+  }
+
   return success();
 }
diff --git a/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp b/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp
index 6e005f9ec5df85cb161cf40d7ab53cb4d75cdba7..ceb8ba3b33818bf0056ca676bd0609de1d97360e 100644
--- a/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp
@@ -55,6 +55,7 @@
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/Cloning.h"
 #include "llvm/Transforms/Utils/ModuleUtils.h"
+#include <numeric>
 #include <optional>

 #define DEBUG_TYPE "llvm-dialect-to-llvm-ir"
@@ -854,8 +855,40 @@ llvm::CallInst *mlir::LLVM::detail::createIntrinsicCall(
          "LLVM `immArgPositions` and MLIR `immArgAttrNames` should have equal "
          "length");

+  SmallVector<llvm::OperandBundleDef> opBundles;
+  size_t numOpBundleOperands = 0;
+  auto opBundleSizesAttr = cast_if_present<DenseI32ArrayAttr>(
+      intrOp->getAttr(LLVMDialect::getOpBundleSizesAttrName()));
+  auto opBundleTagsAttr = cast_if_present<ArrayAttr>(
+      intrOp->getAttr(LLVMDialect::getOpBundleTagsAttrName()));
+
+  if (opBundleSizesAttr && opBundleTagsAttr) {
+    ArrayRef<int32_t> opBundleSizes = opBundleSizesAttr.asArrayRef();
+    assert(opBundleSizes.size() == opBundleTagsAttr.size() &&
+           "operand bundles and tags do not match");
+
+    numOpBundleOperands =
+        std::accumulate(opBundleSizes.begin(), opBundleSizes.end(), size_t(0));
+    assert(numOpBundleOperands <= intrOp->getNumOperands() &&
+           "operand bundle operands are more than the number of operands");
+
+    ValueRange operands = intrOp->getOperands().take_back(numOpBundleOperands);
+    size_t nextOperandIdx = 0;
+    opBundles.reserve(opBundleSizesAttr.size());
+
+    for (auto [opBundleTagAttr, bundleSize] :
+         llvm::zip(opBundleTagsAttr, opBundleSizes)) {
+      auto bundleTag = cast<StringAttr>(opBundleTagAttr).str();
+      auto bundleOperands = moduleTranslation.lookupValues(
+          operands.slice(nextOperandIdx, bundleSize));
+      opBundles.emplace_back(std::move(bundleTag), std::move(bundleOperands));
+      nextOperandIdx += bundleSize;
+    }
+  }
+
   // Map operands and attributes to LLVM values.
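+  // For example, an assume op written as
+  //   llvm.intr.assume %cond ["align"(%ptr, %align : !llvm.ptr, i32)] : i1
+  // records the tag "align", a bundle size of 2, and appends %ptr and %align
+  // as trailing operands. Those trailing operands were consumed by the bundle
+  // construction above, so only the regular intrinsic operands remain to be
+  // mapped below.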
- auto operands = moduleTranslation.lookupValues(intrOp->getOperands()); + auto opOperands = intrOp->getOperands().drop_back(numOpBundleOperands); + auto operands = moduleTranslation.lookupValues(opOperands); SmallVector args(immArgPositions.size() + operands.size()); for (auto [immArgPos, immArgName] : llvm::zip(immArgPositions, immArgAttrNames)) { @@ -890,7 +923,7 @@ llvm::CallInst *mlir::LLVM::detail::createIntrinsicCall( llvm::Function *llvmIntr = llvm::Intrinsic::getOrInsertDeclaration( module, intrinsic, overloadedTypes); - return builder.CreateCall(llvmIntr, args); + return builder.CreateCall(llvmIntr, args, opBundles); } /// Given a single MLIR operation, create the corresponding LLVM IR operation diff --git a/mlir/lib/Transforms/RemoveDeadValues.cpp b/mlir/lib/Transforms/RemoveDeadValues.cpp index 3de4fb75ed831c50169977caa152429933c7ba58..7e45f18b660ba7bbbcbdcae209796c987c078155 100644 --- a/mlir/lib/Transforms/RemoveDeadValues.cpp +++ b/mlir/lib/Transforms/RemoveDeadValues.cpp @@ -589,7 +589,7 @@ void RemoveDeadValues::runOnOperation() { }); if (acceptableIR.wasInterrupted()) - return; + return signalPassFailure(); module->walk([&](Operation *op) { if (auto funcOp = dyn_cast(op)) { diff --git a/mlir/python/mlir/dialects/transform/structured.py b/mlir/python/mlir/dialects/transform/structured.py index 41051c0d5b2ffb642729ca13ec1ce86d039e4d06..f6111f516f8c3d80157d940c8421eb6dde704d6e 100644 --- a/mlir/python/mlir/dialects/transform/structured.py +++ b/mlir/python/mlir/dialects/transform/structured.py @@ -377,7 +377,7 @@ class PadOp(PadOp): pad_to_multiple_of: Optional[Union[DynamicIndexList, ArrayAttr]] = None, padding_values: Optional[Union[ArrayAttr, Sequence[Attribute]]] = None, padding_dimensions: OptionalIntList = None, - pack_paddings: OptionalIntList = None, + nofold_flags: OptionalIntList = None, transpose_paddings: Optional[ Union[ArrayAttr, Sequence[Union[ArrayAttr, IntOrAttrList]]] ] = None, @@ -407,7 +407,7 @@ class PadOp(PadOp): padding_values=padding_values, padding_dimensions=padding_dimensions, static_pad_to_multiple_of=static_pad_to_multiple_of, - pack_paddings=pack_paddings, + nofold_flags=nofold_flags, transpose_paddings=transpose_paddings, copy_back_op=copy_back_op, loc=loc, diff --git a/mlir/test/Conversion/ArithToLLVM/arith-to-llvm.mlir b/mlir/test/Conversion/ArithToLLVM/arith-to-llvm.mlir index d3bdbe89a548764b3ef0b18755920c6ca33b3350..64c40f1aba43bc18865d8735675ceff1b6ed84f0 100644 --- a/mlir/test/Conversion/ArithToLLVM/arith-to-llvm.mlir +++ b/mlir/test/Conversion/ArithToLLVM/arith-to-llvm.mlir @@ -46,33 +46,45 @@ func.func @ops(f32, f32, i32, i32, f64) -> (f32, i32) { %1 = arith.subi %arg2, %arg3: i32 // CHECK: = llvm.icmp "slt" %arg2, %1 : i32 %2 = arith.cmpi slt, %arg2, %1 : i32 +// CHECK: = llvm.icmp "sle" %arg2, %1 : i32 + %3 = arith.cmpi sle, %arg2, %1 : i32 +// CHECK: = llvm.icmp "sgt" %arg2, %1 : i32 + %4 = arith.cmpi sgt, %arg2, %1 : i32 +// CHECK: = llvm.icmp "ult" %arg2, %1 : i32 + %5 = arith.cmpi ult, %arg2, %1 : i32 +// CHECK: = llvm.icmp "ule" %arg2, %1 : i32 + %6 = arith.cmpi ule, %arg2, %1 : i32 +// CHECK: = llvm.icmp "ugt" %arg2, %1 : i32 + %7 = arith.cmpi ugt, %arg2, %1 : i32 +// CHECK: = llvm.icmp "eq" %arg2, %1 : i32 + %8 = arith.cmpi eq, %arg2, %1 : i32 // CHECK: = llvm.sdiv %arg2, %arg3 : i32 - %3 = arith.divsi %arg2, %arg3 : i32 + %9 = arith.divsi %arg2, %arg3 : i32 // CHECK: = llvm.udiv %arg2, %arg3 : i32 - %4 = arith.divui %arg2, %arg3 : i32 + %10 = arith.divui %arg2, %arg3 : i32 // CHECK: = llvm.srem %arg2, %arg3 : i32 - %5 = 
arith.remsi %arg2, %arg3 : i32 + %11 = arith.remsi %arg2, %arg3 : i32 // CHECK: = llvm.urem %arg2, %arg3 : i32 - %6 = arith.remui %arg2, %arg3 : i32 + %12 = arith.remui %arg2, %arg3 : i32 // CHECK: = llvm.fdiv %arg0, %arg1 : f32 - %8 = arith.divf %arg0, %arg1 : f32 + %13 = arith.divf %arg0, %arg1 : f32 // CHECK: = llvm.frem %arg0, %arg1 : f32 - %9 = arith.remf %arg0, %arg1 : f32 + %14 = arith.remf %arg0, %arg1 : f32 // CHECK: = llvm.and %arg2, %arg3 : i32 - %10 = arith.andi %arg2, %arg3 : i32 + %15 = arith.andi %arg2, %arg3 : i32 // CHECK: = llvm.or %arg2, %arg3 : i32 - %11 = arith.ori %arg2, %arg3 : i32 + %16 = arith.ori %arg2, %arg3 : i32 // CHECK: = llvm.xor %arg2, %arg3 : i32 - %12 = arith.xori %arg2, %arg3 : i32 + %17 = arith.xori %arg2, %arg3 : i32 // CHECK: = llvm.mlir.constant(7.900000e-01 : f64) : f64 - %15 = arith.constant 7.9e-01 : f64 + %18 = arith.constant 7.9e-01 : f64 // CHECK: = llvm.shl %arg2, %arg3 : i32 - %16 = arith.shli %arg2, %arg3 : i32 + %19 = arith.shli %arg2, %arg3 : i32 // CHECK: = llvm.ashr %arg2, %arg3 : i32 - %17 = arith.shrsi %arg2, %arg3 : i32 + %20 = arith.shrsi %arg2, %arg3 : i32 // CHECK: = llvm.lshr %arg2, %arg3 : i32 - %18 = arith.shrui %arg2, %arg3 : i32 - return %0, %4 : f32, i32 + %21 = arith.shrui %arg2, %arg3 : i32 + return %0, %10 : f32, i32 } // Checking conversion of index types to integers using i1, assuming no target diff --git a/mlir/test/Conversion/MemRefToLLVM/expand-then-convert-to-llvm.mlir b/mlir/test/Conversion/MemRefToLLVM/expand-then-convert-to-llvm.mlir index b86103422b07454a7d9783074dbe90ad724013bd..55b1bc9c545a8554b0b3fd23e7f3b625f339a0fe 100644 --- a/mlir/test/Conversion/MemRefToLLVM/expand-then-convert-to-llvm.mlir +++ b/mlir/test/Conversion/MemRefToLLVM/expand-then-convert-to-llvm.mlir @@ -684,7 +684,7 @@ func.func @collapse_static_shape_with_non_identity_layout(%arg: memref<1x1x8x8xf // CHECK: %[[INT_TO_PTR:.*]] = llvm.ptrtoint %[[BUFF_ADDR]] : !llvm.ptr to i64 // CHECK: %[[AND:.*]] = llvm.and %[[INT_TO_PTR]], {{.*}} : i64 // CHECK: %[[CMP:.*]] = llvm.icmp "eq" %[[AND]], {{.*}} : i64 -// CHECK: "llvm.intr.assume"(%[[CMP]]) : (i1) -> () +// CHECK: llvm.intr.assume %[[CMP]] : i1 // CHECK: %[[LD_ADDR:.*]] = llvm.getelementptr %[[BUFF_ADDR]][%{{.*}}] : (!llvm.ptr, i64) -> !llvm.ptr, f32 // CHECK: %[[VAL:.*]] = llvm.load %[[LD_ADDR]] : !llvm.ptr -> f32 // CHECK: return %[[VAL]] : f32 diff --git a/mlir/test/Conversion/MemRefToLLVM/memref-to-llvm.mlir b/mlir/test/Conversion/MemRefToLLVM/memref-to-llvm.mlir index 9dc22abf143bf0542262266e65c6ffaedf8d62fe..48dc9079333d4f5c2e6d7f9189db89a6359d7d6e 100644 --- a/mlir/test/Conversion/MemRefToLLVM/memref-to-llvm.mlir +++ b/mlir/test/Conversion/MemRefToLLVM/memref-to-llvm.mlir @@ -160,7 +160,7 @@ func.func @assume_alignment(%0 : memref<4x4xf16>) { // CHECK-NEXT: %[[INT:.*]] = llvm.ptrtoint %[[PTR]] : !llvm.ptr to i64 // CHECK-NEXT: %[[MASKED_PTR:.*]] = llvm.and %[[INT]], %[[MASK:.*]] : i64 // CHECK-NEXT: %[[CONDITION:.*]] = llvm.icmp "eq" %[[MASKED_PTR]], %[[ZERO]] : i64 - // CHECK-NEXT: "llvm.intr.assume"(%[[CONDITION]]) : (i1) -> () + // CHECK-NEXT: llvm.intr.assume %[[CONDITION]] : i1 memref.assume_alignment %0, 16 : memref<4x4xf16> return } @@ -177,7 +177,7 @@ func.func @assume_alignment_w_offset(%0 : memref<4x4xf16, strided<[?, ?], offset // CHECK-NEXT: %[[INT:.*]] = llvm.ptrtoint %[[BUFF_ADDR]] : !llvm.ptr to i64 // CHECK-NEXT: %[[MASKED_PTR:.*]] = llvm.and %[[INT]], %[[MASK:.*]] : i64 // CHECK-NEXT: %[[CONDITION:.*]] = llvm.icmp "eq" %[[MASKED_PTR]], %[[ZERO]] : i64 - // CHECK-NEXT: 
"llvm.intr.assume"(%[[CONDITION]]) : (i1) -> () + // CHECK-NEXT: llvm.intr.assume %[[CONDITION]] : i1 memref.assume_alignment %0, 16 : memref<4x4xf16, strided<[?, ?], offset: ?>> return } diff --git a/mlir/test/Conversion/VectorToLLVM/vector-xfer-to-llvm.mlir b/mlir/test/Conversion/VectorToLLVM/vector-xfer-to-llvm.mlir index 9cfd6885fba95870a435e24d386d0ef52e78e4e1..8f01cc2b8d44c3fdb22e26d6c1fab45fd884174c 100644 --- a/mlir/test/Conversion/VectorToLLVM/vector-xfer-to-llvm.mlir +++ b/mlir/test/Conversion/VectorToLLVM/vector-xfer-to-llvm.mlir @@ -1,5 +1,5 @@ -// RUN: mlir-opt %s -convert-vector-to-llvm -split-input-file | FileCheck -D$IDX_TYPE=i32 %s -// RUN: mlir-opt %s --convert-vector-to-llvm='force-32bit-vector-indices=0' | FileCheck -D$IDX_TYPE=i64 %s +// RUN: mlir-opt %s -convert-vector-to-llvm -split-input-file | FileCheck '-D$IDX_TYPE=i32' %s +// RUN: mlir-opt %s --convert-vector-to-llvm='force-32bit-vector-indices=0' | FileCheck '-D$IDX_TYPE=i64' %s func.func @transfer_read_write_1d(%A : memref, %base: index) -> vector<17xf32> { %f7 = arith.constant 7.0: f32 diff --git a/mlir/test/Dialect/Arith/unsigned-when-equivalent.mlir b/mlir/test/Dialect/Arith/unsigned-when-equivalent.mlir index 49bd74cfe9124a4d8b43bb3d3540432b99e1d989..0ea69de8b8f9af391388c24c483a8791b4656ab1 100644 --- a/mlir/test/Dialect/Arith/unsigned-when-equivalent.mlir +++ b/mlir/test/Dialect/Arith/unsigned-when-equivalent.mlir @@ -12,7 +12,7 @@ // CHECK: arith.cmpi slt // CHECK: arith.cmpi sge // CHECK: arith.cmpi sgt -func.func @not_with_maybe_overflow(%arg0 : i32) { +func.func @not_with_maybe_overflow(%arg0 : i32) -> (i32, i32, i32, i32, i32, i32, i64, i1, i1, i1, i1) { %ci32_smax = arith.constant 0x7fffffff : i32 %c1 = arith.constant 1 : i32 %c4 = arith.constant 4 : i32 @@ -29,7 +29,7 @@ func.func @not_with_maybe_overflow(%arg0 : i32) { %10 = arith.cmpi slt, %1, %c4 : i32 %11 = arith.cmpi sge, %1, %c4 : i32 %12 = arith.cmpi sgt, %1, %c4 : i32 - func.return + func.return %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12 : i32, i32, i32, i32, i32, i32, i64, i1, i1, i1, i1 } // CHECK-LABEL: func @yes_with_no_overflow @@ -44,7 +44,7 @@ func.func @not_with_maybe_overflow(%arg0 : i32) { // CHECK: arith.cmpi ult // CHECK: arith.cmpi uge // CHECK: arith.cmpi ugt -func.func @yes_with_no_overflow(%arg0 : i32) { +func.func @yes_with_no_overflow(%arg0 : i32) -> (i32, i32, i32, i32, i32, i32, i64, i1, i1, i1, i1) { %ci32_almost_smax = arith.constant 0x7ffffffe : i32 %c1 = arith.constant 1 : i32 %c4 = arith.constant 4 : i32 @@ -61,7 +61,7 @@ func.func @yes_with_no_overflow(%arg0 : i32) { %10 = arith.cmpi slt, %1, %c4 : i32 %11 = arith.cmpi sge, %1, %c4 : i32 %12 = arith.cmpi sgt, %1, %c4 : i32 - func.return + func.return %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12 : i32, i32, i32, i32, i32, i32, i64, i1, i1, i1, i1 } // CHECK-LABEL: func @preserves_structure @@ -90,20 +90,20 @@ func.func @preserves_structure(%arg0 : memref<8xindex>) { func.func private @external() -> i8 // CHECK-LABEL: @dead_code -func.func @dead_code() { +func.func @dead_code() -> i8 { %0 = call @external() : () -> i8 // CHECK: arith.floordivsi %1 = arith.floordivsi %0, %0 : i8 - return + return %1 : i8 } // Make sure not crash. 
// CHECK-LABEL: @no_integer_or_index -func.func @no_integer_or_index() { +func.func @no_integer_or_index(%arg0: vector<1xi32>) -> vector<1xi1> { // CHECK: arith.cmpi %cst_0 = arith.constant dense<[0]> : vector<1xi32> - %cmp = arith.cmpi slt, %cst_0, %cst_0 : vector<1xi32> - return + %cmp = arith.cmpi slt, %cst_0, %arg0 : vector<1xi32> + return %cmp : vector<1xi1> } // CHECK-LABEL: @gpu_func @@ -113,4 +113,4 @@ func.func @gpu_func(%arg0: memref<2x32xf32>, %arg1: memref<2x32xf32>, %arg2: mem gpu.terminator } return %arg1 : memref<2x32xf32> -} +} diff --git a/mlir/test/Dialect/LLVMIR/inlining.mlir b/mlir/test/Dialect/LLVMIR/inlining.mlir index f9551e311df59f0d3b4538be723bd814c76be3a8..0b7ca3f2bb048a467f9b7d8cdd4f282f170573b1 100644 --- a/mlir/test/Dialect/LLVMIR/inlining.mlir +++ b/mlir/test/Dialect/LLVMIR/inlining.mlir @@ -18,7 +18,7 @@ func.func @inner_func_inlinable(%ptr : !llvm.ptr) -> i32 { "llvm.intr.memset"(%ptr, %byte, %0) <{isVolatile = true}> : (!llvm.ptr, i8, i32) -> () "llvm.intr.memmove"(%ptr, %ptr, %0) <{isVolatile = true}> : (!llvm.ptr, !llvm.ptr, i32) -> () "llvm.intr.memcpy"(%ptr, %ptr, %0) <{isVolatile = true}> : (!llvm.ptr, !llvm.ptr, i32) -> () - "llvm.intr.assume"(%true) : (i1) -> () + llvm.intr.assume %true : i1 llvm.fence release %2 = llvm.atomicrmw add %ptr, %0 monotonic : !llvm.ptr, i32 %3 = llvm.cmpxchg %ptr, %0, %1 acq_rel monotonic : !llvm.ptr, i32 @@ -44,7 +44,7 @@ func.func @inner_func_inlinable(%ptr : !llvm.ptr) -> i32 { // CHECK: "llvm.intr.memset"(%[[PTR]] // CHECK: "llvm.intr.memmove"(%[[PTR]], %[[PTR]] // CHECK: "llvm.intr.memcpy"(%[[PTR]], %[[PTR]] -// CHECK: "llvm.intr.assume" +// CHECK: llvm.intr.assume // CHECK: llvm.fence release // CHECK: llvm.atomicrmw add %[[PTR]], %[[CST]] monotonic // CHECK: llvm.cmpxchg %[[PTR]], %[[CST]], %[[RES]] acq_rel monotonic diff --git a/mlir/test/Dialect/LLVMIR/rocdl.mlir b/mlir/test/Dialect/LLVMIR/rocdl.mlir index 397d66d92bc5d5b4276a43270e142ce380ffd70d..4afa839aa3ea1b7ef6cd9f31a0bc054f9aac0136 100644 --- a/mlir/test/Dialect/LLVMIR/rocdl.mlir +++ b/mlir/test/Dialect/LLVMIR/rocdl.mlir @@ -41,6 +41,18 @@ func.func @rocdl.sched_barrier() { llvm.return } +func.func @rocdl_sched_group_barrier() { + // CHECK: rocdl.sched.group.barrier + rocdl.sched.group.barrier 8, 1, 0 + llvm.return +} + +func.func @rocdl_iglp_opt() { + // CHECK: rocdl.iglp.opt + rocdl.iglp.opt 0 + llvm.return +} + func.func @rocdl.setprio() { // CHECK: rocdl.s.setprio rocdl.s.setprio 0 diff --git a/mlir/test/Dialect/LLVMIR/roundtrip.mlir b/mlir/test/Dialect/LLVMIR/roundtrip.mlir index 3062cdc38c0abb800adc0392d7b3c10c0a37dddb..b8ce7db795a1d171c7569b852a53e5011e8cbf42 100644 --- a/mlir/test/Dialect/LLVMIR/roundtrip.mlir +++ b/mlir/test/Dialect/LLVMIR/roundtrip.mlir @@ -836,3 +836,30 @@ llvm.func @test_call_intrin_with_opbundle(%arg0 : !llvm.ptr) { llvm.call_intrinsic "llvm.assume"(%0) ["align"(%arg0, %1 : !llvm.ptr, i32)] : (i1) -> () llvm.return } + +// CHECK-LABEL: @test_assume_intr_no_opbundle +llvm.func @test_assume_intr_no_opbundle(%arg0 : !llvm.ptr) { + %0 = llvm.mlir.constant(1 : i1) : i1 + // CHECK: llvm.intr.assume %0 : i1 + llvm.intr.assume %0 : i1 + llvm.return +} + +// CHECK-LABEL: @test_assume_intr_empty_opbundle +llvm.func @test_assume_intr_empty_opbundle(%arg0 : !llvm.ptr) { + %0 = llvm.mlir.constant(1 : i1) : i1 + // CHECK: llvm.intr.assume %0 : i1 + llvm.intr.assume %0 [] : i1 + llvm.return +} + +// CHECK-LABEL: @test_assume_intr_with_opbundles +llvm.func @test_assume_intr_with_opbundles(%arg0 : !llvm.ptr) { + %0 = llvm.mlir.constant(1 : i1) 
: i1 + %1 = llvm.mlir.constant(2 : i32) : i32 + %2 = llvm.mlir.constant(3 : i32) : i32 + %3 = llvm.mlir.constant(4 : i32) : i32 + // CHECK: llvm.intr.assume %0 ["tag1"(%1, %2 : i32, i32), "tag2"(%3 : i32)] : i1 + llvm.intr.assume %0 ["tag1"(%1, %2 : i32, i32), "tag2"(%3 : i32)] : i1 + llvm.return +} diff --git a/mlir/test/Dialect/Linalg/matmul-shared-memory-padding.mlir b/mlir/test/Dialect/Linalg/matmul-shared-memory-padding.mlir index 9c223737750a9b8963d35b1e2e45794c522b4b36..8a3bb1bc52dc5d21d11d19e3bf72dcb867ab71f5 100644 --- a/mlir/test/Dialect/Linalg/matmul-shared-memory-padding.mlir +++ b/mlir/test/Dialect/Linalg/matmul-shared-memory-padding.mlir @@ -59,7 +59,7 @@ module attributes {transform.with_named_sequence} { // Pad linalg.matmul. %padded, %pad, %copy_back = transform.structured.pad %tiled_linalg_op {padding_values=[0.0 : f32, 0.0 : f32, 0.0 : f32], - padding_dimensions=[0, 1, 2], pack_paddings=[1, 1, 1], + padding_dimensions=[0, 1, 2], nofold_flags=[1, 1, 1], copy_back_op = "linalg.copy"} : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) @@ -180,7 +180,7 @@ module attributes {transform.with_named_sequence} { // Pad linalg.matmul. %padded, %pad, %copy_back = transform.structured.pad %tiled_linalg_op {padding_values=[0.0 : f32, 0.0 : f32, 0.0 : f32], - padding_dimensions=[0, 1, 2], pack_paddings=[1, 1, 1], + padding_dimensions=[0, 1, 2], nofold_flags=[1, 1, 1], copy_back_op = "linalg.copy"} : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) diff --git a/mlir/test/Dialect/Linalg/pad-to-specific-memory-space.mlir b/mlir/test/Dialect/Linalg/pad-to-specific-memory-space.mlir index 5e5657980ba120feef947f4b8f77a1e11099f6fc..373ed5f0d7908a1f135ba60064a2acab77ddc490 100644 --- a/mlir/test/Dialect/Linalg/pad-to-specific-memory-space.mlir +++ b/mlir/test/Dialect/Linalg/pad-to-specific-memory-space.mlir @@ -54,7 +54,7 @@ module attributes {transform.with_named_sequence} { %padded, %pad, %copy_back = transform.structured.pad %0 { padding_values=[0.0 : f32, 0.0 : f32, 0.0 : f32], padding_dimensions=[0, 1, 2], - pack_paddings=[1, 1, 1] + nofold_flags=[1, 1, 1] } : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) %buffer, %new_ops = transform.structured.bufferize_to_allocation %pad {memory_space = 3, emit_dealloc} : !transform.any_op %2 = transform.bufferization.one_shot_bufferize %arg1 {bufferize_function_boundaries=true} : (!transform.any_op) -> !transform.any_op @@ -115,7 +115,7 @@ module attributes {transform.with_named_sequence} { %padded, %pad, %copy_back = transform.structured.pad %0 { padding_values=[0.0 : f32, 0.0 : f32, 0.0 : f32], padding_dimensions=[0, 1, 2], - pack_paddings=[1, 1, 1] + nofold_flags=[1, 1, 1] } : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) transform.structured.vectorize %pad vector_sizes [10, 12] : !transform.any_op %vector_write = transform.structured.match ops{["vector.transfer_write"]} in %arg1 : (!transform.any_op) -> !transform.any_op diff --git a/mlir/test/Dialect/Linalg/transform-op-pad.mlir b/mlir/test/Dialect/Linalg/transform-op-pad.mlir index 120a525f3bdae9c41a2016e86e689cc8596c062a..ab2711545405ea6778051a1c8eab24e813dbaae4 100644 --- a/mlir/test/Dialect/Linalg/transform-op-pad.mlir +++ b/mlir/test/Dialect/Linalg/transform-op-pad.mlir @@ -39,7 +39,7 @@ module attributes {transform.with_named_sequence} { %padded, %pad, %copy_back = transform.structured.pad %0 { padding_values=[0.0 : f32, 0.0 : f32, 0.0 : f32], 
padding_dimensions=[0, 1, 2], - pack_paddings=[1, 1, 0] + nofold_flags=[1, 1, 0] } : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.op<"bufferization.materialize_in_destination">) %p = transform.num_associations %copy_back : (!transform.op<"bufferization.materialize_in_destination">) -> !transform.param // expected-remark @below {{1}} @@ -76,7 +76,7 @@ module attributes {transform.with_named_sequence} { %padded, %pad, %copy_back = transform.structured.pad %0 pad_to_multiple_of [2, 2, 1] { padding_values=[0.0 : f32, 0.0 : f32, 0.0 : f32], padding_dimensions=[0, 1, 2], - pack_paddings=[1, 1, 0] + nofold_flags=[1, 1, 0] } : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) transform.yield } @@ -112,7 +112,7 @@ module attributes {transform.with_named_sequence} { %padded, %pad, %copy_back = transform.structured.pad %0 pad_to_multiple_of [%c2, 2, 1] { padding_values=[0.0 : f32, 0.0 : f32, 0.0 : f32], padding_dimensions=[0, 1, 2], - pack_paddings=[1, 1, 0] + nofold_flags=[1, 1, 0] } : (!transform.any_op, !transform.param) -> (!transform.any_op, !transform.any_op, !transform.any_op) transform.yield } @@ -155,7 +155,7 @@ module attributes {transform.with_named_sequence} { %padded, %pad, %copy_back = transform.structured.pad %0 { padding_values=[0.0 : f32, 0.0 : f32, 0.0 : f32], padding_dimensions=[0, 1, 2], - pack_paddings=[1, 1, 0] + nofold_flags=[1, 1, 0] } : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) transform.yield } @@ -178,7 +178,7 @@ module attributes {transform.with_named_sequence} { %padded, %pad, %copy_back = transform.structured.pad %0 { padding_values=[0: i32, 0.0 : f32, 0.0 : f32], padding_dimensions=[0, 1, 2], - pack_paddings=[1, 1, 0] + nofold_flags=[1, 1, 0] } : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) transform.yield } @@ -201,7 +201,7 @@ module attributes {transform.with_named_sequence} { %padded, %pad, %copy_back = transform.structured.pad %0 { padding_values=["{foo}", 0.0 : f32, 0.0 : f32], padding_dimensions=[0, 1, 2], - pack_paddings=[1, 1, 0] + nofold_flags=[1, 1, 0] } : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) transform.yield } @@ -210,7 +210,7 @@ module attributes {transform.with_named_sequence} { // ----- // With all padded being static, there's nothing to pad. However, with the -// `nofold` attribute set (see `pack_paddings`), the corresponding pad Ops are +// `nofold` attribute set (see `nofold_flags`), the corresponding pad Ops are // preserved. // CHECK-LABEL: @zero_pad_static( @@ -239,7 +239,7 @@ module attributes {transform.with_named_sequence} { %padded, %pad, %copy_back = transform.structured.pad %0 { padding_values=[0.0 : f32, 0.0 : f32, 0.0 : f32], padding_dimensions=[0, 1, 2], - pack_paddings=[1, 1, 0] + nofold_flags=[1, 1, 0] } : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) transform.yield } @@ -248,7 +248,7 @@ module attributes {transform.with_named_sequence} { // ----- // With all padded dims being static, there's nothing to pad. However, with the -// `nofold` attribute set (see `pack_paddings`), the corresponding pad Ops are +// `nofold` attribute set (see `nofold_flags`), the corresponding pad Ops are // preserved. Same as above, but some dims are now dynamic. 
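+// (`nofold_flags` is the new name of `pack_paddings`; a value of 1 sets the
+// `nofold` attribute on the corresponding pad op so it is preserved.)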
// CHECK-LABEL: @zero_pad_dynamic( @@ -278,7 +278,7 @@ module attributes {transform.with_named_sequence} { padding_values=[0.0 : f32, 0.0 : f32, 0.0 : f32], // Note - only the static dim is padded padding_dimensions=[2], - pack_paddings=[1, 1, 1] + nofold_flags=[1, 1, 1] } : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) transform.yield } @@ -305,7 +305,7 @@ module attributes {transform.with_named_sequence} { padding_values=[0.0 : f32, 0.0 : f32, 0.0 : f32], // Note - attempting to pad non-static dim padding_dimensions=[1], - pack_paddings=[1, 1, 1] + nofold_flags=[1, 1, 1] } : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) transform.yield } @@ -362,7 +362,7 @@ module attributes {transform.with_named_sequence} { %padded, %pad, %copy_back = transform.structured.pad %0 { padding_values=[0.0 : f32, 0.0 : f32, 0.0 : f32], padding_dimensions=[0, 1, 2], - pack_paddings=[1, 1, 1] + nofold_flags=[1, 1, 1] } : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) transform.yield } @@ -414,7 +414,7 @@ module attributes {transform.with_named_sequence} { %padded, %pad, %copy_back = transform.structured.pad %0 { padding_values=[0.0 : f32, 0.0 : f32, 0.0 : f32], padding_dimensions=[0, 1, 2], - pack_paddings=[1, 1, 1] + nofold_flags=[1, 1, 1] } : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) transform.yield } diff --git a/mlir/test/Dialect/Linalg/transform-ops-invalid.mlir b/mlir/test/Dialect/Linalg/transform-ops-invalid.mlir index e86d4962530a9a6dd1f375b1cabcbc2a8bf6e81e..a30b56c7c58e8fa121084540a13fd6d120f8c997 100644 --- a/mlir/test/Dialect/Linalg/transform-ops-invalid.mlir +++ b/mlir/test/Dialect/Linalg/transform-ops-invalid.mlir @@ -18,8 +18,8 @@ transform.sequence failures(propagate) { transform.sequence failures(propagate) { ^bb0(%arg0: !transform.any_op): - // expected-error@below {{expects pack_paddings to contain booleans (0/1), found [1, 7]}} - transform.structured.pad %arg0 {pack_paddings=[1, 7]} : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) + // expected-error@below {{expects nofold_flags to contain booleans (0/1), found [1, 7]}} + transform.structured.pad %arg0 {nofold_flags=[1, 7]} : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) } // ----- diff --git a/mlir/test/Dialect/MemRef/emulate-narrow-type.mlir b/mlir/test/Dialect/MemRef/emulate-narrow-type.mlir index 540da239fced0879db15369181f83997490598e7..1d6cbfa343ba5d37e83191b22c39adb635de469e 100644 --- a/mlir/test/Dialect/MemRef/emulate-narrow-type.mlir +++ b/mlir/test/Dialect/MemRef/emulate-narrow-type.mlir @@ -203,7 +203,6 @@ func.func @memref_subview_dynamic_offset_i4(%idx : index) -> i4 { // ----- - func.func @negative_memref_subview_non_contiguous(%idx : index) -> i4 { %c0 = arith.constant 0 : index %arr = memref.alloc() : memref<40x40xi4> @@ -543,13 +542,15 @@ func.func @memref_copy_i4(%arg0: memref<32x128xi4, 1>, %arg1: memref<32x128xi4>) // ----- -!colMajor = memref<8x8xi4, strided<[1, 8]>> -func.func @copy_distinct_layouts(%idx : index) -> i4 { - %c0 = arith.constant 0 : index - %arr = memref.alloc() : memref<8x8xi4> - %arr2 = memref.alloc() : !colMajor - // expected-error @+1 {{failed to legalize operation 'memref.copy' that was explicitly marked illegal}} - memref.copy %arr, %arr2 : memref<8x8xi4> to !colMajor - %ld = memref.load %arr2[%c0, %c0] : !colMajor - return %ld : i4 +func.func @alloc_non_contiguous() { + // expected-error 
@+1 {{failed to legalize operation 'memref.alloc' that was explicitly marked illegal}} + %arr = memref.alloc() : memref<8x8xi4, strided<[1, 8]>> + return +} + +// ----- + +// expected-error @+1 {{failed to legalize operation 'func.func' that was explicitly marked illegal}} +func.func @argument_non_contiguous(%arg0 : memref<8x8xi4, strided<[1, 8]>>) { + return } diff --git a/mlir/test/Dialect/MemRef/emulate-wide-int.mlir b/mlir/test/Dialect/MemRef/emulate-wide-int.mlir index 65ac5beed0a1ded461b411a747669f9925cbd6b9..994e400bd73c1bbc2ba23da7eebe3fa4f305bd88 100644 --- a/mlir/test/Dialect/MemRef/emulate-wide-int.mlir +++ b/mlir/test/Dialect/MemRef/emulate-wide-int.mlir @@ -1,4 +1,5 @@ -// RUN: mlir-opt --memref-emulate-wide-int="widest-int-supported=32" %s | FileCheck %s +// RUN: mlir-opt --memref-emulate-wide-int="widest-int-supported=32" %s \ +// RUN: --split-input-file --verify-diagnostics | FileCheck %s // Expect no conversions, i32 is supported. // CHECK-LABEL: func @memref_i32 @@ -15,6 +16,8 @@ func.func @memref_i32() { return } +// ----- + // Expect no conversions, f64 is not an integer type. // CHECK-LABEL: func @memref_f32 // CHECK: [[M:%.+]] = memref.alloc() : memref<4xf32, 1> @@ -30,6 +33,8 @@ func.func @memref_f32() { return } +// ----- + // CHECK-LABEL: func @alloc_load_store_i64 // CHECK: [[C1:%.+]] = arith.constant dense<[1, 0]> : vector<2xi32> // CHECK-NEXT: [[M:%.+]] = memref.alloc() : memref<4xvector<2xi32>, 1> @@ -45,6 +50,7 @@ func.func @alloc_load_store_i64() { return } +// ----- // CHECK-LABEL: func @alloc_load_store_i64_nontemporal // CHECK: [[C1:%.+]] = arith.constant dense<[1, 0]> : vector<2xi32> @@ -60,3 +66,30 @@ func.func @alloc_load_store_i64_nontemporal() { memref.store %c1, %m[%c0] {nontemporal = true} : memref<4xi64, 1> return } + +// ----- + +// Make sure we do not crash on unsupported types. 
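+// (i128 is wider than what the emulation supports with
+// widest-int-supported=32, so legalization is expected to fail with a
+// diagnostic rather than crash.)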
+func.func @alloc_i128() { + // expected-error@+1 {{failed to legalize operation 'memref.alloc' that was explicitly marked illegal}} + %m = memref.alloc() : memref<4xi128, 1> + return +} + +// ----- + +func.func @load_i128(%m: memref<4xi128, 1>) { + %c0 = arith.constant 0 : index + // expected-error@+1 {{failed to legalize operation 'memref.load' that was explicitly marked illegal}} + %v = memref.load %m[%c0] : memref<4xi128, 1> + return +} + +// ----- + +func.func @store_i128(%c1: i128, %m: memref<4xi128, 1>) { + %c0 = arith.constant 0 : index + // expected-error@+1 {{failed to legalize operation 'memref.store' that was explicitly marked illegal}} + memref.store %c1, %m[%c0] : memref<4xi128, 1> + return +} diff --git a/mlir/test/Dialect/OpenMP/invalid.mlir b/mlir/test/Dialect/OpenMP/invalid.mlir index f7a87713aca35191aecae008e0a904580c9b7bfe..fd89ec31c64a6001ef7bbcaf6a3568e83ff04a35 100644 --- a/mlir/test/Dialect/OpenMP/invalid.mlir +++ b/mlir/test/Dialect/OpenMP/invalid.mlir @@ -136,9 +136,11 @@ func.func @invalid_nested_wrapper(%lb : index, %ub : index, %step : index) { // expected-error @below {{only supported nested wrapper is 'omp.simd'}} omp.wsloop { omp.distribute { - omp.loop_nest (%iv) : index = (%lb) to (%ub) step (%step) { - omp.yield - } + omp.simd { + omp.loop_nest (%iv) : index = (%lb) to (%ub) step (%step) { + omp.yield + } + } {omp.composite} } {omp.composite} } {omp.composite} } @@ -1975,7 +1977,7 @@ func.func @taskloop(%lb: i32, %ub: i32, %step: i32) { omp.loop_nest (%iv) : i32 = (%lb) to (%ub) step (%step) { omp.yield } - } {omp.composite} + } } {omp.composite} return } @@ -2188,7 +2190,7 @@ func.func @omp_distribute_nested_wrapper2(%lb: index, %ub: index, %step: index) omp.loop_nest (%iv) : index = (%lb) to (%ub) step (%step) { "omp.yield"() : () -> () } - }) {omp.composite} : () -> () + }) : () -> () } {omp.composite} } diff --git a/mlir/test/Dialect/SCF/loop-pipelining.mlir b/mlir/test/Dialect/SCF/loop-pipelining.mlir index af49d2afc049ba807618626a015c4a2023fa4b52..c879c83275bf86503276b5e8c28816e61c85d4d7 100644 --- a/mlir/test/Dialect/SCF/loop-pipelining.mlir +++ b/mlir/test/Dialect/SCF/loop-pipelining.mlir @@ -767,6 +767,7 @@ func.func @stage_0_value_escape(%A: memref, %result: memref, %ub: // Check for predicated epilogue for dynamic loop. 
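+// (Because the trip count is unknown, each peeled epilogue stage is guarded
+// by an scf.if on the computed iteration count, as the checks below verify.)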
// CHECK-LABEL: dynamic_loop( // CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index // CHECK-DAG: %[[CM1:.*]] = arith.constant -1 : index // CHECK: %[[UBM:.*]] = arith.subi %[[UB:.*]], %{{.*}} @@ -779,32 +780,32 @@ func.func @stage_0_value_escape(%A: memref, %result: memref, %ub: // CHECK: scf.yield %[[ADDF_24]], %[[LOAD_27]] // CHECK: } // CHECK: %[[CMPI_10:.*]] = arith.cmpi slt, %[[STEP]], %[[C0]] -// CHECK: %[[SEL_10:.*]] = arith.select %[[CMPI_10]], %[[C1]], %[[CM1]] -// CHECK: %[[SUBI_10:.*]] = arith.subi %[[UB]], %[[LB]] -// CHECK: %[[ADDI_11:.*]] = arith.addi %[[SUBI_10]], %[[STEP]] -// CHECK: %[[ADDI_12:.*]] = arith.addi %[[ADDI_11]], %[[SEL_10]] -// CHECK: %[[DIVSI_13:.*]] = arith.divsi %[[ADDI_12]], %[[STEP]] -// CHECK: %[[ADDI_14:.*]] = arith.addi %[[DIVSI_13]], %[[CM1]] -// CHECK: %[[MULI_15:.*]] = arith.muli %{{.*}}, %[[ADDI_14]] -// CHECK: %[[ADDI_16:.*]] = arith.addi %{{.*}}, %[[MULI_15]] -// CHECK: %[[CMPI_17:.*]] = arith.cmpi sge, %[[ADDI_14]], %[[C0]] -// CHECK: %[[ADDI_18:.*]] = arith.addi %[[DIVSI_13]], %{{.*}}-1 -// CHECK: %[[ADDI_19:.*]] = arith.addi %[[ADDI_18]], %{{.*}}-1 -// CHECK: %[[MULI_20:.*]] = arith.muli %{{.*}}, %[[ADDI_19]] -// CHECK: %[[ADDI_21:.*]] = arith.addi %{{.*}}, %[[MULI_20]] -// CHECK: %[[CMPI_22:.*]] = arith.cmpi sge, %[[ADDI_19]], %[[C0]] -// CHECK: scf.if %[[CMPI_17]] { -// CHECK: memref.store %{{.*}}#0, %{{.*}}[%[[ADDI_21]]] +// CHECK: %[[SELECT_11:.*]] = arith.select %[[CMPI_10]], %[[C1]], %[[CM1]] +// CHECK: %[[SUBI_12:.*]] = arith.subi %[[UB]], %[[LB]] +// CHECK: %[[ADDI_13:.*]] = arith.addi %[[SUBI_12]], %[[STEP]] +// CHECK: %[[ADDI_14:.*]] = arith.addi %[[ADDI_13]], %[[SELECT_11]] +// CHECK: %[[DIVSI_15:.*]] = arith.divsi %[[ADDI_14]], %[[STEP]] +// CHECK: %[[SUBI_17:.*]] = arith.subi %[[DIVSI_15]], %[[C2]] +// CHECK: %[[MAXSI_18:.*]] = arith.maxsi %[[SUBI_17]], %[[C0]] +// CHECK: %[[MULI_19:.*]] = arith.muli %[[STEP]], %[[MAXSI_18]] +// CHECK: %[[ADDI_20:.*]] = arith.addi %[[LB]], %[[MULI_19]] +// CHECK: %[[ADDI_21:.*]] = arith.addi %[[MAXSI_18]], %[[C1]] +// CHECK: %[[CMPI_22:.*]] = arith.cmpi sge, %[[DIVSI_15]], %[[C1]] +// CHECK: %[[MULI_23:.*]] = arith.muli %[[STEP]], %[[ADDI_21]] +// CHECK: %[[ADDI_24:.*]] = arith.addi %[[LB]], %[[MULI_23]] +// CHECK: %[[CMPI_25:.*]] = arith.cmpi sge, %[[DIVSI_15]], %[[C2]] +// CHECK: scf.if %[[CMPI_22]] { +// CHECK: memref.store %{{.*}}#0, %{{.*}}[%[[ADDI_20]]] // CHECK: } else { // CHECK: } -// CHECK: %[[IF_23:.*]] = scf.if %[[CMPI_22]] -> (f32) { -// CHECK: %[[ADDF_24:.*]] = arith.addf %{{.*}}#1, %{{.*}} -// CHECK: scf.yield %[[ADDF_24]] +// CHECK: %[[IF_26:.*]] = scf.if %[[CMPI_25]] +// CHECK: %[[ADDF_27:.*]] = arith.addf %{{.*}}#1, %{{.*}} +// CHECK: scf.yield %[[ADDF_27]] // CHECK: } else { // CHECK: scf.yield %{{.*}} // CHECK: } -// CHECK: scf.if %[[CMPI_22]] { -// CHECK: memref.store %[[IF_23]], %{{.*}}[%[[ADDI_16]]] +// CHECK: scf.if %[[CMPI_25]] { +// CHECK: memref.store %[[IF_26]], %{{.*}}[%[[ADDI_24]]] // CHECK: } else { // CHECK: } // CHECK: return @@ -842,6 +843,7 @@ func.func @dynamic_loop(%A: memref, %result: memref, %lb: index, % // CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index // CHECK-DAG: %[[CM1:.*]] = arith.constant -1 : index +// CHECK-DAG: %[[CF0:.*]] = arith.constant 0.000000e+00 // CHECK: %[[UBM:.*]] = arith.subi %[[UB:.*]], %{{.*}} // CHECK: %{{.*}}:2 = scf.for %[[ARG5:.*]] = %[[LB:.*]] to %[[UBM]] step %[[STEP:.*]] 
iter_args(%[[ARG6:.*]] = %{{.*}}, %[[ARG7:.*]] = %{{.*}}) // CHECK: %[[ADDF_13:.*]] = arith.addf %[[ARG7]], %[[ARG6]] @@ -856,22 +858,21 @@ func.func @dynamic_loop(%A: memref, %result: memref, %lb: index, % // CHECK: %[[ADDI_7:.*]] = arith.addi %[[SUBI_6]], %[[STEP]] // CHECK: %[[ADDI_8:.*]] = arith.addi %[[ADDI_7]], %[[SELECT_5]] // CHECK: %[[DIVSI_9:.*]] = arith.divsi %[[ADDI_8]], %[[STEP]] -// CHECK: %[[ADDI_10:.*]] = arith.addi %[[DIVSI_9]], %[[CM1]] -// CHECK: %[[CMPI_11:.*]] = arith.cmpi sge, %[[ADDI_10]], %[[C0]] -// CHECK: %[[IF_10:.*]] = scf.if %[[CMPI_11]] -// CHECK: %[[ADDF_13:.*]] = arith.addf %{{.*}}#1, %{{.*}}#0 -// CHECK: scf.yield %[[ADDF_13]] +// CHECK: %[[CMPI_10:.*]] = arith.cmpi sge, %[[DIVSI_9]], %[[C1]] +// CHECK: %[[IF_11:.*]] = scf.if %[[CMPI_10]] +// CHECK: %[[ADDF_14:.*]] = arith.addf %{{.*}}#1, %{{.*}}#0 +// CHECK: scf.yield %[[ADDF_14]] // CHECK: } else { -// CHECK: scf.yield %{{.*}} +// CHECK: scf.yield %[[CF0]] // CHECK: } -// CHECK: %[[IF_11:.*]] = scf.if %[[CMPI_11]] -// CHECK: %[[MULF_13:.*]] = arith.mulf %[[IF_10]], %{{.*}} -// CHECK: scf.yield %[[MULF_13]] +// CHECK: %[[IF_12:.*]] = scf.if %[[CMPI_10]] +// CHECK: %[[MULF_14:.*]] = arith.mulf %[[IF_11]], %{{.*}} +// CHECK: scf.yield %[[MULF_14]] // CHECK: } else { -// CHECK: scf.yield %{{.*}} +// CHECK: scf.yield %[[CF0]] // CHECK: } -// CHECK: %[[SELECT_12:.*]] = arith.select %[[CMPI_11]], %[[IF_11]], %{{.*}}#0 -// CHECK: memref.store %[[SELECT_12]], %{{.*}}[%{{.*}}] +// CHECK: %[[SELECT_13:.*]] = arith.select %[[CMPI_10]], %[[IF_12]], %{{.*}}#0 +// CHECK: memref.store %[[SELECT_13]], %{{.*}}[%[[C0]]] func.func @dynamic_loop_result(%A: memref, %result: memref, %lb: index, %ub: index, %step: index) { %cf0 = arith.constant 1.0 : f32 %cf1 = arith.constant 33.0 : f32 diff --git a/mlir/test/Integration/Dialect/Arith/CPU/addition.mlir b/mlir/test/Integration/Dialect/Arith/CPU/addition.mlir deleted file mode 100644 index b6acfd53c1f5d9c72607640f2f829afc222a042b..0000000000000000000000000000000000000000 --- a/mlir/test/Integration/Dialect/Arith/CPU/addition.mlir +++ /dev/null @@ -1,88 +0,0 @@ -// RUN: mlir-opt %s --convert-scf-to-cf --convert-cf-to-llvm --convert-vector-to-llvm \ -// RUN: --convert-func-to-llvm --convert-arith-to-llvm | \ -// RUN: mlir-cpu-runner -e entry -entry-point-result=void \ -// RUN: --shared-libs=%mlir_c_runner_utils | \ -// RUN: FileCheck %s --match-full-lines - -func.func @addi_i1(%v1 : i1, %v2 : i1) { - vector.print str "@addi_i1\n" - %res = arith.addi %v1, %v2 : i1 - vector.print %res : i1 - return -} - -func.func @addi() { - // ------------------------------------------------ - // Test i1 - // ------------------------------------------------ - - // addi on i1 - // addi(0, 1) : i1 = 1 : i1; addi(0, -1) : i1 = 1 - %false = arith.constant 0 : i1 - %true = arith.constant 1 : i1 - - // CHECK-LABEL: @addi_i1 - // CHECK-NEXT: 1 - func.call @addi_i1(%false, %true) : (i1, i1) -> () - - // CHECK-LABEL: @addi_i1 - // CHECK-NEXT: 1 - %true_based_on_non_zero_val = arith.constant -1 : i1 - func.call @addi_i1(%false, %true_based_on_non_zero_val) : (i1, i1) -> () - - // ------------------------------------------------ - // TODO: Test i8, i16 etc.. 
- // ------------------------------------------------ - - return -} - -func.func @addui_extended_i1(%v1 : i1, %v2 : i1) { - vector.print str "@addui_extended_i1\n" - %res, %overflow = arith.addui_extended %v1, %v2 : i1, i1 - vector.print %res : i1 - vector.print %overflow : i1 - return -} - -func.func @addi_extended() { - // ------------------------------------------------ - // Test i1 - // ------------------------------------------------ - - // addui_extended on i1 - // addui_extended 1 1 : i1 = 0, 1 - %true = arith.constant 1 : i1 - %false = arith.constant 0 : i1 - - // CHECK-LABEL: @addui_extended_i1 - // CHECK-NEXT: 0 - // CHECK-NEXT: 1 - func.call @addui_extended_i1(%true, %true) : (i1, i1) -> () - - // CHECK-LABEL: @addui_extended_i1 - // CHECK-NEXT: 1 - // CHECK-NEXT: 0 - func.call @addui_extended_i1(%true, %false) : (i1, i1) -> () - - // CHECK-LABEL: @addui_extended_i1 - // CHECK-NEXT: 1 - // CHECK-NEXT: 0 - func.call @addui_extended_i1(%false, %true) : (i1, i1) -> () - - // CHECK-LABEL: @addui_extended_i1 - // CHECK-NEXT: 0 - // CHECK-NEXT: 0 - func.call @addui_extended_i1(%false, %false) : (i1, i1) -> () - - // ------------------------------------------------ - // TODO: Test i8, i16 etc.. - // ------------------------------------------------ - return -} - -func.func @entry() { - func.call @addi() : () -> () - func.call @addi_extended() : () -> () - return -} diff --git a/mlir/test/Integration/Dialect/Arith/CPU/comparison.mlir b/mlir/test/Integration/Dialect/Arith/CPU/comparison.mlir deleted file mode 100644 index 418fbb0c0a94c7e97e946897733a85cf23c89203..0000000000000000000000000000000000000000 --- a/mlir/test/Integration/Dialect/Arith/CPU/comparison.mlir +++ /dev/null @@ -1,174 +0,0 @@ -// RUN: mlir-opt %s --convert-scf-to-cf --convert-cf-to-llvm --convert-vector-to-llvm \ -// RUN: --convert-func-to-llvm --convert-arith-to-llvm | \ -// RUN: mlir-cpu-runner -e entry -entry-point-result=void \ -// RUN: --shared-libs=%mlir_c_runner_utils | \ -// RUN: FileCheck %s --match-full-lines - -func.func @cmpi_eq_i1(%v1 : i1, %v2 : i1) { - vector.print str "@cmpi_eq_i1\n" - %res = arith.cmpi eq, %v1, %v2 : i1 - vector.print %res : i1 - return -} - -func.func @cmpi_slt_i1(%v1 : i1, %v2 : i1) { - vector.print str "@cmpi_slt_i1\n" - %res = arith.cmpi slt, %v1, %v2 : i1 - vector.print %res : i1 - return -} - -func.func @cmpi_sle_i1(%v1 : i1, %v2 : i1) { - vector.print str "@cmpi_sle_i1\n" - %res = arith.cmpi sle, %v1, %v2 : i1 - vector.print %res : i1 - return -} - -func.func @cmpi_sgt_i1(%v1 : i1, %v2 : i1) { - vector.print str "@cmpi_sgt_i1\n" - %res = arith.cmpi sgt, %v1, %v2 : i1 - vector.print %res : i1 - return -} - -func.func @cmpi_sge_i1(%v1 : i1, %v2 : i1) { - vector.print str "@cmpi_sge_i1\n" - %res = arith.cmpi sge, %v1, %v2 : i1 - vector.print %res : i1 - return -} - -func.func @cmpi_eq() { - // ------------------------------------------------ - // Test i1 - // ------------------------------------------------ - %false_i1 = arith.constant 0 : i1 - %true_i1 = arith.constant 1 : i1 - %true_i1_n1 = arith.constant -1 : i1 - - // int values 1 and -1 are represented with the same bitvector (`0b1`) - // CHECK-LABEL: @cmpi_eq_i1 - // CHECK-NEXT: 1 - func.call @cmpi_eq_i1(%true_i1, %true_i1_n1) : (i1, i1) -> () - - // CHECK-LABEL: @cmpi_eq_i1 - // CHECK-NEXT: 0 - func.call @cmpi_eq_i1(%false_i1, %true_i1) : (i1, i1) -> () - - // CHECK-LABEL: @cmpi_eq_i1 - // CHECK-NEXT: 0 - func.call @cmpi_eq_i1(%true_i1, %false_i1) : (i1, i1) -> () - - // CHECK-LABEL: @cmpi_eq_i1 - // CHECK-NEXT: 1 - 
func.call @cmpi_eq_i1(%true_i1, %true_i1) : (i1, i1) -> () - - // CHECK-LABEL: @cmpi_eq_i1 - // CHECK-NEXT: 1 - func.call @cmpi_eq_i1(%false_i1, %false_i1) : (i1, i1) -> () - - %false = arith.constant false - %true = arith.constant true - - // CHECK-LABEL: @cmpi_eq_i1 - // CHECK-NEXT: 1 - func.call @cmpi_eq_i1(%true, %true_i1) : (i1, i1) -> () - - // CHECK-LABEL: @cmpi_eq_i1 - // CHECK-NEXT: 1 - func.call @cmpi_eq_i1(%false, %false_i1) : (i1, i1) -> () - - // CHECK-LABEL: @cmpi_eq_i1 - // CHECK-NEXT: 1 - func.call @cmpi_eq_i1(%true, %true_i1_n1) : (i1, i1) -> () - - // ------------------------------------------------ - // TODO: Test i8, i16 etc.. - // ------------------------------------------------ - return -} - -func.func @cmpi_signed() { - // ------------------------------------------------ - // Test i1 - // ------------------------------------------------ - %false_i1 = arith.constant 0 : i1 - %true_i1 = arith.constant 1 : i1 - %true_i1_n1 = arith.constant -1 : i1 - - // int values 1 and -1 are represented with the same bitvector (`0b1`) - // But, bitvector `1` is interpreted as int value -1 in signed comparison - - // CHECK-LABEL: @cmpi_sge_i1 - // CHECK-NEXT: 1 - func.call @cmpi_sge_i1(%false_i1, %true_i1_n1) : (i1, i1) -> () - - // CHECK-LABEL: @cmpi_sge_i1 - // CHECK-NEXT: 1 - func.call @cmpi_sge_i1(%false_i1, %true_i1) : (i1, i1) -> () - - // CHECK-LABEL: @cmpi_sge_i1 - // CHECK-NEXT: 0 - func.call @cmpi_sge_i1(%true_i1, %false_i1) : (i1, i1) -> () - - %false = arith.constant false - %true = arith.constant true - - // CHECK-LABEL: @cmpi_slt_i1 - // CHECK-NEXT: 0 - func.call @cmpi_slt_i1(%false, %true) : (i1, i1) -> () - - // CHECK-LABEL: @cmpi_sle_i1 - // CHECK-NEXT: 0 - func.call @cmpi_sle_i1(%false, %true) : (i1, i1) -> () - - // CHECK-LABEL: @cmpi_sgt_i1 - // CHECK-NEXT: 1 - func.call @cmpi_sgt_i1(%false, %true) : (i1, i1) -> () - - // CHECK-LABEL: @cmpi_sge_i1 - // CHECK-NEXT: 1 - func.call @cmpi_sge_i1(%false, %true) : (i1, i1) -> () - - // CHECK-LABEL: @cmpi_sge_i1 - // CHECK-NEXT: 0 - func.call @cmpi_sge_i1(%true, %false) : (i1, i1) -> () - - // ------------------------------------------------ - // TODO: Test i8, i16 etc.. - // ------------------------------------------------ - return -} - -func.func @cmpi_ult_index(%v1 : index, %v2 : index) { - vector.print str "@cmpi_ult_index\n" - %res = arith.cmpi ult, %v1, %v2 : index - vector.print %res : i1 - return -} - -func.func @cmpi_unsigned() { - // ------------------------------------------------ - // Test index - // ------------------------------------------------ - // 0 `ult` -2^63 = true - %zero = arith.constant 0 : index - %index_min = arith.constant -9223372036854775808 : index - - // CHECK-LABEL: @cmpi_ult_index - // CHECK-NEXT: 1 - func.call @cmpi_ult_index(%zero, %index_min) : (index, index) -> () - - // ------------------------------------------------ - // TODO: i1, i8, i16, uge, ule etc.. 
- // ------------------------------------------------ - return -} - -func.func @entry() { - func.call @cmpi_eq() : () -> () - func.call @cmpi_signed() : () -> () - func.call @cmpi_unsigned() : () -> () - return -} diff --git a/mlir/test/Integration/Dialect/Arith/CPU/multiplication.mlir b/mlir/test/Integration/Dialect/Arith/CPU/multiplication.mlir deleted file mode 100644 index 21fd816788431e79aa17db8b81743e789ab9088d..0000000000000000000000000000000000000000 --- a/mlir/test/Integration/Dialect/Arith/CPU/multiplication.mlir +++ /dev/null @@ -1,119 +0,0 @@ -// RUN: mlir-opt %s --convert-scf-to-cf --convert-cf-to-llvm --convert-vector-to-llvm \ -// RUN: --convert-func-to-llvm --convert-arith-to-llvm | \ -// RUN: mlir-cpu-runner -e entry -entry-point-result=void \ -// RUN: --shared-libs=%mlir_c_runner_utils | \ -// RUN: FileCheck %s --match-full-lines - -func.func @mulsi_extended_i1(%v1 : i1, %v2 : i1) { - vector.print str "@mulsi_extended_i1\n" - %low, %high = arith.mulsi_extended %v1, %v2 : i1 - vector.print %low : i1 - vector.print %high : i1 - return -} - -func.func @mulsi_extended_i8(%v1 : i8, %v2 : i8) { - vector.print str "@mulsi_extended_i8\n" - %low, %high = arith.mulsi_extended %v1, %v2 : i8 - vector.print %low : i8 - vector.print %high : i8 - return -} - -func.func @mulsi_extended() { - // ------------------------------------------------ - // Test i1 - // ------------------------------------------------ - - // mulsi_extended on i1, tests for overflow bit - // mulsi_extended 1, 1 : i1 = (1, 0) - %true = arith.constant true - %false = arith.constant false - - // CHECK-LABEL: @mulsi_extended_i1 - // CHECK-NEXT: 1 - // CHECK-NEXT: 0 - func.call @mulsi_extended_i1(%true, %true) : (i1, i1) -> () - - // CHECK-LABEL: @mulsi_extended_i1 - // CHECK-NEXT: 0 - // CHECK-NEXT: 0 - func.call @mulsi_extended_i1(%true, %false) : (i1, i1) -> () - - // CHECK-LABEL: @mulsi_extended_i1 - // CHECK-NEXT: 0 - // CHECK-NEXT: 0 - func.call @mulsi_extended_i1(%false, %true) : (i1, i1) -> () - - // CHECK-LABEL: @mulsi_extended_i1 - // CHECK-NEXT: 0 - // CHECK-NEXT: 0 - func.call @mulsi_extended_i1(%false, %false) : (i1, i1) -> () - - // ------------------------------------------------ - // Test i8 - // ------------------------------------------------ - // mulsi extended versions, with overflow - %c_100_i8 = arith.constant -100 : i8 - - // mulsi_extended -100, -100 : i8 = (16, 39) - // CHECK-LABEL: @mulsi_extended_i8 - // CHECK-NEXT: 16 - // CHECK-NEXT: 39 - func.call @mulsi_extended_i8(%c_100_i8, %c_100_i8) : (i8, i8) -> () - - // ------------------------------------------------ - // TODO: Test i16, i32 etc.. - // ------------------------------------------------ - return -} - -func.func @mului_extended_i8(%v1 : i8, %v2 : i8) { - vector.print str "@mului_extended_i8\n" - %low, %high = arith.mului_extended %v1, %v2 : i8 - vector.print %low : i8 - vector.print %high : i8 - return -} - -func.func @mului_extended() { - // ------------------------------------------------ - // Test i8 - // ------------------------------------------------ - %c_n100_i8 = arith.constant -100 : i8 - %c_156_i8 = arith.constant 156 : i8 - - // mului_extended -100, -100 : i8 = (16, 95) - // and on equivalent representations (e.g. 
156 === -100 (mod 256)) - - // CHECK-LABEL: @mului_extended_i8 - // CHECK-NEXT: 16 - // CHECK-NEXT: 95 - func.call @mului_extended_i8(%c_n100_i8, %c_n100_i8) : (i8, i8) -> () - - // CHECK-LABEL: @mului_extended_i8 - // CHECK-NEXT: 16 - // CHECK-NEXT: 95 - func.call @mului_extended_i8(%c_n100_i8, %c_156_i8) : (i8, i8) -> () - - // CHECK-LABEL: @mului_extended_i8 - // CHECK-NEXT: 16 - // CHECK-NEXT: 95 - func.call @mului_extended_i8(%c_156_i8, %c_n100_i8) : (i8, i8) -> () - - // CHECK-LABEL: @mului_extended_i8 - // CHECK-NEXT: 16 - // CHECK-NEXT: 95 - func.call @mului_extended_i8(%c_156_i8, %c_156_i8) : (i8, i8) -> () - - // ------------------------------------------------ - // TODO: Test i1, i16, i32 etc.. - // ------------------------------------------------ - return -} - -func.func @entry() { - func.call @mulsi_extended() : () -> () - func.call @mului_extended() : () -> () - return -} diff --git a/mlir/test/Target/LLVMIR/Import/intrinsic.ll b/mlir/test/Target/LLVMIR/Import/intrinsic.ll index 28a1bd21c82a38d75d8d0b028c68eb3deefadb84..606b11175f572f8541b8d109f7525c49d0dc66bc 100644 --- a/mlir/test/Target/LLVMIR/Import/intrinsic.ll +++ b/mlir/test/Target/LLVMIR/Import/intrinsic.ll @@ -630,11 +630,21 @@ define void @va_intrinsics_test(ptr %0, ptr %1, ...) { ; CHECK-LABEL: @assume ; CHECK-SAME: %[[TRUE:[a-zA-Z0-9]+]] define void @assume(i1 %true) { - ; CHECK: "llvm.intr.assume"(%[[TRUE]]) : (i1) -> () + ; CHECK: llvm.intr.assume %[[TRUE]] : i1 call void @llvm.assume(i1 %true) ret void } +; CHECK-LABEL: @assume_with_opbundles +; CHECK-SAME: %[[TRUE:[a-zA-Z0-9]+]] +; CHECK-SAME: %[[PTR:[a-zA-Z0-9]+]] +define void @assume_with_opbundles(i1 %true, ptr %p) { + ; CHECK: %[[ALIGN:.+]] = llvm.mlir.constant(8 : i32) : i32 + ; CHECK: llvm.intr.assume %[[TRUE]] ["align"(%[[PTR]], %[[ALIGN]] : !llvm.ptr, i32)] : i1 + call void @llvm.assume(i1 %true) ["align"(ptr %p, i32 8)] + ret void +} + ; CHECK-LABEL: @is_constant ; CHECK-SAME: %[[VAL:[a-zA-Z0-9]+]] define void @is_constant(i32 %0) { diff --git a/mlir/test/Target/LLVMIR/llvmir-intrinsics.mlir b/mlir/test/Target/LLVMIR/llvmir-intrinsics.mlir index 0634a7ba907f1e00ab92dad838334aa77b723839..cb712eb4e1262dd4c5e638130b74e54c10b85e4d 100644 --- a/mlir/test/Target/LLVMIR/llvmir-intrinsics.mlir +++ b/mlir/test/Target/LLVMIR/llvmir-intrinsics.mlir @@ -363,6 +363,21 @@ llvm.func @umin_test(%arg0: i32, %arg1: i32, %arg2: vector<8xi32>, %arg3: vector llvm.return } +// CHECK-LABEL: @assume_without_opbundles +llvm.func @assume_without_opbundles(%cond: i1) { + // CHECK: call void @llvm.assume(i1 %{{.+}}) + llvm.intr.assume %cond : i1 + llvm.return +} + +// CHECK-LABEL: @assume_with_opbundles +llvm.func @assume_with_opbundles(%cond: i1, %p: !llvm.ptr) { + %0 = llvm.mlir.constant(8 : i32) : i32 + // CHECK: call void @llvm.assume(i1 %{{.+}}) [ "align"(ptr %{{.+}}, i32 8) ] + llvm.intr.assume %cond ["align"(%p, %0 : !llvm.ptr, i32)] : i1 + llvm.return +} + // CHECK-LABEL: @vector_reductions llvm.func @vector_reductions(%arg0: f32, %arg1: vector<8xf32>, %arg2: vector<8xi32>) { // CHECK: call i32 @llvm.vector.reduce.add.v8i32 diff --git a/mlir/test/Target/LLVMIR/llvmir-invalid.mlir b/mlir/test/Target/LLVMIR/llvmir-invalid.mlir index af0981440a177643dad1919e60647c5f9978353d..15658ea6068121953ad00a01f978d435286f4b6b 100644 --- a/mlir/test/Target/LLVMIR/llvmir-invalid.mlir +++ b/mlir/test/Target/LLVMIR/llvmir-invalid.mlir @@ -188,7 +188,7 @@ llvm.func @sadd_overflow_intr_wrong_type(%arg0 : i32, %arg1 : f32) -> !llvm.stru llvm.func @assume_intr_wrong_type(%cond : i16) { // 
expected-error @below{{op operand #0 must be 1-bit signless integer, but got 'i16'}} - "llvm.intr.assume"(%cond) : (i16) -> () + llvm.intr.assume %cond : i16 llvm.return } diff --git a/mlir/test/Target/LLVMIR/nvvmir.mlir b/mlir/test/Target/LLVMIR/nvvmir.mlir index 7fd082a5eb3c75a1ab5a812499d8a2fb4c922aa7..0471e5faf8457830f1b4e1ca8c0de0f72fe67345 100644 --- a/mlir/test/Target/LLVMIR/nvvmir.mlir +++ b/mlir/test/Target/LLVMIR/nvvmir.mlir @@ -62,10 +62,94 @@ llvm.func @nvvm_special_regs() -> i32 { %29 = nvvm.read.ptx.sreg.clock : i32 // CHECK: call i64 @llvm.nvvm.read.ptx.sreg.clock64 %30 = nvvm.read.ptx.sreg.clock64 : i64 - - // CHECK: %31 = call range(i32 0, 64) i32 @llvm.nvvm.read.ptx.sreg.tid.x() - %31 = nvvm.read.ptx.sreg.tid.x range : i32 - + // CHECK: call i64 @llvm.nvvm.read.ptx.sreg.globaltimer + %31 = nvvm.read.ptx.sreg.globaltimer : i64 + // CHECK: %32 = call range(i32 0, 64) i32 @llvm.nvvm.read.ptx.sreg.tid.x() + %32 = nvvm.read.ptx.sreg.tid.x range : i32 + // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.warpid + %33 = nvvm.read.ptx.sreg.warpid : i32 + // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.nwarpid + %34 = nvvm.read.ptx.sreg.nwarpid : i32 + // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.smid + %35 = nvvm.read.ptx.sreg.smid : i32 + // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.nsmid + %36 = nvvm.read.ptx.sreg.nsmid : i32 + // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.gridid + %37 = nvvm.read.ptx.sreg.gridid : i32 + //CHECK: call i32 @llvm.nvvm.read.ptx.sreg.envreg0 + %38 = nvvm.read.ptx.sreg.envreg0 : i32 + //CHECK: call i32 @llvm.nvvm.read.ptx.sreg.envreg1 + %39 = nvvm.read.ptx.sreg.envreg1 : i32 + //CHECK: call i32 @llvm.nvvm.read.ptx.sreg.envreg2 + %40 = nvvm.read.ptx.sreg.envreg2 : i32 + //CHECK: call i32 @llvm.nvvm.read.ptx.sreg.envreg3 + %41 = nvvm.read.ptx.sreg.envreg3 : i32 + //CHECK: call i32 @llvm.nvvm.read.ptx.sreg.envreg4 + %42 = nvvm.read.ptx.sreg.envreg4 : i32 + //CHECK: call i32 @llvm.nvvm.read.ptx.sreg.envreg5 + %43 = nvvm.read.ptx.sreg.envreg5 : i32 + //CHECK: call i32 @llvm.nvvm.read.ptx.sreg.envreg6 + %44 = nvvm.read.ptx.sreg.envreg6 : i32 + //CHECK: call i32 @llvm.nvvm.read.ptx.sreg.envreg7 + %45 = nvvm.read.ptx.sreg.envreg7 : i32 + //CHECK: call i32 @llvm.nvvm.read.ptx.sreg.envreg8 + %46 = nvvm.read.ptx.sreg.envreg8 : i32 + //CHECK: call i32 @llvm.nvvm.read.ptx.sreg.envreg9 + %47 = nvvm.read.ptx.sreg.envreg9 : i32 + //CHECK: call i32 @llvm.nvvm.read.ptx.sreg.envreg10 + %48 = nvvm.read.ptx.sreg.envreg10 : i32 + //CHECK: call i32 @llvm.nvvm.read.ptx.sreg.envreg11 + %49 = nvvm.read.ptx.sreg.envreg11 : i32 + //CHECK: call i32 @llvm.nvvm.read.ptx.sreg.envreg12 + %50 = nvvm.read.ptx.sreg.envreg12 : i32 + //CHECK: call i32 @llvm.nvvm.read.ptx.sreg.envreg13 + %51 = nvvm.read.ptx.sreg.envreg13 : i32 + //CHECK: call i32 @llvm.nvvm.read.ptx.sreg.envreg14 + %52 = nvvm.read.ptx.sreg.envreg14 : i32 + //CHECK: call i32 @llvm.nvvm.read.ptx.sreg.envreg15 + %53 = nvvm.read.ptx.sreg.envreg15 : i32 + //CHECK: call i32 @llvm.nvvm.read.ptx.sreg.envreg16 + %54 = nvvm.read.ptx.sreg.envreg16 : i32 + //CHECK: call i32 @llvm.nvvm.read.ptx.sreg.envreg17 + %55 = nvvm.read.ptx.sreg.envreg17 : i32 + //CHECK: call i32 @llvm.nvvm.read.ptx.sreg.envreg18 + %56 = nvvm.read.ptx.sreg.envreg18 : i32 + //CHECK: call i32 @llvm.nvvm.read.ptx.sreg.envreg19 + %57 = nvvm.read.ptx.sreg.envreg19 : i32 + //CHECK: call i32 @llvm.nvvm.read.ptx.sreg.envreg20 + %58 = nvvm.read.ptx.sreg.envreg20 : i32 + //CHECK: call i32 @llvm.nvvm.read.ptx.sreg.envreg21 + %59 = nvvm.read.ptx.sreg.envreg21 : i32 + //CHECK: call i32 
@llvm.nvvm.read.ptx.sreg.envreg22
+  %60 = nvvm.read.ptx.sreg.envreg22 : i32
+  //CHECK: call i32 @llvm.nvvm.read.ptx.sreg.envreg23
+  %61 = nvvm.read.ptx.sreg.envreg23 : i32
+  //CHECK: call i32 @llvm.nvvm.read.ptx.sreg.envreg24
+  %62 = nvvm.read.ptx.sreg.envreg24 : i32
+  //CHECK: call i32 @llvm.nvvm.read.ptx.sreg.envreg25
+  %63 = nvvm.read.ptx.sreg.envreg25 : i32
+  //CHECK: call i32 @llvm.nvvm.read.ptx.sreg.envreg26
+  %64 = nvvm.read.ptx.sreg.envreg26 : i32
+  //CHECK: call i32 @llvm.nvvm.read.ptx.sreg.envreg27
+  %65 = nvvm.read.ptx.sreg.envreg27 : i32
+  //CHECK: call i32 @llvm.nvvm.read.ptx.sreg.envreg28
+  %66 = nvvm.read.ptx.sreg.envreg28 : i32
+  //CHECK: call i32 @llvm.nvvm.read.ptx.sreg.envreg29
+  %67 = nvvm.read.ptx.sreg.envreg29 : i32
+  //CHECK: call i32 @llvm.nvvm.read.ptx.sreg.envreg30
+  %68 = nvvm.read.ptx.sreg.envreg30 : i32
+  //CHECK: call i32 @llvm.nvvm.read.ptx.sreg.envreg31
+  %69 = nvvm.read.ptx.sreg.envreg31 : i32
+  //CHECK: call i32 @llvm.nvvm.read.ptx.sreg.lanemask.eq
+  %70 = nvvm.read.ptx.sreg.lanemask.eq : i32
+  //CHECK: call i32 @llvm.nvvm.read.ptx.sreg.lanemask.le
+  %71 = nvvm.read.ptx.sreg.lanemask.le : i32
+  //CHECK: call i32 @llvm.nvvm.read.ptx.sreg.lanemask.lt
+  %72 = nvvm.read.ptx.sreg.lanemask.lt : i32
+  //CHECK: call i32 @llvm.nvvm.read.ptx.sreg.lanemask.ge
+  %73 = nvvm.read.ptx.sreg.lanemask.ge : i32
+  //CHECK: call i32 @llvm.nvvm.read.ptx.sreg.lanemask.gt
+  %74 = nvvm.read.ptx.sreg.lanemask.gt : i32
   llvm.return %1 : i32
 }
diff --git a/mlir/test/Target/LLVMIR/openmp-firstprivate.mlir b/mlir/test/Target/LLVMIR/openmp-firstprivate.mlir
index 02ce6b5b19ceaf4159b56df25432a2e0122687cd..79412fb69f7583d0bf19a67900e72a162d35331a 100644
--- a/mlir/test/Target/LLVMIR/openmp-firstprivate.mlir
+++ b/mlir/test/Target/LLVMIR/openmp-firstprivate.mlir
@@ -74,27 +74,38 @@ llvm.func @parallel_op_firstprivate_multi_block(%arg0: !llvm.ptr) {
 // CHECK: [[PRIV_BB2]]:
 // CHECK-NEXT: %[[C1:.*]] = phi i32 [ 1, %[[PRIV_BB1]] ]
 // CHECK-NEXT: %[[PRIV_ALLOC:.*]] = alloca float, i32 %[[C1]], align 4
-// The entry block of the `copy` region is merged into the exit block of the
-// `alloc` region. So check for that.
+// CHECK-NEXT: br label %omp.region.cont
+
+// CHECK: omp.region.cont:
+// CHECK-NEXT: %[[PRIV_ALLOC2:.*]] = phi ptr [ %[[PRIV_ALLOC]], %[[PRIV_BB2]] ]
+// CHECK-NEXT: br label %omp.private.latealloc
+
+// CHECK: omp.private.latealloc:
+// CHECK-NEXT: br label %omp.private.copy
+
+// CHECK: omp.private.copy:
+// CHECK-NEXT: br label %omp.private.copy3
+
+// CHECK: omp.private.copy3:
 // CHECK-NEXT: %[[ORIG_VAL:.*]] = load float, ptr %[[ORIG_PTR]], align 4
 // CHECK-NEXT: br label %[[PRIV_BB3:.*]]

 // Check contents of the 2nd block in the `copy` region.
 // CHECK: [[PRIV_BB3]]:
-// CHECK-NEXT: %[[ORIG_VAL2:.*]] = phi float [ %[[ORIG_VAL]], %[[PRIV_BB2]] ]
-// CHECK-NEXT: %[[PRIV_ALLOC2:.*]] = phi ptr [ %[[PRIV_ALLOC]], %[[PRIV_BB2]] ]
-// CHECK-NEXT: store float %[[ORIG_VAL2]], ptr %[[PRIV_ALLOC2]], align 4
+// CHECK-NEXT: %[[ORIG_VAL2:.*]] = phi float [ %[[ORIG_VAL]], %omp.private.copy3 ]
+// CHECK-NEXT: %[[PRIV_ALLOC3:.*]] = phi ptr [ %[[PRIV_ALLOC2]], %omp.private.copy3 ]
+// CHECK-NEXT: store float %[[ORIG_VAL2]], ptr %[[PRIV_ALLOC3]], align 4
 // CHECK-NEXT: br label %[[PRIV_CONT:.*]]

 // Check that the privatizer's continuation block yields the private clone's
 // address.
// CHECK: [[PRIV_CONT]]: -// CHECK-NEXT: %[[PRIV_ALLOC3:.*]] = phi ptr [ %[[PRIV_ALLOC2]], %[[PRIV_BB3]] ] +// CHECK-NEXT: %[[PRIV_ALLOC4:.*]] = phi ptr [ %[[PRIV_ALLOC3]], %[[PRIV_BB3]] ] // CHECK-NEXT: br label %[[PAR_REG:.*]] // Check that the body of the parallel region loads from the private clone. // CHECK: [[PAR_REG]]: -// CHECK: %{{.*}} = load float, ptr %[[PRIV_ALLOC3]], align 4 +// CHECK: %{{.*}} = load float, ptr %[[PRIV_ALLOC2]], align 4 omp.private {type = firstprivate} @multi_block.privatizer : !llvm.ptr alloc { ^bb0(%arg0: !llvm.ptr): diff --git a/mlir/test/Target/LLVMIR/openmp-private.mlir b/mlir/test/Target/LLVMIR/openmp-private.mlir index 6153e5685c29fd6bd01490a2c07ba80f581694df..5407f97286eb1aa00f712e04d150b96b138e12a5 100644 --- a/mlir/test/Target/LLVMIR/openmp-private.mlir +++ b/mlir/test/Target/LLVMIR/openmp-private.mlir @@ -104,6 +104,9 @@ llvm.func @parallel_op_private_multi_block(%arg0: !llvm.ptr) { // CHECK: omp.par.entry: // CHECK: %[[ORIG_PTR_PTR:.*]] = getelementptr { ptr }, ptr %{{.*}}, i32 0, i32 0 // CHECK: %[[ORIG_PTR:.*]] = load ptr, ptr %[[ORIG_PTR_PTR]], align 8 +// CHECK: br label %omp.private.latealloc + +// CHECK: omp.private.latealloc: // CHECK: br label %[[PRIV_BB1:.*]] // Check contents of the first block in the `alloc` region. @@ -151,8 +154,7 @@ omp.private {type = private} @multi_block.privatizer : !llvm.ptr alloc { // CHECK: omp.par.region: // CHECK: br label %[[PAR_REG_BEG:.*]] // CHECK: [[PAR_REG_BEG]]: -// CHECK: %[[PRIVATIZER_GEP:.*]] = getelementptr double, ptr @_QQfoo, i64 111 -// CHECK: call void @bar(ptr %[[PRIVATIZER_GEP]]) +// CHECK: call void @bar(ptr getelementptr (double, ptr @_QQfoo, i64 111)) // CHECK: call void @bar(ptr getelementptr (double, ptr @_QQfoo, i64 222)) llvm.func @lower_region_with_addressof() { %0 = llvm.mlir.constant(1 : i64) : i64 diff --git a/mlir/test/Target/LLVMIR/rocdl.mlir b/mlir/test/Target/LLVMIR/rocdl.mlir index 97276b087b7e9327a35275d8fb60d3fcd9dbe329..2f34070147be472b44de7e1f6247244ad212706b 100644 --- a/mlir/test/Target/LLVMIR/rocdl.mlir +++ b/mlir/test/Target/LLVMIR/rocdl.mlir @@ -179,6 +179,22 @@ llvm.func @rocdl.schedbarrier() { llvm.return } +llvm.func @rocdl.sched.group.barrier() { + // CHECK-LABEL: rocdl.sched.group.barrier + // CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0) + rocdl.sched.group.barrier 8, 1, 0 + llvm.return +} + +llvm.func @rocdl.iglp.opt() { + // CHECK-LABEL: rocdl.iglp.opt + // CHECK-NEXT: call void @llvm.amdgcn.iglp.opt(i32 0) + rocdl.iglp.opt 0 + // CHECK-NEXT: call void @llvm.amdgcn.iglp.opt(i32 1) + rocdl.iglp.opt 1 + llvm.return +} + llvm.func @rocdl.xdlops(%arg0 : f32, %arg1 : f32, %arg2 : vector<32 x f32>, %arg3: i32, %arg4 : vector<16 x f32>, %arg5 : vector<4xf32>, diff --git a/mlir/test/mlir-tblgen/op-decl-and-defs.td b/mlir/test/mlir-tblgen/op-decl-and-defs.td index 31dd53725c59ac54263013a4cc534b830710d13f..a03d0b40d4655c7b6ffd54faa53c41874df23021 100644 --- a/mlir/test/mlir-tblgen/op-decl-and-defs.td +++ b/mlir/test/mlir-tblgen/op-decl-and-defs.td @@ -208,6 +208,11 @@ def NS_FOp : NS_Op<"op_with_all_types_constraint", // CHECK-LABEL: class FOp : // CHECK: static ::llvm::LogicalResult inferReturnTypes +// DEFS: void FOp::build(::mlir::OpBuilder &odsBuilder, ::mlir::OperationState &odsState, ::mlir::Value a) { +// DEFS: if (::mlir::succeeded(FOp::inferReturnTypes(odsBuilder.getContext(), +// DEFS: else +// DEFS: ::mlir::detail::reportFatalInferReturnTypesError(odsState); + def NS_GOp : NS_Op<"op_with_fixed_return_type", []> { let arguments = 
(ins AnyType:$a);
   let results = (outs I32:$b);
diff --git a/mlir/test/python/dialects/transform_structured_ext.py b/mlir/test/python/dialects/transform_structured_ext.py
index 3ea73e8beea36887ba31811d39b3570ebec2e6c3..d029b3bfa6b1187fb5a0fc0eda4276d4036f5dc0 100644
--- a/mlir/test/python/dialects/transform_structured_ext.py
+++ b/mlir/test/python/dialects/transform_structured_ext.py
@@ -304,7 +304,7 @@ def testPadOpNoArgs(target):
     # CHECK: transform.sequence
     # CHECK: transform.structured.pad
     # CHECK-NOT: copy_back_op
-    # CHECK-NOT: pack_paddings
+    # CHECK-NOT: nofold_flags
     # CHECK-NOT: pad_to_multiple_of
     # CHECK-NOT: padding_dimensions
     # CHECK-NOT: padding_values
@@ -319,7 +319,7 @@ def testPadOpArgs(target):
         pad_to_multiple_of=[128],
         padding_values=[FloatAttr.get_f32(42.0), StringAttr.get("0")],
         padding_dimensions=Attribute.parse("[1]"),
-        pack_paddings=[0],
+        nofold_flags=[0],
         transpose_paddings=[[1, Attribute.parse("0")], Attribute.parse("[0, 1]")],
         copy_back_op="linalg.copy",
     )
@@ -328,7 +328,7 @@
     # CHECK: transform.structured.pad
     # CHECK-DAG: pad_to_multiple_of [128]
     # CHECK-DAG: copy_back_op = "linalg.copy"
-    # CHECK-DAG: pack_paddings = [0]
+    # CHECK-DAG: nofold_flags = [0]
     # CHECK-DAG: padding_dimensions = [1]
     # CHECK-DAG: padding_values = [4.200000e+01 : f32, "0"]
     # CHECK-DAG: transpose_paddings = {{\[}}[1, 0], [0, 1]]
diff --git a/mlir/tools/mlir-tblgen/OmpOpGen.cpp b/mlir/tools/mlir-tblgen/OmpOpGen.cpp
index 8716667723a3730dc78b68c21467689a4b4a3276..04f81a4a2dce1abc2d575dfcf149eb291359f676 100644
--- a/mlir/tools/mlir-tblgen/OmpOpGen.cpp
+++ b/mlir/tools/mlir-tblgen/OmpOpGen.cpp
@@ -106,9 +106,7 @@ static bool verifyArgument(const DagInit *arguments, StringRef argName,
                            const Init *argInit) {
   auto range = zip_equal(arguments->getArgNames(), arguments->getArgs());
   return llvm::any_of(
-      range,
-      [&](std::tuple
-              v) {
+      range, [&](std::tuple v) {
         return std::get<0>(v)->getAsUnquotedString() == argName &&
                std::get<1>(v) == argInit;
       });
diff --git a/mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp b/mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp
index ce2b6ed94c39490acdd426dcb6889c42ebf4198e..71fa5011a476b415f7441d5be3cdb84b8b040809 100644
--- a/mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp
+++ b/mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp
@@ -2503,7 +2503,8 @@ void OpEmitter::genSeparateArgParamBuilder() {
         {1}.regions, inferredReturnTypes)))
       {1}.addTypes(inferredReturnTypes);
     else
-      ::llvm::report_fatal_error("Failed to infer result type(s).");)",
+      ::mlir::detail::reportFatalInferReturnTypesError({1});
+      )",
       opClass.getClassName(), builderOpState);
     return;
   }
diff --git a/mlir/unittests/IR/CMakeLists.txt b/mlir/unittests/IR/CMakeLists.txt
index 547e536dd9cbbf8c2ece1f97c1be9372c2552452..384116ba5c457e277dea897644279dd188cc0742 100644
--- a/mlir/unittests/IR/CMakeLists.txt
+++ b/mlir/unittests/IR/CMakeLists.txt
@@ -4,6 +4,7 @@ add_mlir_unittest(MLIRIRTests
   AffineMapTest.cpp
   AttributeTest.cpp
   AttrTypeReplacerTest.cpp
+  Diagnostic.cpp
   DialectTest.cpp
   InterfaceTest.cpp
   IRMapping.cpp
diff --git a/mlir/unittests/IR/Diagnostic.cpp b/mlir/unittests/IR/Diagnostic.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..96e09d33309263f3f37bc2c7cbf202c7d831823b
--- /dev/null
+++ b/mlir/unittests/IR/Diagnostic.cpp
@@ -0,0 +1,63 @@
+//===- Diagnostic.cpp - Diagnostic unit tests ----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "mlir/IR/Diagnostics.h" +#include "mlir/Support/TypeID.h" +#include "gtest/gtest.h" + +using namespace mlir; +using namespace mlir::detail; + +namespace { + +TEST(DiagnosticLifetime, TestCopiesConstCharStar) { + const auto *expectedMessage = "Error 1, don't mutate this"; + + // Copy expected message into a mutable container, and call the constructor. + std::string myStr(expectedMessage); + + mlir::MLIRContext context; + Diagnostic diagnostic(mlir::UnknownLoc::get(&context), + DiagnosticSeverity::Note); + diagnostic << myStr.c_str(); + + // Mutate underlying pointer, but ensure diagnostic still has orig. message + myStr[0] = '^'; + + std::string resultMessage; + llvm::raw_string_ostream stringStream(resultMessage); + diagnostic.print(stringStream); + ASSERT_STREQ(expectedMessage, resultMessage.c_str()); +} + +TEST(DiagnosticLifetime, TestLazyCopyStringLiteral) { + char charArr[21] = "Error 1, mutate this"; + mlir::MLIRContext context; + Diagnostic diagnostic(mlir::UnknownLoc::get(&context), + DiagnosticSeverity::Note); + + // Diagnostic contains optimization which assumes string literals are + // represented by `const char[]` type. This is imperfect as we can sometimes + // trick the type system as seen below. + // + // Still we use this to check the diagnostic is lazily storing the pointer. + auto addToDiagnosticAsConst = [&diagnostic](const char(&charArr)[21]) { + diagnostic << charArr; + }; + addToDiagnosticAsConst(charArr); + + // Mutate the underlying pointer and ensure the string does change + charArr[0] = '^'; + + std::string resultMessage; + llvm::raw_string_ostream stringStream(resultMessage); + diagnostic.print(stringStream); + ASSERT_STREQ("^rror 1, mutate this", resultMessage.c_str()); +} + +} // namespace diff --git a/openmp/runtime/doc/doxygen/config b/openmp/runtime/doc/doxygen/config index 04c966766ba6ef9ce3a3dea236def676787c8d3e..8d79dc143cc1a01bb814fc4517a5fb28703e20a8 100644 --- a/openmp/runtime/doc/doxygen/config +++ b/openmp/runtime/doc/doxygen/config @@ -1,1822 +1,1822 @@ -# Doxyfile 1.o8.2 - -# This file describes the settings to be used by the documentation system -# doxygen (www.doxygen.org) for a project. -# -# All text after a hash (#) is considered a comment and will be ignored. -# The format is: -# TAG = value [value, ...] -# For lists items can also be appended using: -# TAG += value [value, ...] -# Values that contain spaces should be placed between quotes (" "). - -#--------------------------------------------------------------------------- -# Project related configuration options -#--------------------------------------------------------------------------- - -# This tag specifies the encoding used for all characters in the config file -# that follow. The default is UTF-8 which is also the encoding used for all -# text before the first occurrence of this tag. Doxygen uses libiconv (or the -# iconv built into libc) for the transcoding. See -# http://www.gnu.org/software/libiconv for the list of possible encodings. - -DOXYFILE_ENCODING = UTF-8 - -# The PROJECT_NAME tag is a single word (or sequence of words) that should -# identify the project. Note that if you do not use Doxywizard you need -# to put quotes around the project name if it contains spaces. - -PROJECT_NAME = "LLVM OpenMP* Runtime Library" - -# The PROJECT_NUMBER tag can be used to enter a project or revision number. 
-# This could be handy for archiving the generated documentation or -# if some version control system is used. - -PROJECT_NUMBER = - -# Using the PROJECT_BRIEF tag one can provide an optional one line description -# for a project that appears at the top of each page and should give viewer -# a quick idea about the purpose of the project. Keep the description short. - -PROJECT_BRIEF = - -# With the PROJECT_LOGO tag one can specify an logo or icon that is -# included in the documentation. The maximum height of the logo should not -# exceed 55 pixels and the maximum width should not exceed 200 pixels. -# Doxygen will copy the logo to the output directory. - -PROJECT_LOGO = - -# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) -# base path where the generated documentation will be put. -# If a relative path is entered, it will be relative to the location -# where doxygen was started. If left blank the current directory will be used. - -OUTPUT_DIRECTORY = doc/doxygen/generated - -# If the CREATE_SUBDIRS tag is set to YES, then doxygen will create -# 4096 sub-directories (in 2 levels) under the output directory of each output -# format and will distribute the generated files over these directories. -# Enabling this option can be useful when feeding doxygen a huge amount of -# source files, where putting all generated files in the same directory would -# otherwise cause performance problems for the file system. - -CREATE_SUBDIRS = NO - -# The OUTPUT_LANGUAGE tag is used to specify the language in which all -# documentation generated by doxygen is written. Doxygen will use this -# information to generate all constant output in the proper language. -# The default language is English, other supported languages are: -# Afrikaans, Arabic, Brazilian, Catalan, Chinese, Chinese-Traditional, -# Croatian, Czech, Danish, Dutch, Esperanto, Farsi, Finnish, French, German, -# Greek, Hungarian, Italian, Japanese, Japanese-en (Japanese with English -# messages), Korean, Korean-en, Lithuanian, Norwegian, Macedonian, Persian, -# Polish, Portuguese, Romanian, Russian, Serbian, Serbian-Cyrillic, Slovak, -# Slovene, Spanish, Swedish, Ukrainian, and Vietnamese. - -OUTPUT_LANGUAGE = English - -# If the BRIEF_MEMBER_DESC tag is set to YES (the default) Doxygen will -# include brief member descriptions after the members that are listed in -# the file and class documentation (similar to JavaDoc). -# Set to NO to disable this. - -BRIEF_MEMBER_DESC = YES - -# If the REPEAT_BRIEF tag is set to YES (the default) Doxygen will prepend -# the brief description of a member or function before the detailed description. -# Note: if both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the -# brief descriptions will be completely suppressed. - -REPEAT_BRIEF = YES - -# This tag implements a quasi-intelligent brief description abbreviator -# that is used to form the text in various listings. Each string -# in this list, if found as the leading text of the brief description, will be -# stripped from the text and the result after processing the whole list, is -# used as the annotated text. Otherwise, the brief description is used as-is. 
-# If left blank, the following values are used ("$name" is automatically -# replaced with the name of the entity): "The $name class" "The $name widget" -# "The $name file" "is" "provides" "specifies" "contains" -# "represents" "a" "an" "the" - -ABBREVIATE_BRIEF = - -# If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then -# Doxygen will generate a detailed section even if there is only a brief -# description. - -ALWAYS_DETAILED_SEC = NO - -# If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all -# inherited members of a class in the documentation of that class as if those -# members were ordinary class members. Constructors, destructors and assignment -# operators of the base classes will not be shown. - -INLINE_INHERITED_MEMB = NO - -# If the FULL_PATH_NAMES tag is set to YES then Doxygen will prepend the full -# path before files name in the file list and in the header files. If set -# to NO the shortest path that makes the file name unique will be used. - -FULL_PATH_NAMES = NO - -# If the FULL_PATH_NAMES tag is set to YES then the STRIP_FROM_PATH tag -# can be used to strip a user-defined part of the path. Stripping is -# only done if one of the specified strings matches the left-hand part of -# the path. The tag can be used to show relative paths in the file list. -# If left blank the directory from which doxygen is run is used as the -# path to strip. Note that you specify absolute paths here, but also -# relative paths, which will be relative from the directory where doxygen is -# started. - -STRIP_FROM_PATH = - -# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of -# the path mentioned in the documentation of a class, which tells -# the reader which header file to include in order to use a class. -# If left blank only the name of the header file containing the class -# definition is used. Otherwise one should specify the include paths that -# are normally passed to the compiler using the -I flag. - -STRIP_FROM_INC_PATH = - -# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter -# (but less readable) file names. This can be useful if your file system -# doesn't support long names like on DOS, Mac, or CD-ROM. - -SHORT_NAMES = NO - -# If the JAVADOC_AUTOBRIEF tag is set to YES then Doxygen -# will interpret the first line (until the first dot) of a JavaDoc-style -# comment as the brief description. If set to NO, the JavaDoc -# comments will behave just like regular Qt-style comments -# (thus requiring an explicit @brief command for a brief description.) - -JAVADOC_AUTOBRIEF = NO - -# If the QT_AUTOBRIEF tag is set to YES then Doxygen will -# interpret the first line (until the first dot) of a Qt-style -# comment as the brief description. If set to NO, the comments -# will behave just like regular Qt-style comments (thus requiring -# an explicit \brief command for a brief description.) - -QT_AUTOBRIEF = NO - -# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make Doxygen -# treat a multi-line C++ special comment block (i.e. a block of //! or /// -# comments) as a brief description. This used to be the default behaviour. -# The new default is to treat a multi-line C++ comment block as a detailed -# description. Set this tag to YES if you prefer the old behaviour instead. - -MULTILINE_CPP_IS_BRIEF = NO - -# If the INHERIT_DOCS tag is set to YES (the default) then an undocumented -# member inherits the documentation from any documented member that it -# re-implements. 
- -INHERIT_DOCS = YES - -# If the SEPARATE_MEMBER_PAGES tag is set to YES, then doxygen will produce -# a new page for each member. If set to NO, the documentation of a member will -# be part of the file/class/namespace that contains it. - -SEPARATE_MEMBER_PAGES = NO - -# The TAB_SIZE tag can be used to set the number of spaces in a tab. -# Doxygen uses this value to replace tabs by spaces in code fragments. - -TAB_SIZE = 8 - -# This tag can be used to specify a number of aliases that acts -# as commands in the documentation. An alias has the form "name=value". -# For example adding "sideeffect=\par Side Effects:\n" will allow you to -# put the command \sideeffect (or @sideeffect) in the documentation, which -# will result in a user-defined paragraph with heading "Side Effects:". -# You can put \n's in the value part of an alias to insert newlines. - -ALIASES = "other=*" - -# This tag can be used to specify a number of word-keyword mappings (TCL only). -# A mapping has the form "name=value". For example adding -# "class=itcl::class" will allow you to use the command class in the -# itcl::class meaning. - -TCL_SUBST = - -# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C -# sources only. Doxygen will then generate output that is more tailored for C. -# For instance, some of the names that are used will be different. The list -# of all members will be omitted, etc. - -OPTIMIZE_OUTPUT_FOR_C = NO - -# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java -# sources only. Doxygen will then generate output that is more tailored for -# Java. For instance, namespaces will be presented as packages, qualified -# scopes will look different, etc. - -OPTIMIZE_OUTPUT_JAVA = NO - -# Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran -# sources only. Doxygen will then generate output that is more tailored for -# Fortran. - -OPTIMIZE_FOR_FORTRAN = NO - -# Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL -# sources. Doxygen will then generate output that is tailored for -# VHDL. - -OPTIMIZE_OUTPUT_VHDL = NO - -# Doxygen selects the parser to use depending on the extension of the files it -# parses. With this tag you can assign which parser to use for a given -# extension. Doxygen has a built-in mapping, but you can override or extend it -# using this tag. The format is ext=language, where ext is a file extension, -# and language is one of the parsers supported by doxygen: IDL, Java, -# Javascript, CSharp, C, C++, D, PHP, Objective-C, Python, Fortran, VHDL, C, -# C++. For instance to make doxygen treat .inc files as Fortran files (default -# is PHP), and .f files as C (default is Fortran), use: inc=Fortran f=C. Note -# that for custom extensions you also need to set FILE_PATTERNS otherwise the -# files are not read by doxygen. - -EXTENSION_MAPPING = - -# If MARKDOWN_SUPPORT is enabled (the default) then doxygen pre-processes all -# comments according to the Markdown format, which allows for more readable -# documentation. See http://daringfireball.net/projects/markdown/ for details. -# The output of markdown processing is further processed by doxygen, so you -# can mix doxygen, HTML, and XML commands with Markdown formatting. -# Disable only in case of backward compatibilities issues. - -MARKDOWN_SUPPORT = YES - -# When enabled doxygen tries to link words that correspond to documented classes, -# or namespaces to their corresponding documentation. 
Such a link can be -# prevented in individual cases by by putting a % sign in front of the word or -# globally by setting AUTOLINK_SUPPORT to NO. - -AUTOLINK_SUPPORT = YES - -# If you use STL classes (i.e. std::string, std::vector, etc.) but do not want -# to include (a tag file for) the STL sources as input, then you should -# set this tag to YES in order to let doxygen match functions declarations and -# definitions whose arguments contain STL classes (e.g. func(std::string); v.s. -# func(std::string) {}). This also makes the inheritance and collaboration -# diagrams that involve STL classes more complete and accurate. - -BUILTIN_STL_SUPPORT = NO - -# If you use Microsoft's C++/CLI language, you should set this option to YES to -# enable parsing support. - -CPP_CLI_SUPPORT = NO - -# Set the SIP_SUPPORT tag to YES if your project consists of sip sources only. -# Doxygen will parse them like normal C++ but will assume all classes use public -# instead of private inheritance when no explicit protection keyword is present. - -SIP_SUPPORT = NO - -# For Microsoft's IDL there are propget and propput attributes to -# indicate getter and setter methods for a property. Setting this -# option to YES (the default) will make doxygen replace the get and -# set methods by a property in the documentation. This will only work -# if the methods are indeed getting or setting a simple type. If this -# is not the case, or you want to show the methods anyway, you should -# set this option to NO. - -IDL_PROPERTY_SUPPORT = YES - -# If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC -# tag is set to YES, then doxygen will reuse the documentation of the first -# member in the group (if any) for the other members of the group. By default -# all members of a group must be documented explicitly. - -DISTRIBUTE_GROUP_DOC = NO - -# Set the SUBGROUPING tag to YES (the default) to allow class member groups of -# the same type (for instance a group of public functions) to be put as a -# subgroup of that type (e.g. under the Public Functions section). Set it to -# NO to prevent subgrouping. Alternatively, this can be done per class using -# the \nosubgrouping command. - -SUBGROUPING = YES - -# When the INLINE_GROUPED_CLASSES tag is set to YES, classes, structs and -# unions are shown inside the group in which they are included (e.g. using -# @ingroup) instead of on a separate page (for HTML and Man pages) or -# section (for LaTeX and RTF). - -INLINE_GROUPED_CLASSES = NO - -# When the INLINE_SIMPLE_STRUCTS tag is set to YES, structs, classes, and -# unions with only public data fields will be shown inline in the documentation -# of the scope in which they are defined (i.e. file, namespace, or group -# documentation), provided this scope is documented. If set to NO (the default), -# structs, classes, and unions are shown on a separate page (for HTML and Man -# pages) or section (for LaTeX and RTF). - -INLINE_SIMPLE_STRUCTS = NO - -# When TYPEDEF_HIDES_STRUCT is enabled, a typedef of a struct, union, or enum -# is documented as struct, union, or enum with the name of the typedef. So -# typedef struct TypeS {} TypeT, will appear in the documentation as a struct -# with name TypeT. When disabled the typedef will appear as a member of a file, -# namespace, or class. And the struct will be named TypeS. This can typically -# be useful for C code in case the coding convention dictates that all compound -# types are typedef'ed and only the typedef is referenced, never the tag name. 
- -TYPEDEF_HIDES_STRUCT = NO - -# The SYMBOL_CACHE_SIZE determines the size of the internal cache use to -# determine which symbols to keep in memory and which to flush to disk. -# When the cache is full, less often used symbols will be written to disk. -# For small to medium size projects (<1000 input files) the default value is -# probably good enough. For larger projects a too small cache size can cause -# doxygen to be busy swapping symbols to and from disk most of the time -# causing a significant performance penalty. -# If the system has enough physical memory increasing the cache will improve the -# performance by keeping more symbols in memory. Note that the value works on -# a logarithmic scale so increasing the size by one will roughly double the -# memory usage. The cache size is given by this formula: -# 2^(16+SYMBOL_CACHE_SIZE). The valid range is 0..9, the default is 0, -# corresponding to a cache size of 2^16 = 65536 symbols. - -SYMBOL_CACHE_SIZE = 0 - -# Similar to the SYMBOL_CACHE_SIZE the size of the symbol lookup cache can be -# set using LOOKUP_CACHE_SIZE. This cache is used to resolve symbols given -# their name and scope. Since this can be an expensive process and often the -# same symbol appear multiple times in the code, doxygen keeps a cache of -# pre-resolved symbols. If the cache is too small doxygen will become slower. -# If the cache is too large, memory is wasted. The cache size is given by this -# formula: 2^(16+LOOKUP_CACHE_SIZE). The valid range is 0..9, the default is 0, -# corresponding to a cache size of 2^16 = 65536 symbols. - -LOOKUP_CACHE_SIZE = 0 - -#--------------------------------------------------------------------------- -# Build related configuration options -#--------------------------------------------------------------------------- - -# If the EXTRACT_ALL tag is set to YES doxygen will assume all entities in -# documentation are documented, even if no documentation was available. -# Private class members and static file members will be hidden unless -# the EXTRACT_PRIVATE and EXTRACT_STATIC tags are set to YES - -EXTRACT_ALL = NO - -# If the EXTRACT_PRIVATE tag is set to YES all private members of a class -# will be included in the documentation. - -EXTRACT_PRIVATE = YES - -# If the EXTRACT_PACKAGE tag is set to YES all members with package or internal -# scope will be included in the documentation. - -EXTRACT_PACKAGE = NO - -# If the EXTRACT_STATIC tag is set to YES all static members of a file -# will be included in the documentation. - -EXTRACT_STATIC = YES - -# If the EXTRACT_LOCAL_CLASSES tag is set to YES classes (and structs) -# defined locally in source files will be included in the documentation. -# If set to NO only classes defined in header files are included. - -EXTRACT_LOCAL_CLASSES = YES - -# This flag is only useful for Objective-C code. When set to YES local -# methods, which are defined in the implementation section but not in -# the interface are included in the documentation. -# If set to NO (the default) only methods in the interface are included. - -EXTRACT_LOCAL_METHODS = NO - -# If this flag is set to YES, the members of anonymous namespaces will be -# extracted and appear in the documentation as a namespace called -# 'anonymous_namespace{file}', where file will be replaced with the base -# name of the file that contains the anonymous namespace. By default -# anonymous namespaces are hidden. 
- -EXTRACT_ANON_NSPACES = NO - -# If the HIDE_UNDOC_MEMBERS tag is set to YES, Doxygen will hide all -# undocumented members of documented classes, files or namespaces. -# If set to NO (the default) these members will be included in the -# various overviews, but no documentation section is generated. -# This option has no effect if EXTRACT_ALL is enabled. - -HIDE_UNDOC_MEMBERS = YES - -# If the HIDE_UNDOC_CLASSES tag is set to YES, Doxygen will hide all -# undocumented classes that are normally visible in the class hierarchy. -# If set to NO (the default) these classes will be included in the various -# overviews. This option has no effect if EXTRACT_ALL is enabled. - -HIDE_UNDOC_CLASSES = YES - -# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, Doxygen will hide all -# friend (class|struct|union) declarations. -# If set to NO (the default) these declarations will be included in the -# documentation. - -HIDE_FRIEND_COMPOUNDS = NO - -# If the HIDE_IN_BODY_DOCS tag is set to YES, Doxygen will hide any -# documentation blocks found inside the body of a function. -# If set to NO (the default) these blocks will be appended to the -# function's detailed documentation block. - -HIDE_IN_BODY_DOCS = NO - -# The INTERNAL_DOCS tag determines if documentation -# that is typed after a \internal command is included. If the tag is set -# to NO (the default) then the documentation will be excluded. -# Set it to YES to include the internal documentation. - -INTERNAL_DOCS = NO - -# If the CASE_SENSE_NAMES tag is set to NO then Doxygen will only generate -# file names in lower-case letters. If set to YES upper-case letters are also -# allowed. This is useful if you have classes or files whose names only differ -# in case and if your file system supports case sensitive file names. Windows -# and Mac users are advised to set this option to NO. - -CASE_SENSE_NAMES = YES - -# If the HIDE_SCOPE_NAMES tag is set to NO (the default) then Doxygen -# will show members with their full class and namespace scopes in the -# documentation. If set to YES the scope will be hidden. - -HIDE_SCOPE_NAMES = NO - -# If the SHOW_INCLUDE_FILES tag is set to YES (the default) then Doxygen -# will put a list of the files that are included by a file in the documentation -# of that file. - -SHOW_INCLUDE_FILES = YES - -# If the FORCE_LOCAL_INCLUDES tag is set to YES then Doxygen -# will list include files with double quotes in the documentation -# rather than with sharp brackets. - -FORCE_LOCAL_INCLUDES = NO - -# If the INLINE_INFO tag is set to YES (the default) then a tag [inline] -# is inserted in the documentation for inline members. - -INLINE_INFO = YES - -# If the SORT_MEMBER_DOCS tag is set to YES (the default) then doxygen -# will sort the (detailed) documentation of file and class members -# alphabetically by member name. If set to NO the members will appear in -# declaration order. - -SORT_MEMBER_DOCS = YES - -# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the -# brief documentation of file, namespace and class members alphabetically -# by member name. If set to NO (the default) the members will appear in -# declaration order. - -SORT_BRIEF_DOCS = NO - -# If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen -# will sort the (brief and detailed) documentation of class members so that -# constructors and destructors are listed first. If set to NO (the default) -# the constructors will appear in the respective orders defined by -# SORT_MEMBER_DOCS and SORT_BRIEF_DOCS. 
-# This tag will be ignored for brief docs if SORT_BRIEF_DOCS is set to NO -# and ignored for detailed docs if SORT_MEMBER_DOCS is set to NO. - -SORT_MEMBERS_CTORS_1ST = NO - -# If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the -# hierarchy of group names into alphabetical order. If set to NO (the default) -# the group names will appear in their defined order. - -SORT_GROUP_NAMES = NO - -# If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be -# sorted by fully-qualified names, including namespaces. If set to -# NO (the default), the class list will be sorted only by class name, -# not including the namespace part. -# Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES. -# Note: This option applies only to the class list, not to the -# alphabetical list. - -SORT_BY_SCOPE_NAME = NO - -# If the STRICT_PROTO_MATCHING option is enabled and doxygen fails to -# do proper type resolution of all parameters of a function it will reject a -# match between the prototype and the implementation of a member function even -# if there is only one candidate or it is obvious which candidate to choose -# by doing a simple string match. By disabling STRICT_PROTO_MATCHING doxygen -# will still accept a match between prototype and implementation in such cases. - -STRICT_PROTO_MATCHING = NO - -# The GENERATE_TODOLIST tag can be used to enable (YES) or -# disable (NO) the todo list. This list is created by putting \todo -# commands in the documentation. - -GENERATE_TODOLIST = YES - -# The GENERATE_TESTLIST tag can be used to enable (YES) or -# disable (NO) the test list. This list is created by putting \test -# commands in the documentation. - -GENERATE_TESTLIST = YES - -# The GENERATE_BUGLIST tag can be used to enable (YES) or -# disable (NO) the bug list. This list is created by putting \bug -# commands in the documentation. - -GENERATE_BUGLIST = YES - -# The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or -# disable (NO) the deprecated list. This list is created by putting -# \deprecated commands in the documentation. - -GENERATE_DEPRECATEDLIST= YES - -# The ENABLED_SECTIONS tag can be used to enable conditional -# documentation sections, marked by \if sectionname ... \endif. - -ENABLED_SECTIONS = - -# The MAX_INITIALIZER_LINES tag determines the maximum number of lines -# the initial value of a variable or macro consists of for it to appear in -# the documentation. If the initializer consists of more lines than specified -# here it will be hidden. Use a value of 0 to hide initializers completely. -# The appearance of the initializer of individual variables and macros in the -# documentation can be controlled using \showinitializer or \hideinitializer -# command in the documentation regardless of this setting. - -MAX_INITIALIZER_LINES = 30 - -# Set the SHOW_USED_FILES tag to NO to disable the list of files generated -# at the bottom of the documentation of classes and structs. If set to YES the -# list will mention the files that were used to generate the documentation. - -SHOW_USED_FILES = YES - -# Set the SHOW_FILES tag to NO to disable the generation of the Files page. -# This will remove the Files entry from the Quick Index and from the -# Folder Tree View (if specified). The default is YES. - -# We probably will want this, but we have no file documentation yet so it's simpler to remove -# it for now. -SHOW_FILES = NO - -# Set the SHOW_NAMESPACES tag to NO to disable the generation of the -# Namespaces page. 
-# This will remove the Namespaces entry from the Quick Index
-# and from the Folder Tree View (if specified). The default is YES.
-
-SHOW_NAMESPACES = YES
-
-# The FILE_VERSION_FILTER tag can be used to specify a program or script that
-# doxygen should invoke to get the current version for each file (typically from
-# the version control system). Doxygen will invoke the program by executing (via
-# popen()) the command <command> <input-file>, where <command> is the value of
-# the FILE_VERSION_FILTER tag, and <input-file> is the name of an input file
-# provided by doxygen. Whatever the program writes to standard output
-# is used as the file version. See the manual for examples.
-
-FILE_VERSION_FILTER =
-
-# The LAYOUT_FILE tag can be used to specify a layout file which will be parsed
-# by doxygen. The layout file controls the global structure of the generated
-# output files in an output format independent way. To create the layout file
-# that represents doxygen's defaults, run doxygen with the -l option.
-# You can optionally specify a file name after the option, if omitted
-# DoxygenLayout.xml will be used as the name of the layout file.
-
-LAYOUT_FILE =
-
-# The CITE_BIB_FILES tag can be used to specify one or more bib files
-# containing the references data. This must be a list of .bib files. The
-# .bib extension is automatically appended if omitted. Using this command
-# requires the bibtex tool to be installed. See also
-# http://en.wikipedia.org/wiki/BibTeX for more info. For LaTeX the style
-# of the bibliography can be controlled using LATEX_BIB_STYLE. To use this
-# feature you need bibtex and perl available in the search path.
-
-CITE_BIB_FILES =
-
-#---------------------------------------------------------------------------
-# configuration options related to warning and progress messages
-#---------------------------------------------------------------------------
-
-# The QUIET tag can be used to turn on/off the messages that are generated
-# by doxygen. Possible values are YES and NO. If left blank NO is used.
-
-QUIET = NO
-
-# The WARNINGS tag can be used to turn on/off the warning messages that are
-# generated by doxygen. Possible values are YES and NO. If left blank
-# NO is used.
-
-WARNINGS = YES
-
-# If WARN_IF_UNDOCUMENTED is set to YES, then doxygen will generate warnings
-# for undocumented members. If EXTRACT_ALL is set to YES then this flag will
-# automatically be disabled.
-
-WARN_IF_UNDOCUMENTED = YES
-
-# If WARN_IF_DOC_ERROR is set to YES, doxygen will generate warnings for
-# potential errors in the documentation, such as not documenting some
-# parameters in a documented function, or documenting parameters that
-# don't exist or using markup commands wrongly.
-
-WARN_IF_DOC_ERROR = YES
-
-# The WARN_NO_PARAMDOC option can be enabled to get warnings for
-# functions that are documented, but have no documentation for their parameters
-# or return value. If set to NO (the default) doxygen will only warn about
-# wrong or incomplete parameter documentation, but not about the absence of
-# documentation.
-
-WARN_NO_PARAMDOC = NO
-
-# The WARN_FORMAT tag determines the format of the warning messages that
-# doxygen can produce. The string should contain the $file, $line, and $text
-# tags, which will be replaced by the file and line number from which the
-# warning originated and the warning text.
Optionally the format may contain -# $version, which will be replaced by the version of the file (if it could -# be obtained via FILE_VERSION_FILTER) - -WARN_FORMAT = - -# The WARN_LOGFILE tag can be used to specify a file to which warning -# and error messages should be written. If left blank the output is written -# to stderr. - -WARN_LOGFILE = - -#--------------------------------------------------------------------------- -# configuration options related to the input files -#--------------------------------------------------------------------------- - -# The INPUT tag can be used to specify the files and/or directories that contain -# documented source files. You may enter file names like "myfile.cpp" or -# directories like "/usr/src/myproject". Separate the files or directories -# with spaces. - -INPUT = src doc/doxygen/libomp_interface.h -# The ittnotify code also has doxygen documentation, but if we include it here -# it takes over from us! -# src/thirdparty/ittnotify - -# This tag can be used to specify the character encoding of the source files -# that doxygen parses. Internally doxygen uses the UTF-8 encoding, which is -# also the default input encoding. Doxygen uses libiconv (or the iconv built -# into libc) for the transcoding. See http://www.gnu.org/software/libiconv for -# the list of possible encodings. - -INPUT_ENCODING = UTF-8 - -# If the value of the INPUT tag contains directories, you can use the -# FILE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp -# and *.h) to filter out the source-files in the directories. If left -# blank the following patterns are tested: -# *.c *.cc *.cxx *.cpp *.c++ *.d *.java *.ii *.ixx *.ipp *.i++ *.inl *.h *.hh -# *.hxx *.hpp *.h++ *.idl *.odl *.cs *.php *.php3 *.inc *.m *.mm *.dox *.py -# *.f90 *.f *.for *.vhd *.vhdl - -FILE_PATTERNS = *.c *.h *.cpp -# We may also want to include the asm files with appropriate ifdef to ensure -# doxygen doesn't see the content, just the documentation... - -# The RECURSIVE tag can be used to turn specify whether or not subdirectories -# should be searched for input files as well. Possible values are YES and NO. -# If left blank NO is used. - -# Only look in the one directory. -RECURSIVE = NO - -# The EXCLUDE tag can be used to specify files and/or directories that should be -# excluded from the INPUT source files. This way you can easily exclude a -# subdirectory from a directory tree whose root is specified with the INPUT tag. -# Note that relative paths are relative to the directory from which doxygen is -# run. - -EXCLUDE = src/test-touch.c - -# The EXCLUDE_SYMLINKS tag can be used to select whether or not files or -# directories that are symbolic links (a Unix file system feature) are excluded -# from the input. - -EXCLUDE_SYMLINKS = NO - -# If the value of the INPUT tag contains directories, you can use the -# EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude -# certain files from those directories. Note that the wildcards are matched -# against the file with absolute path, so to exclude all test directories -# for example use the pattern */test/* - -EXCLUDE_PATTERNS = - -# The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names -# (namespaces, classes, functions, etc.) that should be excluded from the -# output. The symbol name can be a fully qualified name, a word, or if the -# wildcard * is used, a substring. 
Examples: ANamespace, AClass,
-# AClass::ANamespace, ANamespace::*Test
-
-EXCLUDE_SYMBOLS =
-
-# The EXAMPLE_PATH tag can be used to specify one or more files or
-# directories that contain example code fragments that are included (see
-# the \include command).
-
-EXAMPLE_PATH =
-
-# If the value of the EXAMPLE_PATH tag contains directories, you can use the
-# EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp
-# and *.h) to filter out the source-files in the directories. If left
-# blank all files are included.
-
-EXAMPLE_PATTERNS =
-
-# If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be
-# searched for input files to be used with the \include or \dontinclude
-# commands irrespective of the value of the RECURSIVE tag.
-# Possible values are YES and NO. If left blank NO is used.
-
-EXAMPLE_RECURSIVE = NO
-
-# The IMAGE_PATH tag can be used to specify one or more files or
-# directories that contain image that are included in the documentation (see
-# the \image command).
-
-IMAGE_PATH =
-
-# The INPUT_FILTER tag can be used to specify a program that doxygen should
-# invoke to filter for each input file. Doxygen will invoke the filter program
-# by executing (via popen()) the command <filter> <input-file>, where
-# <filter> is the value of the INPUT_FILTER tag, and <input-file> is the name of an
-# input file. Doxygen will then use the output that the filter program writes
-# to standard output.
-# If FILTER_PATTERNS is specified, this tag will be
-# ignored.
-
-INPUT_FILTER =
-
-# The FILTER_PATTERNS tag can be used to specify filters on a per file pattern
-# basis.
-# Doxygen will compare the file name with each pattern and apply the
-# filter if there is a match.
-# The filters are a list of the form:
-# pattern=filter (like *.cpp=my_cpp_filter). See INPUT_FILTER for further
-# info on how filters are used. If FILTER_PATTERNS is empty or if
-# non of the patterns match the file name, INPUT_FILTER is applied.
-
-FILTER_PATTERNS =
-
-# If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using
-# INPUT_FILTER) will be used to filter the input files when producing source
-# files to browse (i.e. when SOURCE_BROWSER is set to YES).
-
-FILTER_SOURCE_FILES = NO
-
-# The FILTER_SOURCE_PATTERNS tag can be used to specify source filters per file
-# pattern. A pattern will override the setting for FILTER_PATTERN (if any)
-# and it is also possible to disable source filtering for a specific pattern
-# using *.ext= (so without naming a filter). This option only has effect when
-# FILTER_SOURCE_FILES is enabled.
-
-FILTER_SOURCE_PATTERNS =
-
-#---------------------------------------------------------------------------
-# configuration options related to source browsing
-#---------------------------------------------------------------------------
-
-# If the SOURCE_BROWSER tag is set to YES then a list of source files will
-# be generated. Documented entities will be cross-referenced with these sources.
-# Note: To get rid of all source code in the generated output, make sure also
-# VERBATIM_HEADERS is set to NO.
-
-SOURCE_BROWSER = YES
-
-# Setting the INLINE_SOURCES tag to YES will include the body
-# of functions and classes directly in the documentation.
-
-INLINE_SOURCES = NO
-
-# Setting the STRIP_CODE_COMMENTS tag to YES (the default) will instruct
-# doxygen to hide any special comment blocks from generated source code
-# fragments. Normal C, C++ and Fortran comments will always remain visible.
- -STRIP_CODE_COMMENTS = YES - -# If the REFERENCED_BY_RELATION tag is set to YES -# then for each documented function all documented -# functions referencing it will be listed. - -REFERENCED_BY_RELATION = YES - -# If the REFERENCES_RELATION tag is set to YES -# then for each documented function all documented entities -# called/used by that function will be listed. - -REFERENCES_RELATION = NO - -# If the REFERENCES_LINK_SOURCE tag is set to YES (the default) -# and SOURCE_BROWSER tag is set to YES, then the hyperlinks from -# functions in REFERENCES_RELATION and REFERENCED_BY_RELATION lists will -# link to the source code. -# Otherwise they will link to the documentation. - -REFERENCES_LINK_SOURCE = YES - -# If the USE_HTAGS tag is set to YES then the references to source code -# will point to the HTML generated by the htags(1) tool instead of doxygen -# built-in source browser. The htags tool is part of GNU's global source -# tagging system (see http://www.gnu.org/software/global/global.html). You -# will need version 4.8.6 or higher. - -USE_HTAGS = NO - -# If the VERBATIM_HEADERS tag is set to YES (the default) then Doxygen -# will generate a verbatim copy of the header file for each class for -# which an include is specified. Set to NO to disable this. - -VERBATIM_HEADERS = YES - -#--------------------------------------------------------------------------- -# configuration options related to the alphabetical class index -#--------------------------------------------------------------------------- - -# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index -# of all compounds will be generated. Enable this if the project -# contains a lot of classes, structs, unions or interfaces. - -ALPHABETICAL_INDEX = YES - -# If the alphabetical index is enabled (see ALPHABETICAL_INDEX) then -# the COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns -# in which this list will be split (can be a number in the range [1..20]) - -COLS_IN_ALPHA_INDEX = 5 - -# In case all classes in a project start with a common prefix, all -# classes will be put under the same header in the alphabetical index. -# The IGNORE_PREFIX tag can be used to specify one or more prefixes that -# should be ignored while generating the index headers. - -IGNORE_PREFIX = - -#--------------------------------------------------------------------------- -# configuration options related to the HTML output -#--------------------------------------------------------------------------- - -# If the GENERATE_HTML tag is set to YES (the default) Doxygen will -# generate HTML output. - -GENERATE_HTML = YES - -# The HTML_OUTPUT tag is used to specify where the HTML docs will be put. -# If a relative path is entered the value of OUTPUT_DIRECTORY will be -# put in front of it. If left blank `html' will be used as the default path. - -HTML_OUTPUT = - -# The HTML_FILE_EXTENSION tag can be used to specify the file extension for -# each generated HTML page (for example: .htm,.php,.asp). If it is left blank -# doxygen will generate files with .html extension. - -HTML_FILE_EXTENSION = .html - -# The HTML_HEADER tag can be used to specify a personal HTML header for -# each generated HTML page. If it is left blank doxygen will generate a -# standard header. Note that when using a custom header you are responsible -# for the proper inclusion of any scripts and style sheets that doxygen -# needs, which is dependent on the configuration options used. 
-# It is advised to generate a default header using "doxygen -w html -# header.html footer.html stylesheet.css YourConfigFile" and then modify -# that header. Note that the header is subject to change so you typically -# have to redo this when upgrading to a newer version of doxygen or when -# changing the value of configuration settings such as GENERATE_TREEVIEW! - -HTML_HEADER = - -# The HTML_FOOTER tag can be used to specify a personal HTML footer for -# each generated HTML page. If it is left blank doxygen will generate a -# standard footer. - -HTML_FOOTER = - -# The HTML_STYLESHEET tag can be used to specify a user-defined cascading -# style sheet that is used by each HTML page. It can be used to -# fine-tune the look of the HTML output. If left blank doxygen will -# generate a default style sheet. Note that it is recommended to use -# HTML_EXTRA_STYLESHEET instead of this one, as it is more robust and this -# tag will in the future become obsolete. - -HTML_STYLESHEET = - -# The HTML_EXTRA_STYLESHEET tag can be used to specify an additional -# user-defined cascading style sheet that is included after the standard -# style sheets created by doxygen. Using this option one can overrule -# certain style aspects. This is preferred over using HTML_STYLESHEET -# since it does not replace the standard style sheet and is therefor more -# robust against future updates. Doxygen will copy the style sheet file to -# the output directory. - -HTML_EXTRA_STYLESHEET = - -# The HTML_EXTRA_FILES tag can be used to specify one or more extra images or -# other source files which should be copied to the HTML output directory. Note -# that these files will be copied to the base HTML output directory. Use the -# $relpath$ marker in the HTML_HEADER and/or HTML_FOOTER files to load these -# files. In the HTML_STYLESHEET file, use the file name only. Also note that -# the files will be copied as-is; there are no commands or markers available. - -HTML_EXTRA_FILES = - -# The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. -# Doxygen will adjust the colors in the style sheet and background images -# according to this color. Hue is specified as an angle on a colorwheel, -# see http://en.wikipedia.org/wiki/Hue for more information. -# For instance the value 0 represents red, 60 is yellow, 120 is green, -# 180 is cyan, 240 is blue, 300 purple, and 360 is red again. -# The allowed range is 0 to 359. - -HTML_COLORSTYLE_HUE = 220 - -# The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of -# the colors in the HTML output. For a value of 0 the output will use -# grayscales only. A value of 255 will produce the most vivid colors. - -HTML_COLORSTYLE_SAT = 100 - -# The HTML_COLORSTYLE_GAMMA tag controls the gamma correction applied to -# the luminance component of the colors in the HTML output. Values below -# 100 gradually make the output lighter, whereas values above 100 make -# the output darker. The value divided by 100 is the actual gamma applied, -# so 80 represents a gamma of 0.8, The value 220 represents a gamma of 2.2, -# and 100 does not change the gamma. - -HTML_COLORSTYLE_GAMMA = 80 - -# If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML -# page will contain the date and time when the page was generated. Setting -# this to NO can help when comparing the output of multiple runs. 
- -HTML_TIMESTAMP = NO - -# If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML -# documentation will contain sections that can be hidden and shown after the -# page has loaded. - -HTML_DYNAMIC_SECTIONS = NO - -# With HTML_INDEX_NUM_ENTRIES one can control the preferred number of -# entries shown in the various tree structured indices initially; the user -# can expand and collapse entries dynamically later on. Doxygen will expand -# the tree to such a level that at most the specified number of entries are -# visible (unless a fully collapsed tree already exceeds this amount). -# So setting the number of entries 1 will produce a full collapsed tree by -# default. 0 is a special value representing an infinite number of entries -# and will result in a full expanded tree by default. - -HTML_INDEX_NUM_ENTRIES = 100 - -# If the GENERATE_DOCSET tag is set to YES, additional index files -# will be generated that can be used as input for Apple's Xcode 3 -# integrated development environment, introduced with OSX 10.5 (Leopard). -# To create a documentation set, doxygen will generate a Makefile in the -# HTML output directory. Running make will produce the docset in that -# directory and running "make install" will install the docset in -# ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find -# it at startup. -# See http://developer.apple.com/tools/creatingdocsetswithdoxygen.html -# for more information. - -GENERATE_DOCSET = NO - -# When GENERATE_DOCSET tag is set to YES, this tag determines the name of the -# feed. A documentation feed provides an umbrella under which multiple -# documentation sets from a single provider (such as a company or product suite) -# can be grouped. - -DOCSET_FEEDNAME = "Doxygen generated docs" - -# When GENERATE_DOCSET tag is set to YES, this tag specifies a string that -# should uniquely identify the documentation set bundle. This should be a -# reverse domain-name style string, e.g. com.mycompany.MyDocSet. Doxygen -# will append .docset to the name. - -DOCSET_BUNDLE_ID = org.doxygen.Project - -# When GENERATE_PUBLISHER_ID tag specifies a string that should uniquely -# identify the documentation publisher. This should be a reverse domain-name -# style string, e.g. com.mycompany.MyDocSet.documentation. - -DOCSET_PUBLISHER_ID = org.doxygen.Publisher - -# The GENERATE_PUBLISHER_NAME tag identifies the documentation publisher. - -DOCSET_PUBLISHER_NAME = Publisher - -# If the GENERATE_HTMLHELP tag is set to YES, additional index files -# will be generated that can be used as input for tools like the -# Microsoft HTML help workshop to generate a compiled HTML help file (.chm) -# of the generated HTML documentation. - -GENERATE_HTMLHELP = NO - -# If the GENERATE_HTMLHELP tag is set to YES, the CHM_FILE tag can -# be used to specify the file name of the resulting .chm file. You -# can add a path in front of the file if the result should not be -# written to the html output directory. - -CHM_FILE = - -# If the GENERATE_HTMLHELP tag is set to YES, the HHC_LOCATION tag can -# be used to specify the location (absolute path including file name) of -# the HTML help compiler (hhc.exe). If non-empty doxygen will try to run -# the HTML help compiler on the generated index.hhp. - -HHC_LOCATION = - -# If the GENERATE_HTMLHELP tag is set to YES, the GENERATE_CHI flag -# controls if a separate .chi index file is generated (YES) or that -# it should be included in the main .chm file (NO). 
- -GENERATE_CHI = NO - -# If the GENERATE_HTMLHELP tag is set to YES, the CHM_INDEX_ENCODING -# is used to encode HtmlHelp index (hhk), content (hhc) and project file -# content. - -CHM_INDEX_ENCODING = - -# If the GENERATE_HTMLHELP tag is set to YES, the BINARY_TOC flag -# controls whether a binary table of contents is generated (YES) or a -# normal table of contents (NO) in the .chm file. - -BINARY_TOC = NO - -# The TOC_EXPAND flag can be set to YES to add extra items for group members -# to the contents of the HTML help documentation and to the tree view. - -TOC_EXPAND = NO - -# If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and -# QHP_VIRTUAL_FOLDER are set, an additional index file will be generated -# that can be used as input for Qt's qhelpgenerator to generate a -# Qt Compressed Help (.qch) of the generated HTML documentation. - -GENERATE_QHP = NO - -# If the QHG_LOCATION tag is specified, the QCH_FILE tag can -# be used to specify the file name of the resulting .qch file. -# The path specified is relative to the HTML output folder. - -QCH_FILE = - -# The QHP_NAMESPACE tag specifies the namespace to use when generating -# Qt Help Project output. For more information please see -# http://doc.trolltech.com/qthelpproject.html#namespace - -QHP_NAMESPACE = org.doxygen.Project - -# The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating -# Qt Help Project output. For more information please see -# http://doc.trolltech.com/qthelpproject.html#virtual-folders - -QHP_VIRTUAL_FOLDER = doc - -# If QHP_CUST_FILTER_NAME is set, it specifies the name of a custom filter to -# add. For more information please see -# http://doc.trolltech.com/qthelpproject.html#custom-filters - -QHP_CUST_FILTER_NAME = - -# The QHP_CUST_FILT_ATTRS tag specifies the list of the attributes of the -# custom filter to add. For more information please see -# -# Qt Help Project / Custom Filters. - -QHP_CUST_FILTER_ATTRS = - -# The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this -# project's -# filter section matches. -# -# Qt Help Project / Filter Attributes. - -QHP_SECT_FILTER_ATTRS = - -# If the GENERATE_QHP tag is set to YES, the QHG_LOCATION tag can -# be used to specify the location of Qt's qhelpgenerator. -# If non-empty doxygen will try to run qhelpgenerator on the generated -# .qhp file. - -QHG_LOCATION = - -# If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files -# will be generated, which together with the HTML files, form an Eclipse help -# plugin. To install this plugin and make it available under the help contents -# menu in Eclipse, the contents of the directory containing the HTML and XML -# files needs to be copied into the plugins directory of eclipse. The name of -# the directory within the plugins directory should be the same as -# the ECLIPSE_DOC_ID value. After copying Eclipse needs to be restarted before -# the help appears. - -GENERATE_ECLIPSEHELP = NO - -# A unique identifier for the eclipse help plugin. When installing the plugin -# the directory name containing the HTML and XML files should also have -# this name. - -ECLIPSE_DOC_ID = org.doxygen.Project - -# The DISABLE_INDEX tag can be used to turn on/off the condensed index (tabs) -# at top of each HTML page. The value NO (the default) enables the index and -# the value YES disables it. Since the tabs have the same information as the -# navigation tree you can set this option to NO if you already set -# GENERATE_TREEVIEW to YES. 
- -DISABLE_INDEX = NO - -# The GENERATE_TREEVIEW tag is used to specify whether a tree-like index -# structure should be generated to display hierarchical information. -# If the tag value is set to YES, a side panel will be generated -# containing a tree-like index structure (just like the one that -# is generated for HTML Help). For this to work a browser that supports -# JavaScript, DHTML, CSS and frames is required (i.e. any modern browser). -# Windows users are probably better off using the HTML help feature. -# Since the tree basically has the same information as the tab index you -# could consider to set DISABLE_INDEX to NO when enabling this option. - -GENERATE_TREEVIEW = NO - -# The ENUM_VALUES_PER_LINE tag can be used to set the number of enum values -# (range [0,1..20]) that doxygen will group on one line in the generated HTML -# documentation. Note that a value of 0 will completely suppress the enum -# values from appearing in the overview section. - -ENUM_VALUES_PER_LINE = 4 - -# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be -# used to set the initial width (in pixels) of the frame in which the tree -# is shown. - -TREEVIEW_WIDTH = 250 - -# When the EXT_LINKS_IN_WINDOW option is set to YES doxygen will open -# links to external symbols imported via tag files in a separate window. - -EXT_LINKS_IN_WINDOW = NO - -# Use this tag to change the font size of Latex formulas included -# as images in the HTML documentation. The default is 10. Note that -# when you change the font size after a successful doxygen run you need -# to manually remove any form_*.png images from the HTML output directory -# to force them to be regenerated. - -FORMULA_FONTSIZE = 10 - -# Use the FORMULA_TRANPARENT tag to determine whether or not the images -# generated for formulas are transparent PNGs. Transparent PNGs are -# not supported properly for IE 6.0, but are supported on all modern browsers. -# Note that when changing this option you need to delete any form_*.png files -# in the HTML output before the changes have effect. - -FORMULA_TRANSPARENT = YES - -# Enable the USE_MATHJAX option to render LaTeX formulas using MathJax -# (see http://www.mathjax.org) which uses client side Javascript for the -# rendering instead of using prerendered bitmaps. Use this if you do not -# have LaTeX installed or if you want to formulas look prettier in the HTML -# output. When enabled you may also need to install MathJax separately and -# configure the path to it using the MATHJAX_RELPATH option. - -USE_MATHJAX = NO - -# When MathJax is enabled you need to specify the location relative to the -# HTML output directory using the MATHJAX_RELPATH option. The destination -# directory should contain the MathJax.js script. For instance, if the mathjax -# directory is located at the same level as the HTML output directory, then -# MATHJAX_RELPATH should be ../mathjax. The default value points to -# the MathJax Content Delivery Network so you can quickly see the result without -# installing MathJax. -# However, it is strongly recommended to install a local -# copy of MathJax from http://www.mathjax.org before deployment. - -MATHJAX_RELPATH = http://cdn.mathjax.org/mathjax/latest - -# The MATHJAX_EXTENSIONS tag can be used to specify one or MathJax extension -# names that should be enabled during MathJax rendering. - -MATHJAX_EXTENSIONS = - -# When the SEARCHENGINE tag is enabled doxygen will generate a search box -# for the HTML output. 
The underlying search engine uses javascript -# and DHTML and should work on any modern browser. Note that when using -# HTML help (GENERATE_HTMLHELP), Qt help (GENERATE_QHP), or docsets -# (GENERATE_DOCSET) there is already a search function so this one should -# typically be disabled. For large projects the javascript based search engine -# can be slow, then enabling SERVER_BASED_SEARCH may provide a better solution. - -SEARCHENGINE = YES - -# When the SERVER_BASED_SEARCH tag is enabled the search engine will be -# implemented using a PHP enabled web server instead of at the web client -# using Javascript. Doxygen will generate the search PHP script and index -# file to put on the web server. The advantage of the server -# based approach is that it scales better to large projects and allows -# full text search. The disadvantages are that it is more difficult to setup -# and does not have live searching capabilities. - -SERVER_BASED_SEARCH = NO - -#--------------------------------------------------------------------------- -# configuration options related to the LaTeX output -#--------------------------------------------------------------------------- - -# If the GENERATE_LATEX tag is set to YES (the default) Doxygen will -# generate Latex output. - -GENERATE_LATEX = YES - -# The LATEX_OUTPUT tag is used to specify where the LaTeX docs will be put. -# If a relative path is entered the value of OUTPUT_DIRECTORY will be -# put in front of it. If left blank `latex' will be used as the default path. - -LATEX_OUTPUT = - -# The LATEX_CMD_NAME tag can be used to specify the LaTeX command name to be -# invoked. If left blank `latex' will be used as the default command name. -# Note that when enabling USE_PDFLATEX this option is only used for -# generating bitmaps for formulas in the HTML output, but not in the -# Makefile that is written to the output directory. - -LATEX_CMD_NAME = latex - -# The MAKEINDEX_CMD_NAME tag can be used to specify the command name to -# generate index for LaTeX. If left blank `makeindex' will be used as the -# default command name. - -MAKEINDEX_CMD_NAME = makeindex - -# If the COMPACT_LATEX tag is set to YES Doxygen generates more compact -# LaTeX documents. This may be useful for small projects and may help to -# save some trees in general. - -COMPACT_LATEX = NO - -# The PAPER_TYPE tag can be used to set the paper type that is used -# by the printer. Possible values are: a4, letter, legal and -# executive. If left blank a4wide will be used. - -PAPER_TYPE = a4wide - -# The EXTRA_PACKAGES tag can be to specify one or more names of LaTeX -# packages that should be included in the LaTeX output. - -EXTRA_PACKAGES = - -# The LATEX_HEADER tag can be used to specify a personal LaTeX header for -# the generated latex document. The header should contain everything until -# the first chapter. If it is left blank doxygen will generate a -# standard header. Notice: only use this tag if you know what you are doing! - -LATEX_HEADER = doc/doxygen/header.tex - -# The LATEX_FOOTER tag can be used to specify a personal LaTeX footer for -# the generated latex document. The footer should contain everything after -# the last chapter. If it is left blank doxygen will generate a -# standard footer. Notice: only use this tag if you know what you are doing! - -LATEX_FOOTER = - -# If the PDF_HYPERLINKS tag is set to YES, the LaTeX that is generated -# is prepared for conversion to pdf (using ps2pdf). 
The pdf file will -# contain links (just like the HTML output) instead of page references -# This makes the output suitable for online browsing using a pdf viewer. - -PDF_HYPERLINKS = YES - -# If the USE_PDFLATEX tag is set to YES, pdflatex will be used instead of -# plain latex in the generated Makefile. Set this option to YES to get a -# higher quality PDF documentation. - -USE_PDFLATEX = YES - -# If the LATEX_BATCHMODE tag is set to YES, doxygen will add the \\batchmode. -# command to the generated LaTeX files. This will instruct LaTeX to keep -# running if errors occur, instead of asking the user for help. -# This option is also used when generating formulas in HTML. - -LATEX_BATCHMODE = NO - -# If LATEX_HIDE_INDICES is set to YES then doxygen will not -# include the index chapters (such as File Index, Compound Index, etc.) -# in the output. - -LATEX_HIDE_INDICES = NO - -# If LATEX_SOURCE_CODE is set to YES then doxygen will include -# source code with syntax highlighting in the LaTeX output. -# Note that which sources are shown also depends on other settings -# such as SOURCE_BROWSER. - -LATEX_SOURCE_CODE = NO - -# The LATEX_BIB_STYLE tag can be used to specify the style to use for the -# bibliography, e.g. plainnat, or ieeetr. The default style is "plain". See -# http://en.wikipedia.org/wiki/BibTeX for more info. - -LATEX_BIB_STYLE = plain - -#--------------------------------------------------------------------------- -# configuration options related to the RTF output -#--------------------------------------------------------------------------- - -# If the GENERATE_RTF tag is set to YES Doxygen will generate RTF output -# The RTF output is optimized for Word 97 and may not look very pretty with -# other RTF readers or editors. - -GENERATE_RTF = NO - -# The RTF_OUTPUT tag is used to specify where the RTF docs will be put. -# If a relative path is entered the value of OUTPUT_DIRECTORY will be -# put in front of it. If left blank `rtf' will be used as the default path. - -RTF_OUTPUT = - -# If the COMPACT_RTF tag is set to YES Doxygen generates more compact -# RTF documents. This may be useful for small projects and may help to -# save some trees in general. - -COMPACT_RTF = NO - -# If the RTF_HYPERLINKS tag is set to YES, the RTF that is generated -# will contain hyperlink fields. The RTF file will -# contain links (just like the HTML output) instead of page references. -# This makes the output suitable for online browsing using WORD or other -# programs which support those fields. -# Note: wordpad (write) and others do not support links. - -RTF_HYPERLINKS = NO - -# Load style sheet definitions from file. Syntax is similar to doxygen's -# config file, i.e. a series of assignments. You only have to provide -# replacements, missing definitions are set to their default value. - -RTF_STYLESHEET_FILE = - -# Set optional variables used in the generation of an rtf document. -# Syntax is similar to doxygen's config file. - -RTF_EXTENSIONS_FILE = - -#--------------------------------------------------------------------------- -# configuration options related to the man page output -#--------------------------------------------------------------------------- - -# If the GENERATE_MAN tag is set to YES (the default) Doxygen will -# generate man pages - -GENERATE_MAN = NO - -# The MAN_OUTPUT tag is used to specify where the man pages will be put. -# If a relative path is entered the value of OUTPUT_DIRECTORY will be -# put in front of it. If left blank `man' will be used as the default path. 
- -MAN_OUTPUT = - -# The MAN_EXTENSION tag determines the extension that is added to -# the generated man pages (default is the subroutine's section .3) - -MAN_EXTENSION = - -# If the MAN_LINKS tag is set to YES and Doxygen generates man output, -# then it will generate one additional man file for each entity -# documented in the real man page(s). These additional files -# only source the real man page, but without them the man command -# would be unable to find the correct page. The default is NO. - -MAN_LINKS = NO - -#--------------------------------------------------------------------------- -# configuration options related to the XML output -#--------------------------------------------------------------------------- - -# If the GENERATE_XML tag is set to YES Doxygen will -# generate an XML file that captures the structure of -# the code including all documentation. - -GENERATE_XML = NO - -# The XML_OUTPUT tag is used to specify where the XML pages will be put. -# If a relative path is entered the value of OUTPUT_DIRECTORY will be -# put in front of it. If left blank `xml' will be used as the default path. - -XML_OUTPUT = xml - -# The XML_SCHEMA tag can be used to specify an XML schema, -# which can be used by a validating XML parser to check the -# syntax of the XML files. - -XML_SCHEMA = - -# The XML_DTD tag can be used to specify an XML DTD, -# which can be used by a validating XML parser to check the -# syntax of the XML files. - -XML_DTD = - -# If the XML_PROGRAMLISTING tag is set to YES Doxygen will -# dump the program listings (including syntax highlighting -# and cross-referencing information) to the XML output. Note that -# enabling this will significantly increase the size of the XML output. - -XML_PROGRAMLISTING = YES - -#--------------------------------------------------------------------------- -# configuration options for the AutoGen Definitions output -#--------------------------------------------------------------------------- - -# If the GENERATE_AUTOGEN_DEF tag is set to YES Doxygen will -# generate an AutoGen Definitions (see autogen.sf.net) file -# that captures the structure of the code including all -# documentation. Note that this feature is still experimental -# and incomplete at the moment. - -GENERATE_AUTOGEN_DEF = NO - -#--------------------------------------------------------------------------- -# configuration options related to the Perl module output -#--------------------------------------------------------------------------- - -# If the GENERATE_PERLMOD tag is set to YES Doxygen will -# generate a Perl module file that captures the structure of -# the code including all documentation. Note that this -# feature is still experimental and incomplete at the -# moment. - -GENERATE_PERLMOD = NO - -# If the PERLMOD_LATEX tag is set to YES Doxygen will generate -# the necessary Makefile rules, Perl scripts and LaTeX code to be able -# to generate PDF and DVI output from the Perl module output. - -PERLMOD_LATEX = NO - -# If the PERLMOD_PRETTY tag is set to YES the Perl module output will be -# nicely formatted so it can be parsed by a human reader. -# This is useful -# if you want to understand what is going on. -# On the other hand, if this -# tag is set to NO the size of the Perl module output will be much smaller -# and Perl will parse it just the same. - -PERLMOD_PRETTY = YES - -# The names of the make variables in the generated doxyrules.make file -# are prefixed with the string contained in PERLMOD_MAKEVAR_PREFIX. 
-# This is useful so different doxyrules.make files included by the same -# Makefile don't overwrite each other's variables. - -PERLMOD_MAKEVAR_PREFIX = - -#--------------------------------------------------------------------------- -# Configuration options related to the preprocessor -#--------------------------------------------------------------------------- - -# If the ENABLE_PREPROCESSING tag is set to YES (the default) Doxygen will -# evaluate all C-preprocessor directives found in the sources and include -# files. - -ENABLE_PREPROCESSING = YES - -# If the MACRO_EXPANSION tag is set to YES Doxygen will expand all macro -# names in the source code. If set to NO (the default) only conditional -# compilation will be performed. Macro expansion can be done in a controlled -# way by setting EXPAND_ONLY_PREDEF to YES. - -MACRO_EXPANSION = YES - -# If the EXPAND_ONLY_PREDEF and MACRO_EXPANSION tags are both set to YES -# then the macro expansion is limited to the macros specified with the -# PREDEFINED and EXPAND_AS_DEFINED tags. - -EXPAND_ONLY_PREDEF = YES - -# If the SEARCH_INCLUDES tag is set to YES (the default) the includes files -# pointed to by INCLUDE_PATH will be searched when a #include is found. - -SEARCH_INCLUDES = YES - -# The INCLUDE_PATH tag can be used to specify one or more directories that -# contain include files that are not input files but should be processed by -# the preprocessor. - -INCLUDE_PATH = - -# You can use the INCLUDE_FILE_PATTERNS tag to specify one or more wildcard -# patterns (like *.h and *.hpp) to filter out the header-files in the -# directories. If left blank, the patterns specified with FILE_PATTERNS will -# be used. - -INCLUDE_FILE_PATTERNS = - -# The PREDEFINED tag can be used to specify one or more macro names that -# are defined before the preprocessor is started (similar to the -D option of -# gcc). The argument of the tag is a list of macros of the form: name -# or name=definition (no spaces). If the definition and the = are -# omitted =1 is assumed. To prevent a macro definition from being -# undefined via #undef or recursively expanded use the := operator -# instead of the = operator. - -PREDEFINED = OMP_30_ENABLED=1, OMP_40_ENABLED=1, KMP_STATS_ENABLED=1 - -# If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then -# this tag can be used to specify a list of macro names that should be expanded. -# The macro definition that is found in the sources will be used. -# Use the PREDEFINED tag if you want to use a different macro definition that -# overrules the definition found in the source code. - -EXPAND_AS_DEFINED = - -# If the SKIP_FUNCTION_MACROS tag is set to YES (the default) then -# doxygen's preprocessor will remove all references to function-like macros -# that are alone on a line, have an all uppercase name, and do not end with a -# semicolon, because these will confuse the parser if not removed. - -SKIP_FUNCTION_MACROS = YES - -#--------------------------------------------------------------------------- -# Configuration::additions related to external references -#--------------------------------------------------------------------------- - -# The TAGFILES option can be used to specify one or more tagfiles. For each -# tag file the location of the external documentation should be added. The -# format of a tag file without this location is as follows: -# -# TAGFILES = file1 file2 ... -# Adding location for the tag files is done as follows: -# -# TAGFILES = file1=loc1 "file2 = loc2" ... 
-# where "loc1" and "loc2" can be relative or absolute paths -# or URLs. Note that each tag file must have a unique name (where the name does -# NOT include the path). If a tag file is not located in the directory in which -# doxygen is run, you must also specify the path to the tagfile here. - -TAGFILES = - -# When a file name is specified after GENERATE_TAGFILE, doxygen will create -# a tag file that is based on the input files it reads. - -GENERATE_TAGFILE = - -# If the ALLEXTERNALS tag is set to YES all external classes will be listed -# in the class index. If set to NO only the inherited external classes -# will be listed. - -ALLEXTERNALS = NO - -# If the EXTERNAL_GROUPS tag is set to YES all external groups will be listed -# in the modules index. If set to NO, only the current project's groups will -# be listed. - -EXTERNAL_GROUPS = YES - -# The PERL_PATH should be the absolute path and name of the perl script -# interpreter (i.e. the result of `which perl'). - -PERL_PATH = - -#--------------------------------------------------------------------------- -# Configuration options related to the dot tool -#--------------------------------------------------------------------------- - -# If the CLASS_DIAGRAMS tag is set to YES (the default) Doxygen will -# generate a inheritance diagram (in HTML, RTF and LaTeX) for classes with base -# or super classes. Setting the tag to NO turns the diagrams off. Note that -# this option also works with HAVE_DOT disabled, but it is recommended to -# install and use dot, since it yields more powerful graphs. - -CLASS_DIAGRAMS = YES - -# You can define message sequence charts within doxygen comments using the \msc -# command. Doxygen will then run the mscgen tool (see -# http://www.mcternan.me.uk/mscgen/) to produce the chart and insert it in the -# documentation. The MSCGEN_PATH tag allows you to specify the directory where -# the mscgen tool resides. If left empty the tool is assumed to be found in the -# default search path. - -MSCGEN_PATH = - -# If set to YES, the inheritance and collaboration graphs will hide -# inheritance and usage relations if the target is undocumented -# or is not a class. - -HIDE_UNDOC_RELATIONS = YES - -# If you set the HAVE_DOT tag to YES then doxygen will assume the dot tool is -# available from the path. This tool is part of Graphviz, a graph visualization -# toolkit from AT&T and Lucent Bell Labs. The other options in this section -# have no effect if this option is set to NO (the default) - -HAVE_DOT = NO - -# The DOT_NUM_THREADS specifies the number of dot invocations doxygen is -# allowed to run in parallel. When set to 0 (the default) doxygen will -# base this on the number of processors available in the system. You can set it -# explicitly to a value larger than 0 to get control over the balance -# between CPU load and processing speed. - -DOT_NUM_THREADS = 0 - -# By default doxygen will use the Helvetica font for all dot files that -# doxygen generates. When you want a differently looking font you can specify -# the font name using DOT_FONTNAME. You need to make sure dot is able to find -# the font, which can be done by putting it in a standard location or by setting -# the DOTFONTPATH environment variable or by setting DOT_FONTPATH to the -# directory containing the font. - -DOT_FONTNAME = Helvetica - -# The DOT_FONTSIZE tag can be used to set the size of the font of dot graphs. -# The default size is 10pt. - -DOT_FONTSIZE = 10 - -# By default doxygen will tell dot to use the Helvetica font. 
-# If you specify a different font using DOT_FONTNAME you can use DOT_FONTPATH to -# set the path where dot can find it. - -DOT_FONTPATH = - -# If the CLASS_GRAPH and HAVE_DOT tags are set to YES then doxygen -# will generate a graph for each documented class showing the direct and -# indirect inheritance relations. Setting this tag to YES will force the -# CLASS_DIAGRAMS tag to NO. - -CLASS_GRAPH = YES - -# If the COLLABORATION_GRAPH and HAVE_DOT tags are set to YES then doxygen -# will generate a graph for each documented class showing the direct and -# indirect implementation dependencies (inheritance, containment, and -# class references variables) of the class with other documented classes. - -COLLABORATION_GRAPH = NO - -# If the GROUP_GRAPHS and HAVE_DOT tags are set to YES then doxygen -# will generate a graph for groups, showing the direct groups dependencies - -GROUP_GRAPHS = YES - -# If the UML_LOOK tag is set to YES doxygen will generate inheritance and -# collaboration diagrams in a style similar to the OMG's Unified Modeling -# Language. - -UML_LOOK = NO - -# If the UML_LOOK tag is enabled, the fields and methods are shown inside -# the class node. If there are many fields or methods and many nodes the -# graph may become too big to be useful. The UML_LIMIT_NUM_FIELDS -# threshold limits the number of items for each type to make the size more -# manageable. Set this to 0 for no limit. Note that the threshold may be -# exceeded by 50% before the limit is enforced. - -UML_LIMIT_NUM_FIELDS = 10 - -# If set to YES, the inheritance and collaboration graphs will show the -# relations between templates and their instances. - -TEMPLATE_RELATIONS = YES - -# If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDE_GRAPH, and HAVE_DOT -# tags are set to YES then doxygen will generate a graph for each documented -# file showing the direct and indirect include dependencies of the file with -# other documented files. - -INCLUDE_GRAPH = NO - -# If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDED_BY_GRAPH, and -# HAVE_DOT tags are set to YES then doxygen will generate a graph for each -# documented header file showing the documented files that directly or -# indirectly include this file. - -INCLUDED_BY_GRAPH = NO - -# If the CALL_GRAPH and HAVE_DOT options are set to YES then -# doxygen will generate a call dependency graph for every global function -# or class method. Note that enabling this option will significantly increase -# the time of a run. So in most cases it will be better to enable call graphs -# for selected functions only using the \callgraph command. - -CALL_GRAPH = NO - -# If the CALLER_GRAPH and HAVE_DOT tags are set to YES then -# doxygen will generate a caller dependency graph for every global function -# or class method. Note that enabling this option will significantly increase -# the time of a run. So in most cases it will be better to enable caller -# graphs for selected functions only using the \callergraph command. - -CALLER_GRAPH = NO - -# If the GRAPHICAL_HIERARCHY and HAVE_DOT tags are set to YES then doxygen -# will generate a graphical hierarchy of all classes instead of a textual one. - -GRAPHICAL_HIERARCHY = YES - -# If the DIRECTORY_GRAPH and HAVE_DOT tags are set to YES -# then doxygen will show the dependencies a directory has on other directories -# in a graphical way. The dependency relations are determined by the #include -# relations between the files in the directories. 
- -DIRECTORY_GRAPH = YES - -# The DOT_IMAGE_FORMAT tag can be used to set the image format of the images -# generated by dot. Possible values are svg, png, jpg, or gif. -# If left blank png will be used. If you choose svg you need to set -# HTML_FILE_EXTENSION to xhtml in order to make the SVG files -# visible in IE 9+ (other browsers do not have this requirement). - -DOT_IMAGE_FORMAT = png - -# If DOT_IMAGE_FORMAT is set to svg, then this option can be set to YES to -# enable generation of interactive SVG images that allow zooming and panning. -# Note that this requires a modern browser other than Internet Explorer. -# Tested and working are Firefox, Chrome, Safari, and Opera. For IE 9+ you -# need to set HTML_FILE_EXTENSION to xhtml in order to make the SVG files -# visible. Older versions of IE do not have SVG support. - -INTERACTIVE_SVG = NO - -# The tag DOT_PATH can be used to specify the path where the dot tool can be -# found. If left blank, it is assumed the dot tool can be found in the path. - -DOT_PATH = - -# The DOTFILE_DIRS tag can be used to specify one or more directories that -# contain dot files that are included in the documentation (see the -# \dotfile command). - -DOTFILE_DIRS = - -# The MSCFILE_DIRS tag can be used to specify one or more directories that -# contain msc files that are included in the documentation (see the -# \mscfile command). - -MSCFILE_DIRS = - -# The DOT_GRAPH_MAX_NODES tag can be used to set the maximum number of -# nodes that will be shown in the graph. If the number of nodes in a graph -# becomes larger than this value, doxygen will truncate the graph, which is -# visualized by representing a node as a red box. Note that doxygen if the -# number of direct children of the root node in a graph is already larger than -# DOT_GRAPH_MAX_NODES then the graph will not be shown at all. Also note -# that the size of a graph can be further restricted by MAX_DOT_GRAPH_DEPTH. - -DOT_GRAPH_MAX_NODES = 50 - -# The MAX_DOT_GRAPH_DEPTH tag can be used to set the maximum depth of the -# graphs generated by dot. A depth value of 3 means that only nodes reachable -# from the root by following a path via at most 3 edges will be shown. Nodes -# that lay further from the root node will be omitted. Note that setting this -# option to 1 or 2 may greatly reduce the computation time needed for large -# code bases. Also note that the size of a graph can be further restricted by -# DOT_GRAPH_MAX_NODES. Using a depth of 0 means no depth restriction. - -MAX_DOT_GRAPH_DEPTH = 0 - -# Set the DOT_TRANSPARENT tag to YES to generate images with a transparent -# background. This is disabled by default, because dot on Windows does not -# seem to support this out of the box. Warning: Depending on the platform used, -# enabling this option may lead to badly anti-aliased labels on the edges of -# a graph (i.e. they become hard to read). - -DOT_TRANSPARENT = NO - -# Set the DOT_MULTI_TARGETS tag to YES allow dot to generate multiple output -# files in one run (i.e. multiple -o and -T options on the command line). This -# makes dot run faster, but since only newer versions of dot (>1.8.10) -# support this, this feature is disabled by default. - -DOT_MULTI_TARGETS = NO - -# If the GENERATE_LEGEND tag is set to YES (the default) Doxygen will -# generate a legend page explaining the meaning of the various boxes and -# arrows in the dot generated graphs. 
-
-GENERATE_LEGEND = YES
-
-# If the DOT_CLEANUP tag is set to YES (the default) Doxygen will
-# remove the intermediate dot files that are used to generate
-# the various graphs.
-
-DOT_CLEANUP = YES
+# Doxyfile 1.8.2
+
+# This file describes the settings to be used by the documentation system
+# doxygen (www.doxygen.org) for a project.
+#
+# All text after a hash (#) is considered a comment and will be ignored.
+# The format is:
+# TAG = value [value, ...]
+# For lists, items can also be appended using:
+# TAG += value [value, ...]
+# Values that contain spaces should be placed between quotes (" ").
+
+#---------------------------------------------------------------------------
+# Project related configuration options
+#---------------------------------------------------------------------------
+
+# This tag specifies the encoding used for all characters in the config file
+# that follow. The default is UTF-8 which is also the encoding used for all
+# text before the first occurrence of this tag. Doxygen uses libiconv (or the
+# iconv built into libc) for the transcoding. See
+# http://www.gnu.org/software/libiconv for the list of possible encodings.
+
+DOXYFILE_ENCODING = UTF-8
+
+# The PROJECT_NAME tag is a single word (or sequence of words) that should
+# identify the project. Note that if you do not use Doxywizard you need
+# to put quotes around the project name if it contains spaces.
+
+PROJECT_NAME = "LLVM OpenMP* Runtime Library"
+
+# The PROJECT_NUMBER tag can be used to enter a project or revision number.
+# This could be handy for archiving the generated documentation or
+# if some version control system is used.
+
+PROJECT_NUMBER =
+
+# Using the PROJECT_BRIEF tag one can provide an optional one line description
+# for a project that appears at the top of each page and should give the viewer
+# a quick idea about the purpose of the project. Keep the description short.
+
+PROJECT_BRIEF =
+
+# With the PROJECT_LOGO tag one can specify a logo or icon that is
+# included in the documentation. The maximum height of the logo should not
+# exceed 55 pixels and the maximum width should not exceed 200 pixels.
+# Doxygen will copy the logo to the output directory.
+
+PROJECT_LOGO =
+
+# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute)
+# base path where the generated documentation will be put.
+# If a relative path is entered, it will be relative to the location
+# where doxygen was started. If left blank the current directory will be used.
+
+OUTPUT_DIRECTORY = doc/doxygen/generated
+
+# If the CREATE_SUBDIRS tag is set to YES, then doxygen will create
+# 4096 sub-directories (in 2 levels) under the output directory of each output
+# format and will distribute the generated files over these directories.
+# Enabling this option can be useful when feeding doxygen a huge amount of
+# source files, where putting all generated files in the same directory would
+# otherwise cause performance problems for the file system.
+
+CREATE_SUBDIRS = NO
+
+# The OUTPUT_LANGUAGE tag is used to specify the language in which all
+# documentation generated by doxygen is written. Doxygen will use this
+# information to generate all constant output in the proper language.
+# The default language is English, other supported languages are:
+# Afrikaans, Arabic, Brazilian, Catalan, Chinese, Chinese-Traditional,
+# Croatian, Czech, Danish, Dutch, Esperanto, Farsi, Finnish, French, German,
+# Greek, Hungarian, Italian, Japanese, Japanese-en (Japanese with English
+# messages), Korean, Korean-en, Lithuanian, Norwegian, Macedonian, Persian,
+# Polish, Portuguese, Romanian, Russian, Serbian, Serbian-Cyrillic, Slovak,
+# Slovene, Spanish, Swedish, Ukrainian, and Vietnamese.
+
+OUTPUT_LANGUAGE = English
+
+# If the BRIEF_MEMBER_DESC tag is set to YES (the default) Doxygen will
+# include brief member descriptions after the members that are listed in
+# the file and class documentation (similar to JavaDoc).
+# Set to NO to disable this.
+
+BRIEF_MEMBER_DESC = YES
+
+# If the REPEAT_BRIEF tag is set to YES (the default) Doxygen will prepend
+# the brief description of a member or function before the detailed description.
+# Note: if both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the
+# brief descriptions will be completely suppressed.
+
+REPEAT_BRIEF = YES
+
+# This tag implements a quasi-intelligent brief description abbreviator
+# that is used to form the text in various listings. Each string
+# in this list, if found as the leading text of the brief description, will be
+# stripped from the text and the result after processing the whole list, is
+# used as the annotated text. Otherwise, the brief description is used as-is.
+# If left blank, the following values are used ("$name" is automatically
+# replaced with the name of the entity): "The $name class" "The $name widget"
+# "The $name file" "is" "provides" "specifies" "contains"
+# "represents" "a" "an" "the"
+
+ABBREVIATE_BRIEF =
+
+# If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then
+# Doxygen will generate a detailed section even if there is only a brief
+# description.
+
+ALWAYS_DETAILED_SEC = NO
+
+# If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all
+# inherited members of a class in the documentation of that class as if those
+# members were ordinary class members. Constructors, destructors and assignment
+# operators of the base classes will not be shown.
+
+INLINE_INHERITED_MEMB = NO
+
+# If the FULL_PATH_NAMES tag is set to YES then Doxygen will prepend the full
+# path before file names in the file list and in the header files. If set
+# to NO the shortest path that makes the file name unique will be used.
+
+FULL_PATH_NAMES = NO
+
+# If the FULL_PATH_NAMES tag is set to YES then the STRIP_FROM_PATH tag
+# can be used to strip a user-defined part of the path. Stripping is
+# only done if one of the specified strings matches the left-hand part of
+# the path. The tag can be used to show relative paths in the file list.
+# If left blank the directory from which doxygen is run is used as the
+# path to strip. Note that you specify absolute paths here, but also
+# relative paths, which will be relative from the directory where doxygen is
+# started.
+
+STRIP_FROM_PATH =
+
+# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of
+# the path mentioned in the documentation of a class, which tells
+# the reader which header file to include in order to use a class.
+# If left blank only the name of the header file containing the class
+# definition is used. Otherwise one should specify the include paths that
+# are normally passed to the compiler using the -I flag.
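+# A sketch only (the path below is hypothetical, not this project's layout):
+# with
+#   STRIP_FROM_INC_PATH = /home/user/llvm/openmp/runtime/src
+# the generated pages would suggest #include "kmp.h" rather than the full
+# absolute path of the header.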
+
+STRIP_FROM_INC_PATH =
+
+# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter
+# (but less readable) file names. This can be useful if your file system
+# doesn't support long names like on DOS, Mac, or CD-ROM.
+
+SHORT_NAMES = NO
+
+# If the JAVADOC_AUTOBRIEF tag is set to YES then Doxygen
+# will interpret the first line (until the first dot) of a JavaDoc-style
+# comment as the brief description. If set to NO, the JavaDoc
+# comments will behave just like regular Qt-style comments
+# (thus requiring an explicit @brief command for a brief description.)
+
+JAVADOC_AUTOBRIEF = NO
+
+# If the QT_AUTOBRIEF tag is set to YES then Doxygen will
+# interpret the first line (until the first dot) of a Qt-style
+# comment as the brief description. If set to NO, the comments
+# will behave just like regular Qt-style comments (thus requiring
+# an explicit \brief command for a brief description.)
+
+QT_AUTOBRIEF = NO
+
+# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make Doxygen
+# treat a multi-line C++ special comment block (i.e. a block of //! or ///
+# comments) as a brief description. This used to be the default behaviour.
+# The new default is to treat a multi-line C++ comment block as a detailed
+# description. Set this tag to YES if you prefer the old behaviour instead.
+
+MULTILINE_CPP_IS_BRIEF = NO
+
+# If the INHERIT_DOCS tag is set to YES (the default) then an undocumented
+# member inherits the documentation from any documented member that it
+# re-implements.
+
+INHERIT_DOCS = YES
+
+# If the SEPARATE_MEMBER_PAGES tag is set to YES, then doxygen will produce
+# a new page for each member. If set to NO, the documentation of a member will
+# be part of the file/class/namespace that contains it.
+
+SEPARATE_MEMBER_PAGES = NO
+
+# The TAB_SIZE tag can be used to set the number of spaces in a tab.
+# Doxygen uses this value to replace tabs by spaces in code fragments.
+
+TAB_SIZE = 8
+
+# This tag can be used to specify a number of aliases that act
+# as commands in the documentation. An alias has the form "name=value".
+# For example adding "sideeffect=\par Side Effects:\n" will allow you to
+# put the command \sideeffect (or @sideeffect) in the documentation, which
+# will result in a user-defined paragraph with heading "Side Effects:".
+# You can put \n's in the value part of an alias to insert newlines.
+
+ALIASES = "other=*"
+
+# This tag can be used to specify a number of word-keyword mappings (TCL only).
+# A mapping has the form "name=value". For example adding
+# "class=itcl::class" will allow you to use the command class in the
+# itcl::class meaning.
+
+TCL_SUBST =
+
+# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C
+# sources only. Doxygen will then generate output that is more tailored for C.
+# For instance, some of the names that are used will be different. The list
+# of all members will be omitted, etc.
+
+OPTIMIZE_OUTPUT_FOR_C = NO
+
+# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java
+# sources only. Doxygen will then generate output that is more tailored for
+# Java. For instance, namespaces will be presented as packages, qualified
+# scopes will look different, etc.
+
+OPTIMIZE_OUTPUT_JAVA = NO
+
+# Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran
+# sources only. Doxygen will then generate output that is more tailored for
+# Fortran.
+
+OPTIMIZE_FOR_FORTRAN = NO
+
+# Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL
+# sources.
Doxygen will then generate output that is tailored for
+# VHDL.
+
+OPTIMIZE_OUTPUT_VHDL = NO
+
+# Doxygen selects the parser to use depending on the extension of the files it
+# parses. With this tag you can assign which parser to use for a given
+# extension. Doxygen has a built-in mapping, but you can override or extend it
+# using this tag. The format is ext=language, where ext is a file extension,
+# and language is one of the parsers supported by doxygen: IDL, Java,
+# Javascript, CSharp, C, C++, D, PHP, Objective-C, Python, Fortran, VHDL, C,
+# C++. For instance to make doxygen treat .inc files as Fortran files (default
+# is PHP), and .f files as C (default is Fortran), use: inc=Fortran f=C. Note
+# that for custom extensions you also need to set FILE_PATTERNS otherwise the
+# files are not read by doxygen.
+
+EXTENSION_MAPPING =
+
+# If MARKDOWN_SUPPORT is enabled (the default) then doxygen pre-processes all
+# comments according to the Markdown format, which allows for more readable
+# documentation. See http://daringfireball.net/projects/markdown/ for details.
+# The output of markdown processing is further processed by doxygen, so you
+# can mix doxygen, HTML, and XML commands with Markdown formatting.
+# Disable only in case of backward compatibility issues.
+
+MARKDOWN_SUPPORT = YES
+
+# When enabled doxygen tries to link words that correspond to documented classes,
+# or namespaces to their corresponding documentation. Such a link can be
+# prevented in individual cases by putting a % sign in front of the word or
+# globally by setting AUTOLINK_SUPPORT to NO.
+
+AUTOLINK_SUPPORT = YES
+
+# If you use STL classes (i.e. std::string, std::vector, etc.) but do not want
+# to include (a tag file for) the STL sources as input, then you should
+# set this tag to YES in order to let doxygen match function declarations and
+# definitions whose arguments contain STL classes (e.g. func(std::string); vs.
+# func(std::string) {}). This also makes the inheritance and collaboration
+# diagrams that involve STL classes more complete and accurate.
+
+BUILTIN_STL_SUPPORT = NO
+
+# If you use Microsoft's C++/CLI language, you should set this option to YES to
+# enable parsing support.
+
+CPP_CLI_SUPPORT = NO
+
+# Set the SIP_SUPPORT tag to YES if your project consists of sip sources only.
+# Doxygen will parse them like normal C++ but will assume all classes use public
+# instead of private inheritance when no explicit protection keyword is present.
+
+SIP_SUPPORT = NO
+
+# For Microsoft's IDL there are propget and propput attributes to
+# indicate getter and setter methods for a property. Setting this
+# option to YES (the default) will make doxygen replace the get and
+# set methods by a property in the documentation. This will only work
+# if the methods are indeed getting or setting a simple type. If this
+# is not the case, or you want to show the methods anyway, you should
+# set this option to NO.
+
+IDL_PROPERTY_SUPPORT = YES
+
+# If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC
+# tag is set to YES, then doxygen will reuse the documentation of the first
+# member in the group (if any) for the other members of the group. By default
+# all members of a group must be documented explicitly.
+
+DISTRIBUTE_GROUP_DOC = NO
+
+# Set the SUBGROUPING tag to YES (the default) to allow class member groups of
+# the same type (for instance a group of public functions) to be put as a
+# subgroup of that type (e.g. under the Public Functions section).
Set it to
+# NO to prevent subgrouping. Alternatively, this can be done per class using
+# the \nosubgrouping command.
+
+SUBGROUPING = YES
+
+# When the INLINE_GROUPED_CLASSES tag is set to YES, classes, structs and
+# unions are shown inside the group in which they are included (e.g. using
+# @ingroup) instead of on a separate page (for HTML and Man pages) or
+# section (for LaTeX and RTF).
+
+INLINE_GROUPED_CLASSES = NO
+
+# When the INLINE_SIMPLE_STRUCTS tag is set to YES, structs, classes, and
+# unions with only public data fields will be shown inline in the documentation
+# of the scope in which they are defined (i.e. file, namespace, or group
+# documentation), provided this scope is documented. If set to NO (the default),
+# structs, classes, and unions are shown on a separate page (for HTML and Man
+# pages) or section (for LaTeX and RTF).
+
+INLINE_SIMPLE_STRUCTS = NO
+
+# When TYPEDEF_HIDES_STRUCT is enabled, a typedef of a struct, union, or enum
+# is documented as struct, union, or enum with the name of the typedef. So
+# typedef struct TypeS {} TypeT, will appear in the documentation as a struct
+# with name TypeT. When disabled the typedef will appear as a member of a file,
+# namespace, or class. And the struct will be named TypeS. This can typically
+# be useful for C code in case the coding convention dictates that all compound
+# types are typedef'ed and only the typedef is referenced, never the tag name.
+
+TYPEDEF_HIDES_STRUCT = NO
+
+# The SYMBOL_CACHE_SIZE determines the size of the internal cache used to
+# determine which symbols to keep in memory and which to flush to disk.
+# When the cache is full, less often used symbols will be written to disk.
+# For small to medium size projects (<1000 input files) the default value is
+# probably good enough. For larger projects a too small cache size can cause
+# doxygen to be busy swapping symbols to and from disk most of the time
+# causing a significant performance penalty.
+# If the system has enough physical memory, increasing the cache will improve the
+# performance by keeping more symbols in memory. Note that the value works on
+# a logarithmic scale so increasing the size by one will roughly double the
+# memory usage. The cache size is given by this formula:
+# 2^(16+SYMBOL_CACHE_SIZE). The valid range is 0..9, the default is 0,
+# corresponding to a cache size of 2^16 = 65536 symbols.
+
+SYMBOL_CACHE_SIZE = 0
+
+# Similar to the SYMBOL_CACHE_SIZE the size of the symbol lookup cache can be
+# set using LOOKUP_CACHE_SIZE. This cache is used to resolve symbols given
+# their name and scope. Since this can be an expensive process and often the
+# same symbol appears multiple times in the code, doxygen keeps a cache of
+# pre-resolved symbols. If the cache is too small doxygen will become slower.
+# If the cache is too large, memory is wasted. The cache size is given by this
+# formula: 2^(16+LOOKUP_CACHE_SIZE). The valid range is 0..9, the default is 0,
+# corresponding to a cache size of 2^16 = 65536 symbols.
+
+LOOKUP_CACHE_SIZE = 0
+
+#---------------------------------------------------------------------------
+# Build related configuration options
+#---------------------------------------------------------------------------
+
+# If the EXTRACT_ALL tag is set to YES doxygen will assume all entities in
+# documentation are documented, even if no documentation was available.
+# Private class members and static file members will be hidden unless +# the EXTRACT_PRIVATE and EXTRACT_STATIC tags are set to YES + +EXTRACT_ALL = NO + +# If the EXTRACT_PRIVATE tag is set to YES all private members of a class +# will be included in the documentation. + +EXTRACT_PRIVATE = YES + +# If the EXTRACT_PACKAGE tag is set to YES all members with package or internal +# scope will be included in the documentation. + +EXTRACT_PACKAGE = NO + +# If the EXTRACT_STATIC tag is set to YES all static members of a file +# will be included in the documentation. + +EXTRACT_STATIC = YES + +# If the EXTRACT_LOCAL_CLASSES tag is set to YES classes (and structs) +# defined locally in source files will be included in the documentation. +# If set to NO only classes defined in header files are included. + +EXTRACT_LOCAL_CLASSES = YES + +# This flag is only useful for Objective-C code. When set to YES local +# methods, which are defined in the implementation section but not in +# the interface are included in the documentation. +# If set to NO (the default) only methods in the interface are included. + +EXTRACT_LOCAL_METHODS = NO + +# If this flag is set to YES, the members of anonymous namespaces will be +# extracted and appear in the documentation as a namespace called +# 'anonymous_namespace{file}', where file will be replaced with the base +# name of the file that contains the anonymous namespace. By default +# anonymous namespaces are hidden. + +EXTRACT_ANON_NSPACES = NO + +# If the HIDE_UNDOC_MEMBERS tag is set to YES, Doxygen will hide all +# undocumented members of documented classes, files or namespaces. +# If set to NO (the default) these members will be included in the +# various overviews, but no documentation section is generated. +# This option has no effect if EXTRACT_ALL is enabled. + +HIDE_UNDOC_MEMBERS = YES + +# If the HIDE_UNDOC_CLASSES tag is set to YES, Doxygen will hide all +# undocumented classes that are normally visible in the class hierarchy. +# If set to NO (the default) these classes will be included in the various +# overviews. This option has no effect if EXTRACT_ALL is enabled. + +HIDE_UNDOC_CLASSES = YES + +# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, Doxygen will hide all +# friend (class|struct|union) declarations. +# If set to NO (the default) these declarations will be included in the +# documentation. + +HIDE_FRIEND_COMPOUNDS = NO + +# If the HIDE_IN_BODY_DOCS tag is set to YES, Doxygen will hide any +# documentation blocks found inside the body of a function. +# If set to NO (the default) these blocks will be appended to the +# function's detailed documentation block. + +HIDE_IN_BODY_DOCS = NO + +# The INTERNAL_DOCS tag determines if documentation +# that is typed after a \internal command is included. If the tag is set +# to NO (the default) then the documentation will be excluded. +# Set it to YES to include the internal documentation. + +INTERNAL_DOCS = NO + +# If the CASE_SENSE_NAMES tag is set to NO then Doxygen will only generate +# file names in lower-case letters. If set to YES upper-case letters are also +# allowed. This is useful if you have classes or files whose names only differ +# in case and if your file system supports case sensitive file names. Windows +# and Mac users are advised to set this option to NO. + +CASE_SENSE_NAMES = YES + +# If the HIDE_SCOPE_NAMES tag is set to NO (the default) then Doxygen +# will show members with their full class and namespace scopes in the +# documentation. If set to YES the scope will be hidden. 
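+# Illustration (hypothetical member, not from these sources): with NO a
+# member list entry reads kmp::team::barrier(); with YES it reads barrier().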
+ +HIDE_SCOPE_NAMES = NO + +# If the SHOW_INCLUDE_FILES tag is set to YES (the default) then Doxygen +# will put a list of the files that are included by a file in the documentation +# of that file. + +SHOW_INCLUDE_FILES = YES + +# If the FORCE_LOCAL_INCLUDES tag is set to YES then Doxygen +# will list include files with double quotes in the documentation +# rather than with sharp brackets. + +FORCE_LOCAL_INCLUDES = NO + +# If the INLINE_INFO tag is set to YES (the default) then a tag [inline] +# is inserted in the documentation for inline members. + +INLINE_INFO = YES + +# If the SORT_MEMBER_DOCS tag is set to YES (the default) then doxygen +# will sort the (detailed) documentation of file and class members +# alphabetically by member name. If set to NO the members will appear in +# declaration order. + +SORT_MEMBER_DOCS = YES + +# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the +# brief documentation of file, namespace and class members alphabetically +# by member name. If set to NO (the default) the members will appear in +# declaration order. + +SORT_BRIEF_DOCS = NO + +# If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen +# will sort the (brief and detailed) documentation of class members so that +# constructors and destructors are listed first. If set to NO (the default) +# the constructors will appear in the respective orders defined by +# SORT_MEMBER_DOCS and SORT_BRIEF_DOCS. +# This tag will be ignored for brief docs if SORT_BRIEF_DOCS is set to NO +# and ignored for detailed docs if SORT_MEMBER_DOCS is set to NO. + +SORT_MEMBERS_CTORS_1ST = NO + +# If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the +# hierarchy of group names into alphabetical order. If set to NO (the default) +# the group names will appear in their defined order. + +SORT_GROUP_NAMES = NO + +# If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be +# sorted by fully-qualified names, including namespaces. If set to +# NO (the default), the class list will be sorted only by class name, +# not including the namespace part. +# Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES. +# Note: This option applies only to the class list, not to the +# alphabetical list. + +SORT_BY_SCOPE_NAME = NO + +# If the STRICT_PROTO_MATCHING option is enabled and doxygen fails to +# do proper type resolution of all parameters of a function it will reject a +# match between the prototype and the implementation of a member function even +# if there is only one candidate or it is obvious which candidate to choose +# by doing a simple string match. By disabling STRICT_PROTO_MATCHING doxygen +# will still accept a match between prototype and implementation in such cases. + +STRICT_PROTO_MATCHING = NO + +# The GENERATE_TODOLIST tag can be used to enable (YES) or +# disable (NO) the todo list. This list is created by putting \todo +# commands in the documentation. + +GENERATE_TODOLIST = YES + +# The GENERATE_TESTLIST tag can be used to enable (YES) or +# disable (NO) the test list. This list is created by putting \test +# commands in the documentation. + +GENERATE_TESTLIST = YES + +# The GENERATE_BUGLIST tag can be used to enable (YES) or +# disable (NO) the bug list. This list is created by putting \bug +# commands in the documentation. + +GENERATE_BUGLIST = YES + +# The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or +# disable (NO) the deprecated list. This list is created by putting +# \deprecated commands in the documentation. 
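+# For example (hypothetical name), a doc comment such as
+#   /*! \deprecated Replaced by __kmp_example_new_api(). */
+# placed in the sources would add that entity to the deprecated list.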
+
+GENERATE_DEPRECATEDLIST= YES
+
+# The ENABLED_SECTIONS tag can be used to enable conditional
+# documentation sections, marked by \if sectionname ... \endif.
+
+ENABLED_SECTIONS =
+
+# The MAX_INITIALIZER_LINES tag determines the maximum number of lines
+# the initial value of a variable or macro consists of for it to appear in
+# the documentation. If the initializer consists of more lines than specified
+# here it will be hidden. Use a value of 0 to hide initializers completely.
+# The appearance of the initializer of individual variables and macros in the
+# documentation can be controlled using \showinitializer or \hideinitializer
+# command in the documentation regardless of this setting.
+
+MAX_INITIALIZER_LINES = 30
+
+# Set the SHOW_USED_FILES tag to NO to disable the list of files generated
+# at the bottom of the documentation of classes and structs. If set to YES the
+# list will mention the files that were used to generate the documentation.
+
+SHOW_USED_FILES = YES
+
+# Set the SHOW_FILES tag to NO to disable the generation of the Files page.
+# This will remove the Files entry from the Quick Index and from the
+# Folder Tree View (if specified). The default is YES.
+
+# We probably will want this, but we have no file documentation yet so it's simpler to remove
+# it for now.
+SHOW_FILES = NO
+
+# Set the SHOW_NAMESPACES tag to NO to disable the generation of the
+# Namespaces page.
+# This will remove the Namespaces entry from the Quick Index
+# and from the Folder Tree View (if specified). The default is YES.
+
+SHOW_NAMESPACES = YES
+
+# The FILE_VERSION_FILTER tag can be used to specify a program or script that
+# doxygen should invoke to get the current version for each file (typically from
+# the version control system). Doxygen will invoke the program by executing (via
+# popen()) the command <command> <input-file>, where <command> is the value of
+# the FILE_VERSION_FILTER tag, and <input-file> is the name of an input file
+# provided by doxygen. Whatever the program writes to standard output
+# is used as the file version. See the manual for examples.
+
+FILE_VERSION_FILTER =
+
+# The LAYOUT_FILE tag can be used to specify a layout file which will be parsed
+# by doxygen. The layout file controls the global structure of the generated
+# output files in an output format independent way. To create the layout file
+# that represents doxygen's defaults, run doxygen with the -l option.
+# You can optionally specify a file name after the option, if omitted
+# DoxygenLayout.xml will be used as the name of the layout file.
+
+LAYOUT_FILE =
+
+# The CITE_BIB_FILES tag can be used to specify one or more bib files
+# containing the references data. This must be a list of .bib files. The
+# .bib extension is automatically appended if omitted. Using this command
+# requires the bibtex tool to be installed. See also
+# http://en.wikipedia.org/wiki/BibTeX for more info. For LaTeX the style
+# of the bibliography can be controlled using LATEX_BIB_STYLE. To use this
+# feature you need bibtex and perl available in the search path.
+
+CITE_BIB_FILES =
+
+#---------------------------------------------------------------------------
+# configuration options related to warning and progress messages
+#---------------------------------------------------------------------------
+
+# The QUIET tag can be used to turn on/off the messages that are generated
+# by doxygen. Possible values are YES and NO. If left blank NO is used.
+
+QUIET = NO
+
+# The WARNINGS tag can be used to turn on/off the warning messages that are
+# generated by doxygen. Possible values are YES and NO. If left blank
+# NO is used.
+
+WARNINGS = YES
+
+# If WARN_IF_UNDOCUMENTED is set to YES, then doxygen will generate warnings
+# for undocumented members. If EXTRACT_ALL is set to YES then this flag will
+# automatically be disabled.
+
+WARN_IF_UNDOCUMENTED = YES
+
+# If WARN_IF_DOC_ERROR is set to YES, doxygen will generate warnings for
+# potential errors in the documentation, such as not documenting some
+# parameters in a documented function, or documenting parameters that
+# don't exist or using markup commands wrongly.
+
+WARN_IF_DOC_ERROR = YES
+
+# The WARN_NO_PARAMDOC option can be enabled to get warnings for
+# functions that are documented, but have no documentation for their parameters
+# or return value. If set to NO (the default) doxygen will only warn about
+# wrong or incomplete parameter documentation, but not about the absence of
+# documentation.
+
+WARN_NO_PARAMDOC = NO
+
+# The WARN_FORMAT tag determines the format of the warning messages that
+# doxygen can produce. The string should contain the $file, $line, and $text
+# tags, which will be replaced by the file and line number from which the
+# warning originated and the warning text. Optionally the format may contain
+# $version, which will be replaced by the version of the file (if it could
+# be obtained via FILE_VERSION_FILTER).
+
+WARN_FORMAT =
+
+# The WARN_LOGFILE tag can be used to specify a file to which warning
+# and error messages should be written. If left blank the output is written
+# to stderr.
+
+WARN_LOGFILE =
+
+#---------------------------------------------------------------------------
+# configuration options related to the input files
+#---------------------------------------------------------------------------
+
+# The INPUT tag can be used to specify the files and/or directories that contain
+# documented source files. You may enter file names like "myfile.cpp" or
+# directories like "/usr/src/myproject". Separate the files or directories
+# with spaces.
+
+INPUT = src doc/doxygen/libomp_interface.h
+# The ittnotify code also has doxygen documentation, but if we include it here
+# it takes over from us!
+# src/thirdparty/ittnotify
+
+# This tag can be used to specify the character encoding of the source files
+# that doxygen parses. Internally doxygen uses the UTF-8 encoding, which is
+# also the default input encoding. Doxygen uses libiconv (or the iconv built
+# into libc) for the transcoding. See http://www.gnu.org/software/libiconv for
+# the list of possible encodings.
+
+INPUT_ENCODING = UTF-8
+
+# If the value of the INPUT tag contains directories, you can use the
+# FILE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp
+# and *.h) to filter out the source-files in the directories. If left
+# blank the following patterns are tested:
+# *.c *.cc *.cxx *.cpp *.c++ *.d *.java *.ii *.ixx *.ipp *.i++ *.inl *.h *.hh
+# *.hxx *.hpp *.h++ *.idl *.odl *.cs *.php *.php3 *.inc *.m *.mm *.dox *.py
+# *.f90 *.f *.for *.vhd *.vhdl
+
+FILE_PATTERNS = *.c *.h *.cpp
+# We may also want to include the asm files with appropriate ifdef to ensure
+# doxygen doesn't see the content, just the documentation...
+
+# The RECURSIVE tag can be used to specify whether or not subdirectories
+# should be searched for input files as well. Possible values are YES and NO.
+# If left blank NO is used.
+
+# Only look in the one directory.
+RECURSIVE = NO
+
+# The EXCLUDE tag can be used to specify files and/or directories that should be
+# excluded from the INPUT source files. This way you can easily exclude a
+# subdirectory from a directory tree whose root is specified with the INPUT tag.
+# Note that relative paths are relative to the directory from which doxygen is
+# run.
+
+EXCLUDE = src/test-touch.c
+
+# The EXCLUDE_SYMLINKS tag can be used to select whether or not files or
+# directories that are symbolic links (a Unix file system feature) are excluded
+# from the input.
+
+EXCLUDE_SYMLINKS = NO
+
+# If the value of the INPUT tag contains directories, you can use the
+# EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude
+# certain files from those directories. Note that the wildcards are matched
+# against the file with absolute path, so to exclude all test directories
+# for example use the pattern */test/*
+
+EXCLUDE_PATTERNS =
+
+# The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names
+# (namespaces, classes, functions, etc.) that should be excluded from the
+# output. The symbol name can be a fully qualified name, a word, or if the
+# wildcard * is used, a substring. Examples: ANamespace, AClass,
+# AClass::ANamespace, ANamespace::*Test
+
+EXCLUDE_SYMBOLS =
+
+# The EXAMPLE_PATH tag can be used to specify one or more files or
+# directories that contain example code fragments that are included (see
+# the \include command).
+
+EXAMPLE_PATH =
+
+# If the value of the EXAMPLE_PATH tag contains directories, you can use the
+# EXAMPLE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp
+# and *.h) to filter out the source-files in the directories. If left
+# blank all files are included.
+
+EXAMPLE_PATTERNS =
+
+# If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be
+# searched for input files to be used with the \include or \dontinclude
+# commands irrespective of the value of the RECURSIVE tag.
+# Possible values are YES and NO. If left blank NO is used.
+
+EXAMPLE_RECURSIVE = NO
+
+# The IMAGE_PATH tag can be used to specify one or more files or
+# directories that contain images that are included in the documentation (see
+# the \image command).
+
+IMAGE_PATH =
+
+# The INPUT_FILTER tag can be used to specify a program that doxygen should
+# invoke to filter for each input file. Doxygen will invoke the filter program
+# by executing (via popen()) the command <filter> <input-file>, where <filter>
+# is the value of the INPUT_FILTER tag, and <input-file> is the name of an
+# input file. Doxygen will then use the output that the filter program writes
+# to standard output.
+# If FILTER_PATTERNS is specified, this tag will be
+# ignored.
+
+INPUT_FILTER =
+
+# The FILTER_PATTERNS tag can be used to specify filters on a per file pattern
+# basis.
+# Doxygen will compare the file name with each pattern and apply the
+# filter if there is a match.
+# The filters are a list of the form:
+# pattern=filter (like *.cpp=my_cpp_filter). See INPUT_FILTER for further
+# info on how filters are used. If FILTER_PATTERNS is empty or if
+# none of the patterns match the file name, INPUT_FILTER is applied.
+
+FILTER_PATTERNS =
+
+# If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using
+# INPUT_FILTER) will be used to filter the input files when producing source
+# files to browse (i.e. when SOURCE_BROWSER is set to YES).
+
+FILTER_SOURCE_FILES = NO
+
+# The FILTER_SOURCE_PATTERNS tag can be used to specify source filters per file
+# pattern.
A pattern will override the setting for FILTER_PATTERN (if any) +# and it is also possible to disable source filtering for a specific pattern +# using *.ext= (so without naming a filter). This option only has effect when +# FILTER_SOURCE_FILES is enabled. + +FILTER_SOURCE_PATTERNS = + +#--------------------------------------------------------------------------- +# configuration options related to source browsing +#--------------------------------------------------------------------------- + +# If the SOURCE_BROWSER tag is set to YES then a list of source files will +# be generated. Documented entities will be cross-referenced with these sources. +# Note: To get rid of all source code in the generated output, make sure also +# VERBATIM_HEADERS is set to NO. + +SOURCE_BROWSER = YES + +# Setting the INLINE_SOURCES tag to YES will include the body +# of functions and classes directly in the documentation. + +INLINE_SOURCES = NO + +# Setting the STRIP_CODE_COMMENTS tag to YES (the default) will instruct +# doxygen to hide any special comment blocks from generated source code +# fragments. Normal C, C++ and Fortran comments will always remain visible. + +STRIP_CODE_COMMENTS = YES + +# If the REFERENCED_BY_RELATION tag is set to YES +# then for each documented function all documented +# functions referencing it will be listed. + +REFERENCED_BY_RELATION = YES + +# If the REFERENCES_RELATION tag is set to YES +# then for each documented function all documented entities +# called/used by that function will be listed. + +REFERENCES_RELATION = NO + +# If the REFERENCES_LINK_SOURCE tag is set to YES (the default) +# and SOURCE_BROWSER tag is set to YES, then the hyperlinks from +# functions in REFERENCES_RELATION and REFERENCED_BY_RELATION lists will +# link to the source code. +# Otherwise they will link to the documentation. + +REFERENCES_LINK_SOURCE = YES + +# If the USE_HTAGS tag is set to YES then the references to source code +# will point to the HTML generated by the htags(1) tool instead of doxygen +# built-in source browser. The htags tool is part of GNU's global source +# tagging system (see http://www.gnu.org/software/global/global.html). You +# will need version 4.8.6 or higher. + +USE_HTAGS = NO + +# If the VERBATIM_HEADERS tag is set to YES (the default) then Doxygen +# will generate a verbatim copy of the header file for each class for +# which an include is specified. Set to NO to disable this. + +VERBATIM_HEADERS = YES + +#--------------------------------------------------------------------------- +# configuration options related to the alphabetical class index +#--------------------------------------------------------------------------- + +# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index +# of all compounds will be generated. Enable this if the project +# contains a lot of classes, structs, unions or interfaces. + +ALPHABETICAL_INDEX = YES + +# If the alphabetical index is enabled (see ALPHABETICAL_INDEX) then +# the COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns +# in which this list will be split (can be a number in the range [1..20]) + +COLS_IN_ALPHA_INDEX = 5 + +# In case all classes in a project start with a common prefix, all +# classes will be put under the same header in the alphabetical index. +# The IGNORE_PREFIX tag can be used to specify one or more prefixes that +# should be ignored while generating the index headers. 
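+# For example (hypothetical value, not set here): "IGNORE_PREFIX = kmp_"
+# would index a type such as kmp_taskdata_t under "T" rather than "K".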
+
+IGNORE_PREFIX =
+
+#---------------------------------------------------------------------------
+# configuration options related to the HTML output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_HTML tag is set to YES (the default) Doxygen will
+# generate HTML output.
+
+GENERATE_HTML = YES
+
+# The HTML_OUTPUT tag is used to specify where the HTML docs will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `html' will be used as the default path.
+
+HTML_OUTPUT =
+
+# The HTML_FILE_EXTENSION tag can be used to specify the file extension for
+# each generated HTML page (for example: .htm,.php,.asp). If it is left blank
+# doxygen will generate files with .html extension.
+
+HTML_FILE_EXTENSION = .html
+
+# The HTML_HEADER tag can be used to specify a personal HTML header for
+# each generated HTML page. If it is left blank doxygen will generate a
+# standard header. Note that when using a custom header you are responsible
+# for the proper inclusion of any scripts and style sheets that doxygen
+# needs, which is dependent on the configuration options used.
+# It is advised to generate a default header using "doxygen -w html
+# header.html footer.html stylesheet.css YourConfigFile" and then modify
+# that header. Note that the header is subject to change so you typically
+# have to redo this when upgrading to a newer version of doxygen or when
+# changing the value of configuration settings such as GENERATE_TREEVIEW!
+
+HTML_HEADER =
+
+# The HTML_FOOTER tag can be used to specify a personal HTML footer for
+# each generated HTML page. If it is left blank doxygen will generate a
+# standard footer.
+
+HTML_FOOTER =
+
+# The HTML_STYLESHEET tag can be used to specify a user-defined cascading
+# style sheet that is used by each HTML page. It can be used to
+# fine-tune the look of the HTML output. If left blank doxygen will
+# generate a default style sheet. Note that it is recommended to use
+# HTML_EXTRA_STYLESHEET instead of this one, as it is more robust and this
+# tag will in the future become obsolete.
+
+HTML_STYLESHEET =
+
+# The HTML_EXTRA_STYLESHEET tag can be used to specify an additional
+# user-defined cascading style sheet that is included after the standard
+# style sheets created by doxygen. Using this option one can overrule
+# certain style aspects. This is preferred over using HTML_STYLESHEET
+# since it does not replace the standard style sheet and is therefore more
+# robust against future updates. Doxygen will copy the style sheet file to
+# the output directory.
+
+HTML_EXTRA_STYLESHEET =
+
+# The HTML_EXTRA_FILES tag can be used to specify one or more extra images or
+# other source files which should be copied to the HTML output directory. Note
+# that these files will be copied to the base HTML output directory. Use the
+# $relpath$ marker in the HTML_HEADER and/or HTML_FOOTER files to load these
+# files. In the HTML_STYLESHEET file, use the file name only. Also note that
+# the files will be copied as-is; there are no commands or markers available.
+
+HTML_EXTRA_FILES =
+
+# The HTML_COLORSTYLE_HUE tag controls the color of the HTML output.
+# Doxygen will adjust the colors in the style sheet and background images
+# according to this color. Hue is specified as an angle on a colorwheel,
+# see http://en.wikipedia.org/wiki/Hue for more information.
+# For instance the value 0 represents red, 60 is yellow, 120 is green,
+# 180 is cyan, 240 is blue, 300 purple, and 360 is red again.
+# The allowed range is 0 to 359.
+
+HTML_COLORSTYLE_HUE = 220
+
+# The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of
+# the colors in the HTML output. For a value of 0 the output will use
+# grayscales only. A value of 255 will produce the most vivid colors.
+
+HTML_COLORSTYLE_SAT = 100
+
+# The HTML_COLORSTYLE_GAMMA tag controls the gamma correction applied to
+# the luminance component of the colors in the HTML output. Values below
+# 100 gradually make the output lighter, whereas values above 100 make
+# the output darker. The value divided by 100 is the actual gamma applied,
+# so 80 represents a gamma of 0.8, the value 220 represents a gamma of 2.2,
+# and 100 does not change the gamma.
+
+HTML_COLORSTYLE_GAMMA = 80
+
+# If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML
+# page will contain the date and time when the page was generated. Setting
+# this to NO can help when comparing the output of multiple runs.
+
+HTML_TIMESTAMP = NO
+
+# If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML
+# documentation will contain sections that can be hidden and shown after the
+# page has loaded.
+
+HTML_DYNAMIC_SECTIONS = NO
+
+# With HTML_INDEX_NUM_ENTRIES one can control the preferred number of
+# entries shown in the various tree structured indices initially; the user
+# can expand and collapse entries dynamically later on. Doxygen will expand
+# the tree to such a level that at most the specified number of entries are
+# visible (unless a fully collapsed tree already exceeds this amount).
+# So setting the number of entries to 1 will produce a fully collapsed tree by
+# default. 0 is a special value representing an infinite number of entries
+# and will result in a fully expanded tree by default.
+
+HTML_INDEX_NUM_ENTRIES = 100
+
+# If the GENERATE_DOCSET tag is set to YES, additional index files
+# will be generated that can be used as input for Apple's Xcode 3
+# integrated development environment, introduced with OSX 10.5 (Leopard).
+# To create a documentation set, doxygen will generate a Makefile in the
+# HTML output directory. Running make will produce the docset in that
+# directory and running "make install" will install the docset in
+# ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find
+# it at startup.
+# See http://developer.apple.com/tools/creatingdocsetswithdoxygen.html
+# for more information.
+
+GENERATE_DOCSET = NO
+
+# When the GENERATE_DOCSET tag is set to YES, this tag determines the name of
+# the feed. A documentation feed provides an umbrella under which multiple
+# documentation sets from a single provider (such as a company or product suite)
+# can be grouped.
+
+DOCSET_FEEDNAME = "Doxygen generated docs"
+
+# When the GENERATE_DOCSET tag is set to YES, this tag specifies a string that
+# should uniquely identify the documentation set bundle. This should be a
+# reverse domain-name style string, e.g. com.mycompany.MyDocSet. Doxygen
+# will append .docset to the name.
+
+DOCSET_BUNDLE_ID = org.doxygen.Project
+
+# The DOCSET_PUBLISHER_ID tag specifies a string that should uniquely
+# identify the documentation publisher. This should be a reverse domain-name
+# style string, e.g. com.mycompany.MyDocSet.documentation.
+
+DOCSET_PUBLISHER_ID = org.doxygen.Publisher
+
+# The DOCSET_PUBLISHER_NAME tag identifies the documentation publisher.
+
+DOCSET_PUBLISHER_NAME = Publisher
+
+# If the GENERATE_HTMLHELP tag is set to YES, additional index files
+# will be generated that can be used as input for tools like the
+# Microsoft HTML help workshop to generate a compiled HTML help file (.chm)
+# of the generated HTML documentation.
+
+GENERATE_HTMLHELP = NO
+
+# If the GENERATE_HTMLHELP tag is set to YES, the CHM_FILE tag can
+# be used to specify the file name of the resulting .chm file. You
+# can add a path in front of the file if the result should not be
+# written to the html output directory.
+
+CHM_FILE =
+
+# If the GENERATE_HTMLHELP tag is set to YES, the HHC_LOCATION tag can
+# be used to specify the location (absolute path including file name) of
+# the HTML help compiler (hhc.exe). If non-empty doxygen will try to run
+# the HTML help compiler on the generated index.hhp.
+
+HHC_LOCATION =
+
+# If the GENERATE_HTMLHELP tag is set to YES, the GENERATE_CHI flag
+# controls if a separate .chi index file is generated (YES) or that
+# it should be included in the main .chm file (NO).
+
+GENERATE_CHI = NO
+
+# If the GENERATE_HTMLHELP tag is set to YES, the CHM_INDEX_ENCODING
+# is used to encode HtmlHelp index (hhk), content (hhc) and project file
+# content.
+
+CHM_INDEX_ENCODING =
+
+# If the GENERATE_HTMLHELP tag is set to YES, the BINARY_TOC flag
+# controls whether a binary table of contents is generated (YES) or a
+# normal table of contents (NO) in the .chm file.
+
+BINARY_TOC = NO
+
+# The TOC_EXPAND flag can be set to YES to add extra items for group members
+# to the contents of the HTML help documentation and to the tree view.
+
+TOC_EXPAND = NO
+
+# If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and
+# QHP_VIRTUAL_FOLDER are set, an additional index file will be generated
+# that can be used as input for Qt's qhelpgenerator to generate a
+# Qt Compressed Help (.qch) of the generated HTML documentation.
+
+GENERATE_QHP = NO
+
+# If the QHG_LOCATION tag is specified, the QCH_FILE tag can
+# be used to specify the file name of the resulting .qch file.
+# The path specified is relative to the HTML output folder.
+
+QCH_FILE =
+
+# The QHP_NAMESPACE tag specifies the namespace to use when generating
+# Qt Help Project output. For more information please see
+# http://doc.trolltech.com/qthelpproject.html#namespace
+
+QHP_NAMESPACE = org.doxygen.Project
+
+# The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating
+# Qt Help Project output. For more information please see
+# http://doc.trolltech.com/qthelpproject.html#virtual-folders
+
+QHP_VIRTUAL_FOLDER = doc
+
+# If QHP_CUST_FILTER_NAME is set, it specifies the name of a custom filter to
+# add. For more information please see
+# http://doc.trolltech.com/qthelpproject.html#custom-filters
+
+QHP_CUST_FILTER_NAME =
+
+# The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the
+# custom filter to add. For more information please see
+# http://doc.trolltech.com/qthelpproject.html#custom-filters
+# (Qt Help Project / Custom Filters).
+
+QHP_CUST_FILTER_ATTRS =
+
+# The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this
+# project's filter section matches. For more information please see
+# http://doc.trolltech.com/qthelpproject.html#filter-attributes
+# (Qt Help Project / Filter Attributes).
+
+QHP_SECT_FILTER_ATTRS =
+
+# If the GENERATE_QHP tag is set to YES, the QHG_LOCATION tag can
+# be used to specify the location of Qt's qhelpgenerator.
+# If non-empty doxygen will try to run qhelpgenerator on the generated
+# .qhp file.
+
+QHG_LOCATION =
+
+# If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files
+# will be generated, which together with the HTML files, form an Eclipse help
+# plugin. To install this plugin and make it available under the help contents
+# menu in Eclipse, the contents of the directory containing the HTML and XML
+# files need to be copied into the plugins directory of eclipse. The name of
+# the directory within the plugins directory should be the same as
+# the ECLIPSE_DOC_ID value. After copying Eclipse needs to be restarted before
+# the help appears.
+
+GENERATE_ECLIPSEHELP = NO
+
+# A unique identifier for the eclipse help plugin. When installing the plugin
+# the directory name containing the HTML and XML files should also have
+# this name.
+
+ECLIPSE_DOC_ID = org.doxygen.Project
+
+# The DISABLE_INDEX tag can be used to turn on/off the condensed index (tabs)
+# at top of each HTML page. The value NO (the default) enables the index and
+# the value YES disables it. Since the tabs have the same information as the
+# navigation tree you can set this option to NO if you already set
+# GENERATE_TREEVIEW to YES.
+
+DISABLE_INDEX = NO
+
+# The GENERATE_TREEVIEW tag is used to specify whether a tree-like index
+# structure should be generated to display hierarchical information.
+# If the tag value is set to YES, a side panel will be generated
+# containing a tree-like index structure (just like the one that
+# is generated for HTML Help). For this to work a browser that supports
+# JavaScript, DHTML, CSS and frames is required (i.e. any modern browser).
+# Windows users are probably better off using the HTML help feature.
+# Since the tree basically has the same information as the tab index you
+# could consider setting DISABLE_INDEX to NO when enabling this option.
+
+GENERATE_TREEVIEW = NO
+
+# The ENUM_VALUES_PER_LINE tag can be used to set the number of enum values
+# (range [0,1..20]) that doxygen will group on one line in the generated HTML
+# documentation. Note that a value of 0 will completely suppress the enum
+# values from appearing in the overview section.
+
+ENUM_VALUES_PER_LINE = 4
+
+# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be
+# used to set the initial width (in pixels) of the frame in which the tree
+# is shown.
+
+TREEVIEW_WIDTH = 250
+
+# When the EXT_LINKS_IN_WINDOW option is set to YES doxygen will open
+# links to external symbols imported via tag files in a separate window.
+
+EXT_LINKS_IN_WINDOW = NO
+
+# Use this tag to change the font size of LaTeX formulas included
+# as images in the HTML documentation. The default is 10. Note that
+# when you change the font size after a successful doxygen run you need
+# to manually remove any form_*.png images from the HTML output directory
+# to force them to be regenerated.
+
+FORMULA_FONTSIZE = 10
+
+# Use the FORMULA_TRANSPARENT tag to determine whether or not the images
+# generated for formulas are transparent PNGs. Transparent PNGs are
+# not supported properly for IE 6.0, but are supported on all modern browsers.
+# Note that when changing this option you need to delete any form_*.png files
+# in the HTML output before the changes have effect.
+
+FORMULA_TRANSPARENT = YES
+
+# Enable the USE_MATHJAX option to render LaTeX formulas using MathJax
+# (see http://www.mathjax.org) which uses client side Javascript for the
+# rendering instead of using prerendered bitmaps.
Use this if you do not
+# have LaTeX installed or if you want the formulas to look prettier in the HTML
+# output. When enabled you may also need to install MathJax separately and
+# configure the path to it using the MATHJAX_RELPATH option.
+
+USE_MATHJAX = NO
+
+# When MathJax is enabled you need to specify the location relative to the
+# HTML output directory using the MATHJAX_RELPATH option. The destination
+# directory should contain the MathJax.js script. For instance, if the mathjax
+# directory is located at the same level as the HTML output directory, then
+# MATHJAX_RELPATH should be ../mathjax. The default value points to
+# the MathJax Content Delivery Network so you can quickly see the result without
+# installing MathJax.
+# However, it is strongly recommended to install a local
+# copy of MathJax from http://www.mathjax.org before deployment.
+
+MATHJAX_RELPATH = http://cdn.mathjax.org/mathjax/latest
+
+# The MATHJAX_EXTENSIONS tag can be used to specify one or more MathJax
+# extension names that should be enabled during MathJax rendering.
+
+MATHJAX_EXTENSIONS =
+
+# When the SEARCHENGINE tag is enabled doxygen will generate a search box
+# for the HTML output. The underlying search engine uses javascript
+# and DHTML and should work on any modern browser. Note that when using
+# HTML help (GENERATE_HTMLHELP), Qt help (GENERATE_QHP), or docsets
+# (GENERATE_DOCSET) there is already a search function so this one should
+# typically be disabled. For large projects the javascript based search engine
+# can be slow; in that case enabling SERVER_BASED_SEARCH may provide a better
+# solution.
+
+SEARCHENGINE = YES
+
+# When the SERVER_BASED_SEARCH tag is enabled the search engine will be
+# implemented using a PHP enabled web server instead of at the web client
+# using Javascript. Doxygen will generate the search PHP script and index
+# file to put on the web server. The advantage of the server
+# based approach is that it scales better to large projects and allows
+# full text search. The disadvantages are that it is more difficult to setup
+# and does not have live searching capabilities.
+
+SERVER_BASED_SEARCH = NO
+
+#---------------------------------------------------------------------------
+# configuration options related to the LaTeX output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_LATEX tag is set to YES (the default) Doxygen will
+# generate LaTeX output.
+
+GENERATE_LATEX = YES
+
+# The LATEX_OUTPUT tag is used to specify where the LaTeX docs will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `latex' will be used as the default path.
+
+LATEX_OUTPUT =
+
+# The LATEX_CMD_NAME tag can be used to specify the LaTeX command name to be
+# invoked. If left blank `latex' will be used as the default command name.
+# Note that when enabling USE_PDFLATEX this option is only used for
+# generating bitmaps for formulas in the HTML output, but not in the
+# Makefile that is written to the output directory.
+
+LATEX_CMD_NAME = latex
+
+# The MAKEINDEX_CMD_NAME tag can be used to specify the command name to
+# generate index for LaTeX. If left blank `makeindex' will be used as the
+# default command name.
+
+MAKEINDEX_CMD_NAME = makeindex
+
+# If the COMPACT_LATEX tag is set to YES Doxygen generates more compact
+# LaTeX documents. This may be useful for small projects and may help to
+# save some trees in general.
+
+COMPACT_LATEX = NO
+
+# The PAPER_TYPE tag can be used to set the paper type that is used
+# by the printer. Possible values are: a4, letter, legal and
+# executive. If left blank a4wide will be used.
+
+PAPER_TYPE = a4wide
+
+# The EXTRA_PACKAGES tag can be used to specify one or more names of LaTeX
+# packages that should be included in the LaTeX output.
+
+EXTRA_PACKAGES =
+
+# The LATEX_HEADER tag can be used to specify a personal LaTeX header for
+# the generated latex document. The header should contain everything until
+# the first chapter. If it is left blank doxygen will generate a
+# standard header. Notice: only use this tag if you know what you are doing!
+
+LATEX_HEADER = doc/doxygen/header.tex
+
+# The LATEX_FOOTER tag can be used to specify a personal LaTeX footer for
+# the generated latex document. The footer should contain everything after
+# the last chapter. If it is left blank doxygen will generate a
+# standard footer. Notice: only use this tag if you know what you are doing!
+
+LATEX_FOOTER =
+
+# If the PDF_HYPERLINKS tag is set to YES, the LaTeX that is generated
+# is prepared for conversion to pdf (using ps2pdf). The pdf file will
+# contain links (just like the HTML output) instead of page references.
+# This makes the output suitable for online browsing using a pdf viewer.
+
+PDF_HYPERLINKS = YES
+
+# If the USE_PDFLATEX tag is set to YES, pdflatex will be used instead of
+# plain latex in the generated Makefile. Set this option to YES to get a
+# higher quality PDF documentation.
+
+USE_PDFLATEX = YES
+
+# If the LATEX_BATCHMODE tag is set to YES, doxygen will add the \\batchmode
+# command to the generated LaTeX files. This will instruct LaTeX to keep
+# running if errors occur, instead of asking the user for help.
+# This option is also used when generating formulas in HTML.
+
+LATEX_BATCHMODE = NO
+
+# If LATEX_HIDE_INDICES is set to YES then doxygen will not
+# include the index chapters (such as File Index, Compound Index, etc.)
+# in the output.
+
+LATEX_HIDE_INDICES = NO
+
+# If LATEX_SOURCE_CODE is set to YES then doxygen will include
+# source code with syntax highlighting in the LaTeX output.
+# Note that which sources are shown also depends on other settings
+# such as SOURCE_BROWSER.
+
+LATEX_SOURCE_CODE = NO
+
+# The LATEX_BIB_STYLE tag can be used to specify the style to use for the
+# bibliography, e.g. plainnat, or ieeetr. The default style is "plain". See
+# http://en.wikipedia.org/wiki/BibTeX for more info.
+
+LATEX_BIB_STYLE = plain
+
+#---------------------------------------------------------------------------
+# configuration options related to the RTF output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_RTF tag is set to YES Doxygen will generate RTF output.
+# The RTF output is optimized for Word 97 and may not look very pretty with
+# other RTF readers or editors.
+
+GENERATE_RTF = NO
+
+# The RTF_OUTPUT tag is used to specify where the RTF docs will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `rtf' will be used as the default path.
+
+RTF_OUTPUT =
+
+# If the COMPACT_RTF tag is set to YES Doxygen generates more compact
+# RTF documents. This may be useful for small projects and may help to
+# save some trees in general.
+
+COMPACT_RTF = NO
+
+# If the RTF_HYPERLINKS tag is set to YES, the RTF that is generated
+# will contain hyperlink fields.
The RTF file will
+# contain links (just like the HTML output) instead of page references.
+# This makes the output suitable for online browsing using WORD or other
+# programs which support those fields.
+# Note: wordpad (write) and others do not support links.
+
+RTF_HYPERLINKS = NO
+
+# Load style sheet definitions from file. Syntax is similar to doxygen's
+# config file, i.e. a series of assignments. You only have to provide
+# replacements, missing definitions are set to their default value.
+
+RTF_STYLESHEET_FILE =
+
+# Set optional variables used in the generation of an rtf document.
+# Syntax is similar to doxygen's config file.
+
+RTF_EXTENSIONS_FILE =
+
+#---------------------------------------------------------------------------
+# configuration options related to the man page output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_MAN tag is set to YES Doxygen will
+# generate man pages.
+
+GENERATE_MAN = NO
+
+# The MAN_OUTPUT tag is used to specify where the man pages will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `man' will be used as the default path.
+
+MAN_OUTPUT =
+
+# The MAN_EXTENSION tag determines the extension that is added to
+# the generated man pages (default is the subroutine's section .3)
+
+MAN_EXTENSION =
+
+# If the MAN_LINKS tag is set to YES and Doxygen generates man output,
+# then it will generate one additional man file for each entity
+# documented in the real man page(s). These additional files
+# only source the real man page, but without them the man command
+# would be unable to find the correct page. The default is NO.
+
+MAN_LINKS = NO
+
+#---------------------------------------------------------------------------
+# configuration options related to the XML output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_XML tag is set to YES Doxygen will
+# generate an XML file that captures the structure of
+# the code including all documentation.
+
+GENERATE_XML = NO
+
+# The XML_OUTPUT tag is used to specify where the XML pages will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `xml' will be used as the default path.
+
+XML_OUTPUT = xml
+
+# The XML_SCHEMA tag can be used to specify an XML schema,
+# which can be used by a validating XML parser to check the
+# syntax of the XML files.
+
+XML_SCHEMA =
+
+# The XML_DTD tag can be used to specify an XML DTD,
+# which can be used by a validating XML parser to check the
+# syntax of the XML files.
+
+XML_DTD =
+
+# If the XML_PROGRAMLISTING tag is set to YES Doxygen will
+# dump the program listings (including syntax highlighting
+# and cross-referencing information) to the XML output. Note that
+# enabling this will significantly increase the size of the XML output.
+
+XML_PROGRAMLISTING = YES
+
+#---------------------------------------------------------------------------
+# configuration options for the AutoGen Definitions output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_AUTOGEN_DEF tag is set to YES Doxygen will
+# generate an AutoGen Definitions (see autogen.sf.net) file
+# that captures the structure of the code including all
+# documentation. Note that this feature is still experimental
+# and incomplete at the moment.
+ +GENERATE_AUTOGEN_DEF = NO + +#--------------------------------------------------------------------------- +# configuration options related to the Perl module output +#--------------------------------------------------------------------------- + +# If the GENERATE_PERLMOD tag is set to YES Doxygen will +# generate a Perl module file that captures the structure of +# the code including all documentation. Note that this +# feature is still experimental and incomplete at the +# moment. + +GENERATE_PERLMOD = NO + +# If the PERLMOD_LATEX tag is set to YES Doxygen will generate +# the necessary Makefile rules, Perl scripts and LaTeX code to be able +# to generate PDF and DVI output from the Perl module output. + +PERLMOD_LATEX = NO + +# If the PERLMOD_PRETTY tag is set to YES the Perl module output will be +# nicely formatted so it can be parsed by a human reader. +# This is useful +# if you want to understand what is going on. +# On the other hand, if this +# tag is set to NO the size of the Perl module output will be much smaller +# and Perl will parse it just the same. + +PERLMOD_PRETTY = YES + +# The names of the make variables in the generated doxyrules.make file +# are prefixed with the string contained in PERLMOD_MAKEVAR_PREFIX. +# This is useful so different doxyrules.make files included by the same +# Makefile don't overwrite each other's variables. + +PERLMOD_MAKEVAR_PREFIX = + +#--------------------------------------------------------------------------- +# Configuration options related to the preprocessor +#--------------------------------------------------------------------------- + +# If the ENABLE_PREPROCESSING tag is set to YES (the default) Doxygen will +# evaluate all C-preprocessor directives found in the sources and include +# files. + +ENABLE_PREPROCESSING = YES + +# If the MACRO_EXPANSION tag is set to YES Doxygen will expand all macro +# names in the source code. If set to NO (the default) only conditional +# compilation will be performed. Macro expansion can be done in a controlled +# way by setting EXPAND_ONLY_PREDEF to YES. + +MACRO_EXPANSION = YES + +# If the EXPAND_ONLY_PREDEF and MACRO_EXPANSION tags are both set to YES +# then the macro expansion is limited to the macros specified with the +# PREDEFINED and EXPAND_AS_DEFINED tags. + +EXPAND_ONLY_PREDEF = YES + +# If the SEARCH_INCLUDES tag is set to YES (the default) the includes files +# pointed to by INCLUDE_PATH will be searched when a #include is found. + +SEARCH_INCLUDES = YES + +# The INCLUDE_PATH tag can be used to specify one or more directories that +# contain include files that are not input files but should be processed by +# the preprocessor. + +INCLUDE_PATH = + +# You can use the INCLUDE_FILE_PATTERNS tag to specify one or more wildcard +# patterns (like *.h and *.hpp) to filter out the header-files in the +# directories. If left blank, the patterns specified with FILE_PATTERNS will +# be used. + +INCLUDE_FILE_PATTERNS = + +# The PREDEFINED tag can be used to specify one or more macro names that +# are defined before the preprocessor is started (similar to the -D option of +# gcc). The argument of the tag is a list of macros of the form: name +# or name=definition (no spaces). If the definition and the = are +# omitted =1 is assumed. To prevent a macro definition from being +# undefined via #undef or recursively expanded use the := operator +# instead of the = operator. 
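+# For example (illustrative values only, not part of this configuration):
+# "PREDEFINED = DEBUG=1 API(x):=x" defines DEBUG as 1 and uses := so that
+# the function-like macro API(x) cannot be undefined or recursively expanded.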
+
+PREDEFINED = OMP_30_ENABLED=1, OMP_40_ENABLED=1, KMP_STATS_ENABLED=1
+
+# If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then
+# this tag can be used to specify a list of macro names that should be expanded.
+# The macro definition that is found in the sources will be used.
+# Use the PREDEFINED tag if you want to use a different macro definition that
+# overrules the definition found in the source code.
+
+EXPAND_AS_DEFINED =
+
+# If the SKIP_FUNCTION_MACROS tag is set to YES (the default) then
+# doxygen's preprocessor will remove all references to function-like macros
+# that are alone on a line, have an all uppercase name, and do not end with a
+# semicolon, because these will confuse the parser if not removed.
+
+SKIP_FUNCTION_MACROS = YES
+
+#---------------------------------------------------------------------------
+# Configuration::additions related to external references
+#---------------------------------------------------------------------------
+
+# The TAGFILES option can be used to specify one or more tagfiles. For each
+# tag file the location of the external documentation should be added. The
+# format of a tag file without this location is as follows:
+#
+# TAGFILES = file1 file2 ...
+# Adding a location for the tag files is done as follows:
+#
+# TAGFILES = file1=loc1 "file2 = loc2" ...
+# where "loc1" and "loc2" can be relative or absolute paths
+# or URLs. Note that each tag file must have a unique name (where the name does
+# NOT include the path). If a tag file is not located in the directory in which
+# doxygen is run, you must also specify the path to the tagfile here.
+
+TAGFILES =
+
+# When a file name is specified after GENERATE_TAGFILE, doxygen will create
+# a tag file that is based on the input files it reads.
+
+GENERATE_TAGFILE =
+
+# If the ALLEXTERNALS tag is set to YES all external classes will be listed
+# in the class index. If set to NO only the inherited external classes
+# will be listed.
+
+ALLEXTERNALS = NO
+
+# If the EXTERNAL_GROUPS tag is set to YES all external groups will be listed
+# in the modules index. If set to NO, only the current project's groups will
+# be listed.
+
+EXTERNAL_GROUPS = YES
+
+# The PERL_PATH should be the absolute path and name of the perl script
+# interpreter (i.e. the result of `which perl').
+
+PERL_PATH =
+
+#---------------------------------------------------------------------------
+# Configuration options related to the dot tool
+#---------------------------------------------------------------------------
+
+# If the CLASS_DIAGRAMS tag is set to YES (the default) Doxygen will
+# generate an inheritance diagram (in HTML, RTF and LaTeX) for classes with base
+# or super classes. Setting the tag to NO turns the diagrams off. Note that
+# this option also works with HAVE_DOT disabled, but it is recommended to
+# install and use dot, since it yields more powerful graphs.
+
+CLASS_DIAGRAMS = YES
+
+# You can define message sequence charts within doxygen comments using the \msc
+# command. Doxygen will then run the mscgen tool (see
+# http://www.mcternan.me.uk/mscgen/) to produce the chart and insert it in the
+# documentation. The MSCGEN_PATH tag allows you to specify the directory where
+# the mscgen tool resides. If left empty the tool is assumed to be found in the
+# default search path.
+
+MSCGEN_PATH =
+
+# If set to YES, the inheritance and collaboration graphs will hide
+# inheritance and usage relations if the target is undocumented
+# or is not a class.
+ +HIDE_UNDOC_RELATIONS = YES + +# If you set the HAVE_DOT tag to YES then doxygen will assume the dot tool is +# available from the path. This tool is part of Graphviz, a graph visualization +# toolkit from AT&T and Lucent Bell Labs. The other options in this section +# have no effect if this option is set to NO (the default) + +HAVE_DOT = NO + +# The DOT_NUM_THREADS specifies the number of dot invocations doxygen is +# allowed to run in parallel. When set to 0 (the default) doxygen will +# base this on the number of processors available in the system. You can set it +# explicitly to a value larger than 0 to get control over the balance +# between CPU load and processing speed. + +DOT_NUM_THREADS = 0 + +# By default doxygen will use the Helvetica font for all dot files that +# doxygen generates. When you want a differently looking font you can specify +# the font name using DOT_FONTNAME. You need to make sure dot is able to find +# the font, which can be done by putting it in a standard location or by setting +# the DOTFONTPATH environment variable or by setting DOT_FONTPATH to the +# directory containing the font. + +DOT_FONTNAME = Helvetica + +# The DOT_FONTSIZE tag can be used to set the size of the font of dot graphs. +# The default size is 10pt. + +DOT_FONTSIZE = 10 + +# By default doxygen will tell dot to use the Helvetica font. +# If you specify a different font using DOT_FONTNAME you can use DOT_FONTPATH to +# set the path where dot can find it. + +DOT_FONTPATH = + +# If the CLASS_GRAPH and HAVE_DOT tags are set to YES then doxygen +# will generate a graph for each documented class showing the direct and +# indirect inheritance relations. Setting this tag to YES will force the +# CLASS_DIAGRAMS tag to NO. + +CLASS_GRAPH = YES + +# If the COLLABORATION_GRAPH and HAVE_DOT tags are set to YES then doxygen +# will generate a graph for each documented class showing the direct and +# indirect implementation dependencies (inheritance, containment, and +# class references variables) of the class with other documented classes. + +COLLABORATION_GRAPH = NO + +# If the GROUP_GRAPHS and HAVE_DOT tags are set to YES then doxygen +# will generate a graph for groups, showing the direct groups dependencies + +GROUP_GRAPHS = YES + +# If the UML_LOOK tag is set to YES doxygen will generate inheritance and +# collaboration diagrams in a style similar to the OMG's Unified Modeling +# Language. + +UML_LOOK = NO + +# If the UML_LOOK tag is enabled, the fields and methods are shown inside +# the class node. If there are many fields or methods and many nodes the +# graph may become too big to be useful. The UML_LIMIT_NUM_FIELDS +# threshold limits the number of items for each type to make the size more +# manageable. Set this to 0 for no limit. Note that the threshold may be +# exceeded by 50% before the limit is enforced. + +UML_LIMIT_NUM_FIELDS = 10 + +# If set to YES, the inheritance and collaboration graphs will show the +# relations between templates and their instances. + +TEMPLATE_RELATIONS = YES + +# If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDE_GRAPH, and HAVE_DOT +# tags are set to YES then doxygen will generate a graph for each documented +# file showing the direct and indirect include dependencies of the file with +# other documented files. 
+
+INCLUDE_GRAPH = NO
+
+# If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDED_BY_GRAPH, and
+# HAVE_DOT tags are set to YES then doxygen will generate a graph for each
+# documented header file showing the documented files that directly or
+# indirectly include this file.
+
+INCLUDED_BY_GRAPH = NO
+
+# If the CALL_GRAPH and HAVE_DOT options are set to YES then
+# doxygen will generate a call dependency graph for every global function
+# or class method. Note that enabling this option will significantly increase
+# the time of a run. So in most cases it will be better to enable call graphs
+# for selected functions only using the \callgraph command.
+
+CALL_GRAPH = NO
+
+# If the CALLER_GRAPH and HAVE_DOT tags are set to YES then
+# doxygen will generate a caller dependency graph for every global function
+# or class method. Note that enabling this option will significantly increase
+# the time of a run. So in most cases it will be better to enable caller
+# graphs for selected functions only using the \callergraph command.
+
+CALLER_GRAPH = NO
+
+# If the GRAPHICAL_HIERARCHY and HAVE_DOT tags are set to YES then doxygen
+# will generate a graphical hierarchy of all classes instead of a textual one.
+
+GRAPHICAL_HIERARCHY = YES
+
+# If the DIRECTORY_GRAPH and HAVE_DOT tags are set to YES
+# then doxygen will show the dependencies a directory has on other directories
+# in a graphical way. The dependency relations are determined by the #include
+# relations between the files in the directories.
+
+DIRECTORY_GRAPH = YES
+
+# The DOT_IMAGE_FORMAT tag can be used to set the image format of the images
+# generated by dot. Possible values are svg, png, jpg, or gif.
+# If left blank png will be used. If you choose svg you need to set
+# HTML_FILE_EXTENSION to xhtml in order to make the SVG files
+# visible in IE 9+ (other browsers do not have this requirement).
+
+DOT_IMAGE_FORMAT = png
+
+# If DOT_IMAGE_FORMAT is set to svg, then this option can be set to YES to
+# enable generation of interactive SVG images that allow zooming and panning.
+# Note that this requires a modern browser other than Internet Explorer.
+# Tested and working are Firefox, Chrome, Safari, and Opera. For IE 9+ you
+# need to set HTML_FILE_EXTENSION to xhtml in order to make the SVG files
+# visible. Older versions of IE do not have SVG support.
+
+INTERACTIVE_SVG = NO
+
+# The tag DOT_PATH can be used to specify the path where the dot tool can be
+# found. If left blank, it is assumed the dot tool can be found in the path.
+
+DOT_PATH =
+
+# The DOTFILE_DIRS tag can be used to specify one or more directories that
+# contain dot files that are included in the documentation (see the
+# \dotfile command).
+
+DOTFILE_DIRS =
+
+# The MSCFILE_DIRS tag can be used to specify one or more directories that
+# contain msc files that are included in the documentation (see the
+# \mscfile command).
+
+MSCFILE_DIRS =
+
+# The DOT_GRAPH_MAX_NODES tag can be used to set the maximum number of
+# nodes that will be shown in the graph. If the number of nodes in a graph
+# becomes larger than this value, doxygen will truncate the graph, which is
+# visualized by representing a node as a red box. Note that if the
+# number of direct children of the root node in a graph is already larger than
+# DOT_GRAPH_MAX_NODES then the graph will not be shown at all. Also note
+# that the size of a graph can be further restricted by MAX_DOT_GRAPH_DEPTH.
+ +DOT_GRAPH_MAX_NODES = 50 + +# The MAX_DOT_GRAPH_DEPTH tag can be used to set the maximum depth of the +# graphs generated by dot. A depth value of 3 means that only nodes reachable +# from the root by following a path via at most 3 edges will be shown. Nodes +# that lay further from the root node will be omitted. Note that setting this +# option to 1 or 2 may greatly reduce the computation time needed for large +# code bases. Also note that the size of a graph can be further restricted by +# DOT_GRAPH_MAX_NODES. Using a depth of 0 means no depth restriction. + +MAX_DOT_GRAPH_DEPTH = 0 + +# Set the DOT_TRANSPARENT tag to YES to generate images with a transparent +# background. This is disabled by default, because dot on Windows does not +# seem to support this out of the box. Warning: Depending on the platform used, +# enabling this option may lead to badly anti-aliased labels on the edges of +# a graph (i.e. they become hard to read). + +DOT_TRANSPARENT = NO + +# Set the DOT_MULTI_TARGETS tag to YES allow dot to generate multiple output +# files in one run (i.e. multiple -o and -T options on the command line). This +# makes dot run faster, but since only newer versions of dot (>1.8.10) +# support this, this feature is disabled by default. + +DOT_MULTI_TARGETS = NO + +# If the GENERATE_LEGEND tag is set to YES (the default) Doxygen will +# generate a legend page explaining the meaning of the various boxes and +# arrows in the dot generated graphs. + +GENERATE_LEGEND = YES + +# If the DOT_CLEANUP tag is set to YES (the default) Doxygen will +# remove the intermediate dot files that are used to generate +# the various graphs. + +DOT_CLEANUP = YES diff --git a/openmp/runtime/src/kmp_tasking.cpp b/openmp/runtime/src/kmp_tasking.cpp index f08ef5a6f64376651bfeb94beb995336307be5a6..8af388d9517c821283ce0b1bad7a8347ab3b2091 100644 --- a/openmp/runtime/src/kmp_tasking.cpp +++ b/openmp/runtime/src/kmp_tasking.cpp @@ -5787,7 +5787,8 @@ static kmp_tdg_info_t *__kmp_find_tdg(kmp_int32 tdg_id) { // __kmp_print_tdg_dot: prints the TDG to a dot file // tdg: ID of the TDG -void __kmp_print_tdg_dot(kmp_tdg_info_t *tdg) { +// gtid: Global Thread ID +void __kmp_print_tdg_dot(kmp_tdg_info_t *tdg, kmp_int32 gtid) { kmp_int32 tdg_id = tdg->tdg_id; KA_TRACE(10, ("__kmp_print_tdg_dot(enter): T#%d tdg_id=%d \n", gtid, tdg_id)); @@ -5989,7 +5990,7 @@ void __kmp_end_record(kmp_int32 gtid, kmp_tdg_info_t *tdg) { KMP_ATOMIC_ST_RLX(&__kmp_tdg_task_id, 0); if (__kmp_tdg_dot) - __kmp_print_tdg_dot(tdg); + __kmp_print_tdg_dot(tdg, gtid); } // __kmpc_end_record_task: wrapper around __kmp_end_record to mark diff --git a/polly/lib/CodeGen/BlockGenerators.cpp b/polly/lib/CodeGen/BlockGenerators.cpp index c58763603cfa22341416dc884ad922510b77bf61..b76d8f4c18a551089e8f98e7c759a30b8c10acc9 100644 --- a/polly/lib/CodeGen/BlockGenerators.cpp +++ b/polly/lib/CodeGen/BlockGenerators.cpp @@ -786,12 +786,6 @@ void BlockGenerator::generateScalarStores( Builder.GetInsertBlock())) && "Domination violation"); - // The new Val might have a different type than the old Val due to - // ScalarEvolution looking through bitcasts. 
- Address = Builder.CreateBitOrPointerCast( - Address, Val->getType()->getPointerTo( - Address->getType()->getPointerAddressSpace())); - Builder.CreateStore(Val, Address); }); } diff --git a/polly/lib/CodeGen/IslNodeBuilder.cpp b/polly/lib/CodeGen/IslNodeBuilder.cpp index 3f07f02038a1ecc730a124c6d5b86e072f669b5b..d76f6251ea4ced4826c1d95682992aa3fac9328a 100644 --- a/polly/lib/CodeGen/IslNodeBuilder.cpp +++ b/polly/lib/CodeGen/IslNodeBuilder.cpp @@ -1050,8 +1050,6 @@ Value *IslNodeBuilder::preloadUnconditionally(__isl_take isl_set *AccessRange, auto *Ptr = AddressValue; auto Name = Ptr->getName(); - auto AS = Ptr->getType()->getPointerAddressSpace(); - Ptr = Builder.CreatePointerCast(Ptr, Ty->getPointerTo(AS), Name + ".cast"); PreloadVal = Builder.CreateLoad(Ty, Ptr, Name + ".load"); if (LoadInst *PreloadInst = dyn_cast(PreloadVal)) PreloadInst->setAlignment(cast(AccInst)->getAlign()); diff --git a/polly/lib/CodeGen/LoopGeneratorsGOMP.cpp b/polly/lib/CodeGen/LoopGeneratorsGOMP.cpp index cd440b28202e6d2b2c347e58b5cda093445c4a10..b98416a92097f245cb7e8c8c1cf2fdab5afa232f 100644 --- a/polly/lib/CodeGen/LoopGeneratorsGOMP.cpp +++ b/polly/lib/CodeGen/LoopGeneratorsGOMP.cpp @@ -183,7 +183,7 @@ Value *ParallelLoopGeneratorGOMP::createCallGetWorkItem(Value *LBPtr, // If F is not available, declare it. if (!F) { GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; - Type *Params[] = {LongType->getPointerTo(), LongType->getPointerTo()}; + Type *Params[] = {Builder.getPtrTy(0), Builder.getPtrTy(0)}; FunctionType *Ty = FunctionType::get(Builder.getInt8Ty(), Params, false); F = Function::Create(Ty, Linkage, Name, M); } diff --git a/polly/lib/CodeGen/LoopGeneratorsKMP.cpp b/polly/lib/CodeGen/LoopGeneratorsKMP.cpp index 4ec5afe6aa63174d9a5430166417825ca54d9edc..0cfe18b0c12175896b845d1c2bf4ebbc86bd73c5 100644 --- a/polly/lib/CodeGen/LoopGeneratorsKMP.cpp +++ b/polly/lib/CodeGen/LoopGeneratorsKMP.cpp @@ -28,27 +28,23 @@ void ParallelLoopGeneratorKMP::createCallSpawnThreads(Value *SubFn, if (!KMPCMicroTy) { // void (*kmpc_micro)(kmp_int32 *global_tid, kmp_int32 *bound_tid, ...) - Type *MicroParams[] = {Builder.getInt32Ty()->getPointerTo(), - Builder.getInt32Ty()->getPointerTo()}; + Type *MicroParams[] = {Builder.getPtrTy(0), Builder.getPtrTy(0)}; KMPCMicroTy = FunctionType::get(Builder.getVoidTy(), MicroParams, true); } // If F is not available, declare it. 
if (!F) { - StructType *IdentTy = - StructType::getTypeByName(M->getContext(), "struct.ident_t"); - GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; - Type *Params[] = {IdentTy->getPointerTo(), Builder.getInt32Ty(), - KMPCMicroTy->getPointerTo()}; + Type *Params[] = {Builder.getPtrTy(0), Builder.getInt32Ty(), + Builder.getPtrTy(0)}; FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Params, true); F = Function::Create(Ty, Linkage, Name, M); } - Value *Task = Builder.CreatePointerBitCastOrAddrSpaceCast( - SubFn, KMPCMicroTy->getPointerTo()); + Value *Task = + Builder.CreatePointerBitCastOrAddrSpaceCast(SubFn, Builder.getPtrTy(0)); Value *Args[] = {SourceLocationInfo, Builder.getInt32(4) /* Number of arguments (w/o Task) */, @@ -77,12 +73,9 @@ void ParallelLoopGeneratorKMP::deployParallelExecution(Function *SubFn, } Function *ParallelLoopGeneratorKMP::prepareSubFnDefinition(Function *F) const { - std::vector Arguments = {Builder.getInt32Ty()->getPointerTo(), - Builder.getInt32Ty()->getPointerTo(), - LongType, - LongType, - LongType, - Builder.getPtrTy()}; + std::vector Arguments = { + Builder.getPtrTy(0), Builder.getPtrTy(0), LongType, LongType, LongType, + Builder.getPtrTy()}; FunctionType *FT = FunctionType::get(Builder.getVoidTy(), Arguments, false); Function *SubFn = Function::Create(FT, Function::InternalLinkage, @@ -320,11 +313,8 @@ Value *ParallelLoopGeneratorKMP::createCallGlobalThreadNum() { // If F is not available, declare it. if (!F) { - StructType *IdentTy = - StructType::getTypeByName(M->getContext(), "struct.ident_t"); - GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; - Type *Params[] = {IdentTy->getPointerTo()}; + Type *Params[] = {Builder.getPtrTy(0)}; FunctionType *Ty = FunctionType::get(Builder.getInt32Ty(), Params, false); F = Function::Create(Ty, Linkage, Name, M); @@ -342,11 +332,8 @@ void ParallelLoopGeneratorKMP::createCallPushNumThreads(Value *GlobalThreadID, // If F is not available, declare it. if (!F) { - StructType *IdentTy = - StructType::getTypeByName(M->getContext(), "struct.ident_t"); - GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; - Type *Params[] = {IdentTy->getPointerTo(), Builder.getInt32Ty(), + Type *Params[] = {Builder.getPtrTy(0), Builder.getInt32Ty(), Builder.getInt32Ty()}; FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Params, false); @@ -367,20 +354,18 @@ void ParallelLoopGeneratorKMP::createCallStaticInit(Value *GlobalThreadID, const std::string Name = is64BitArch() ? "__kmpc_for_static_init_8" : "__kmpc_for_static_init_4"; Function *F = M->getFunction(Name); - StructType *IdentTy = - StructType::getTypeByName(M->getContext(), "struct.ident_t"); // If F is not available, declare it. 
if (!F) { GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; - Type *Params[] = {IdentTy->getPointerTo(), + Type *Params[] = {Builder.getPtrTy(0), Builder.getInt32Ty(), Builder.getInt32Ty(), - Builder.getInt32Ty()->getPointerTo(), - LongType->getPointerTo(), - LongType->getPointerTo(), - LongType->getPointerTo(), + Builder.getPtrTy(0), + Builder.getPtrTy(0), + Builder.getPtrTy(0), + Builder.getPtrTy(0), LongType, LongType}; @@ -408,13 +393,11 @@ void ParallelLoopGeneratorKMP::createCallStaticInit(Value *GlobalThreadID, void ParallelLoopGeneratorKMP::createCallStaticFini(Value *GlobalThreadID) { const std::string Name = "__kmpc_for_static_fini"; Function *F = M->getFunction(Name); - StructType *IdentTy = - StructType::getTypeByName(M->getContext(), "struct.ident_t"); // If F is not available, declare it. if (!F) { GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; - Type *Params[] = {IdentTy->getPointerTo(), Builder.getInt32Ty()}; + Type *Params[] = {Builder.getPtrTy(0), Builder.getInt32Ty()}; FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Params, false); F = Function::Create(Ty, Linkage, Name, M); } @@ -432,14 +415,12 @@ void ParallelLoopGeneratorKMP::createCallDispatchInit(Value *GlobalThreadID, const std::string Name = is64BitArch() ? "__kmpc_dispatch_init_8" : "__kmpc_dispatch_init_4"; Function *F = M->getFunction(Name); - StructType *IdentTy = - StructType::getTypeByName(M->getContext(), "struct.ident_t"); // If F is not available, declare it. if (!F) { GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; - Type *Params[] = {IdentTy->getPointerTo(), + Type *Params[] = {Builder.getPtrTy(0), Builder.getInt32Ty(), Builder.getInt32Ty(), LongType, @@ -474,19 +455,14 @@ Value *ParallelLoopGeneratorKMP::createCallDispatchNext(Value *GlobalThreadID, const std::string Name = is64BitArch() ? "__kmpc_dispatch_next_8" : "__kmpc_dispatch_next_4"; Function *F = M->getFunction(Name); - StructType *IdentTy = - StructType::getTypeByName(M->getContext(), "struct.ident_t"); // If F is not available, declare it. if (!F) { GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; - Type *Params[] = {IdentTy->getPointerTo(), - Builder.getInt32Ty(), - Builder.getInt32Ty()->getPointerTo(), - LongType->getPointerTo(), - LongType->getPointerTo(), - LongType->getPointerTo()}; + Type *Params[] = {Builder.getPtrTy(0), Builder.getInt32Ty(), + Builder.getPtrTy(0), Builder.getPtrTy(0), + Builder.getPtrTy(0), Builder.getPtrTy(0)}; FunctionType *Ty = FunctionType::get(Builder.getInt32Ty(), Params, false); F = Function::Create(Ty, Linkage, Name, M); diff --git a/pstl/CREDITS.txt b/pstl/CREDITS.txt index 4945fd5ad308be1cbf3cfa4c77d6108fc62dca71..174722510fdea433d8d5a2c1efc66a00675d284b 100644 --- a/pstl/CREDITS.txt +++ b/pstl/CREDITS.txt @@ -1,21 +1,21 @@ -This file is a partial list of people who have contributed to the LLVM/pstl -(Parallel STL) project. If you have contributed a patch or made some other -contribution to LLVM/pstl, please submit a patch to this file to add yourself, -and it will be done! - -The list is sorted by surname and formatted to allow easy grepping and -beautification by scripts. The fields are: name (N), email (E), web-address -(W), PGP key ID and fingerprint (P), description (D), and snail-mail address -(S). - -N: Intel Corporation -W: http://www.intel.com -D: Created the initial implementation. - -N: Thomas Rodgers -E: trodgers@redhat.com -D: Identifier name transformation for inclusion in a Standard C++ library. 
- -N: Christopher Nelson -E: nadiasvertex@gmail.com -D: Add support for an OpenMP backend. +This file is a partial list of people who have contributed to the LLVM/pstl +(Parallel STL) project. If you have contributed a patch or made some other +contribution to LLVM/pstl, please submit a patch to this file to add yourself, +and it will be done! + +The list is sorted by surname and formatted to allow easy grepping and +beautification by scripts. The fields are: name (N), email (E), web-address +(W), PGP key ID and fingerprint (P), description (D), and snail-mail address +(S). + +N: Intel Corporation +W: http://www.intel.com +D: Created the initial implementation. + +N: Thomas Rodgers +E: trodgers@redhat.com +D: Identifier name transformation for inclusion in a Standard C++ library. + +N: Christopher Nelson +E: nadiasvertex@gmail.com +D: Add support for an OpenMP backend. diff --git a/utils/bazel/llvm-project-overlay/lldb/source/Plugins/BUILD.bazel b/utils/bazel/llvm-project-overlay/lldb/source/Plugins/BUILD.bazel index 38493411addebf145f369b8f6fd591051d0320eb..7057f5d5c5c13a08d1ee7fb79832a8b65dcce4de 100644 --- a/utils/bazel/llvm-project-overlay/lldb/source/Plugins/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/lldb/source/Plugins/BUILD.bazel @@ -1221,12 +1221,31 @@ cc_library( ], ) +gentbl_cc_library( + name = "DynamicLoaderMacOSXDYLDProperties", + strip_include_prefix = "DynamicLoader/MacOSX-DYLD", + tbl_outs = [ + ( + ["-gen-lldb-property-defs"], + "DynamicLoader/MacOSX-DYLD/DynamicLoaderDarwinProperties.inc", + ), + ( + ["-gen-lldb-property-enum-defs"], + "DynamicLoader/MacOSX-DYLD/DynamicLoaderDarwinPropertiesEnum.inc", + ), + ], + tblgen = "//lldb:lldb-tblgen", + td_file = "DynamicLoader/MacOSX-DYLD/DynamicLoaderDarwinProperties.td", + deps = ["//lldb:CoreTdFiles"], +) + cc_library( name = "PluginDynamicLoaderMacOSXDYLD", srcs = glob(["DynamicLoader/MacOSX-DYLD/*.cpp"]), hdrs = glob(["DynamicLoader/MacOSX-DYLD/*.h"]), include_prefix = "Plugins", deps = [ + ":DynamicLoaderMacOSXDYLDProperties", ":PluginObjCRuntime", ":PluginTypeSystemClang", ":PluginTypeSystemClangHeaders", @@ -1239,6 +1258,7 @@ cc_library( "//lldb:Target", "//lldb:TargetHeaders", "//lldb:Utility", + "//llvm:Support", "//llvm:TargetParser", ], )
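Note on the polly hunks above: they migrate runtime-function declarations from
typed pointers (Type::getPointerTo) to opaque pointers (IRBuilder::getPtrTy).
A minimal sketch of the resulting declare-on-first-use pattern follows; the
function name getOrDeclareRuntimeFn and the i8 return type are illustrative,
not from this patch.

#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Module.h"

using namespace llvm;

// Look up a runtime entry point, declaring it on first use. With opaque
// pointers there is a single pointer type per address space, so the pointee
// type no longer appears anywhere in the signature.
static Function *getOrDeclareRuntimeFn(Module &M, IRBuilder<> &Builder,
                                       StringRef Name) {
  if (Function *F = M.getFunction(Name))
    return F;
  Type *Params[] = {Builder.getPtrTy(0), Builder.getPtrTy(0)};
  FunctionType *FT =
      FunctionType::get(Builder.getInt8Ty(), Params, /*isVarArg=*/false);
  return Function::Create(FT, Function::ExternalLinkage, Name, &M);
}

This mirrors how createCallGetWorkItem and the __kmpc_* declarations in the
hunks above now build their parameter lists, and is why the struct.ident_t
lookups could be dropped entirely.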